blob: 340f8ccc38752ad84e1868abd3c3111ec61fa701 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson1c5d21d2009-01-31 22:33:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
/* Upper bound on the number of Unicode objects parked on the free list. */

#define PyUnicode_MAXFREELIST       1024

/* Keep-alive threshold for the buffers of free-listed Unicode objects.

   When an object is put on the free list and its length is below this
   limit, its character buffer stays allocated so that a later small
   Unicode object can reuse it without going through malloc() again.

   In the worst case this leaves PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage around.

   A limit of 0 effectively switches the optimization off.

   Note: this is an experimental feature!  If you get core dumps when
   using Unicode objects, turn it off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; little endian unless WORDS_BIGENDIAN is defined */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
Serhiy Storchakac59c85c2013-01-26 12:13:40 +020085NOTE: In the interpreter's initialization phase, some globals are currently
86 initialized dynamically as needed. In the process Unicode objects may
87 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000088
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Anthony Baxterac6bd462006-04-13 02:06:09 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Guido van Rossumd57fd912000-03-10 22:53:23 +000096/* Free list for Unicode objects */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +020097static PyUnicodeObject *free_list = NULL;
98static int numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000099
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000100/* The empty Unicode object is shared to improve performance. */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200101static PyUnicodeObject *unicode_empty = NULL;
102
103#define _Py_RETURN_UNICODE_EMPTY() \
104 do { \
105 if (unicode_empty != NULL) \
106 Py_INCREF(unicode_empty); \
107 else { \
108 unicode_empty = _PyUnicode_New(0); \
109 if (unicode_empty != NULL) \
110 Py_INCREF(unicode_empty); \
111 } \
112 return (PyObject *)unicode_empty; \
113 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000114
115/* Single character Unicode strings in the Latin-1 range are being
116 shared as well. */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200117static PyUnicodeObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000118
Fred Drakee4315f52000-05-09 19:53:39 +0000119/* Default encoding to use and assume when NULL is passed as encoding
120 parameter; it is initialized by _PyUnicode_Init().
121
122 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000123 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000124
125*/
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200126static char unicode_default_encoding[100 + 1] = "ascii";
Fred Drakee4315f52000-05-09 19:53:39 +0000127
/* Fast detection of the most frequent whitespace characters: entry c of
   this 128-entry table is 1 when the ASCII character c is whitespace. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F
     *   0x0009 CHARACTER TABULATION
     *   0x000A LINE FEED
     *   0x000B LINE TABULATION
     *   0x000C FORM FEED
     *   0x000D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F
     *   0x001C FILE SEPARATOR
     *   0x001D GROUP SEPARATOR
     *   0x001E RECORD SEPARATOR
     *   0x001F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F
     *   0x0020 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x7F: no whitespace */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
158
/* Lookup table for line break characters in the ASCII range: entry c of
   this 128-entry table is 1 when the ASCII character c ends a line.
   The table is only ever read, so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
186
187
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000188Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000189PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000190{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000191#ifdef Py_UNICODE_WIDE
Benjamin Peterson857ce152009-01-31 16:29:18 +0000192 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000193#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000194 /* This is actually an illegal character, so it should
195 not be passed to unichr. */
196 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000197#endif
198}
199
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000200/* --- Bloom Filters ----------------------------------------------------- */
201
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask and take the low-order
   bits of each Unicode character (ch & (BLOOM_WIDTH - 1)) as the bit index. */
205
206/* the linebreak mask is set up by Unicode_Init below */
207
Antoine Pitrou10042922010-01-13 14:01:26 +0000208#if LONG_BIT >= 128
209#define BLOOM_WIDTH 128
210#elif LONG_BIT >= 64
211#define BLOOM_WIDTH 64
212#elif LONG_BIT >= 32
213#define BLOOM_WIDTH 32
214#else
215#error "LONG_BIT is smaller than 32"
216#endif
217
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000218#define BLOOM_MASK unsigned long
219
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200220static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000221
Antoine Pitrou10042922010-01-13 14:01:26 +0000222#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
223#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000224
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000225#define BLOOM_LINEBREAK(ch) \
226 ((ch) < 128U ? ascii_linebreak[(ch)] : \
227 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000228
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000229Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000230{
231 /* calculate simple bloom-style bitmask for a given unicode string */
232
Antoine Pitrou10042922010-01-13 14:01:26 +0000233 BLOOM_MASK mask;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000234 Py_ssize_t i;
235
236 mask = 0;
237 for (i = 0; i < len; i++)
Antoine Pitrou64672132010-01-13 07:55:48 +0000238 BLOOM_ADD(mask, ptr[i]);
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000239
240 return mask;
241}
242
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000243Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000244{
245 Py_ssize_t i;
246
247 for (i = 0; i < setlen; i++)
248 if (set[i] == chr)
249 return 1;
250
Fredrik Lundh77633512006-05-23 19:47:35 +0000251 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000252}
253
/* True if chr is a member of set: the bloom mask is tested first so the
   linear search only runs for likely hits.  The whole expansion is
   parenthesized so the macro can be negated or combined with other
   operators without changing its meaning. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                            \
    (BLOOM((mask), (chr)) && unicode_member((chr), (set), (setlen)))
256
Guido van Rossumd57fd912000-03-10 22:53:23 +0000257/* --- Unicode Object ----------------------------------------------------- */
258
259static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000260int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000261 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262{
263 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000264
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000265 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 if (unicode->length == length)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000267 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000268
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000269 /* Resizing shared object (unicode_empty or single character
270 objects) in-place is not allowed. Use PyUnicode_Resize()
271 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000272
Benjamin Peterson857ce152009-01-31 16:29:18 +0000273 if (unicode == unicode_empty ||
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000274 (unicode->length == 1 &&
275 unicode->str[0] < 256U &&
276 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000278 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279 return -1;
280 }
281
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000282 /* We allocate one more byte to make sure the string is Ux0000 terminated.
283 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000284 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000285 it contains). */
286
Guido van Rossumd57fd912000-03-10 22:53:23 +0000287 oldstr = unicode->str;
Neal Norwitz419fd492008-03-17 20:22:43 +0000288 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000289 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000291 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000292 PyErr_NoMemory();
293 return -1;
294 }
295 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000296 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000298 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000300 if (unicode->defenc) {
Georg Brandld070cc52010-08-01 21:06:46 +0000301 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000302 }
303 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000304
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305 return 0;
306}
307
308/* We allocate one more byte to make sure the string is
Georg Brandle27d0442010-08-01 20:54:30 +0000309 Ux0000 terminated; some code relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000310
311 XXX This allocator could further be enhanced by assuring that the
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000312 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000313
314*/
315
316static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000317PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000318{
319 register PyUnicodeObject *unicode;
320
Andrew Dalkee0df7622006-05-27 11:04:36 +0000321 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000322 if (length == 0 && unicode_empty != NULL) {
323 Py_INCREF(unicode_empty);
324 return unicode_empty;
325 }
326
Neal Norwitze7d8be82008-07-31 17:17:14 +0000327 /* Ensure we won't overflow the size. */
328 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
329 return (PyUnicodeObject *)PyErr_NoMemory();
330 }
331
Guido van Rossumd57fd912000-03-10 22:53:23 +0000332 /* Unicode freelist & memory allocation */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000333 if (free_list) {
334 unicode = free_list;
335 free_list = *(PyUnicodeObject **)unicode;
336 numfree--;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000337 if (unicode->str) {
338 /* Keep-Alive optimization: we only upsize the buffer,
339 never downsize it. */
340 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000341 unicode_resize(unicode, length) < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000342 PyObject_DEL(unicode->str);
343 unicode->str = NULL;
344 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000345 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000346 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000347 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
348 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000349 }
350 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000351 }
352 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000353 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000354 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000355 if (unicode == NULL)
356 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000357 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
358 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000359 }
360
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000361 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000362 PyErr_NoMemory();
363 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000364 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000365 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000366 * the caller fails before initializing str -- unicode_resize()
367 * reads str[0], and the Keep-Alive optimization can keep memory
368 * allocated for str alive across a call to unicode_dealloc(unicode).
369 * We don't want unicode_resize to read uninitialized memory in
370 * that case.
371 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000372 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000373 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000374 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000375 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000376 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000377 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000378
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000379 onError:
Amaury Forgeot d'Arc06847b12008-07-31 23:39:05 +0000380 /* XXX UNREF/NEWREF interface should be more symmetrical */
381 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000382 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000383 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000384 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000385}
386
387static
Guido van Rossum9475a232001-10-05 20:51:39 +0000388void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000390 if (PyUnicode_CheckExact(unicode) &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000391 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000392 /* Keep-Alive optimization */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000393 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
394 PyObject_DEL(unicode->str);
395 unicode->str = NULL;
396 unicode->length = 0;
397 }
398 if (unicode->defenc) {
Georg Brandld070cc52010-08-01 21:06:46 +0000399 Py_CLEAR(unicode->defenc);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000400 }
401 /* Add to free list */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000402 *(PyUnicodeObject **)unicode = free_list;
403 free_list = unicode;
404 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000405 }
406 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000407 PyObject_DEL(unicode->str);
408 Py_XDECREF(unicode->defenc);
409 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000410 }
411}
412
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000413static
414int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000415{
416 register PyUnicodeObject *v;
417
418 /* Argument checks */
419 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000420 PyErr_BadInternalCall();
421 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000422 }
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000423 v = *unicode;
Christian Heimese93237d2007-12-19 02:37:44 +0000424 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000425 PyErr_BadInternalCall();
426 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000427 }
428
429 /* Resizing unicode_empty and single character objects is not
430 possible since these are being shared. We simply return a fresh
431 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000432 if (v->length != length &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000433 (v == unicode_empty || v->length == 1)) {
434 PyUnicodeObject *w = _PyUnicode_New(length);
435 if (w == NULL)
436 return -1;
437 Py_UNICODE_COPY(w->str, v->str,
438 length < v->length ? length : v->length);
439 Py_DECREF(*unicode);
440 *unicode = w;
441 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000442 }
443
444 /* Note that we don't have to modify *unicode for unshared Unicode
445 objects, since we can modify them in-place. */
446 return unicode_resize(v, length);
447}
448
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000449int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
450{
451 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
452}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000453
Guido van Rossumd57fd912000-03-10 22:53:23 +0000454PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000455 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000456{
457 PyUnicodeObject *unicode;
458
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000459 /* If the Unicode data is known at construction time, we can apply
460 some optimizations which share commonly used objects. */
461 if (u != NULL) {
462
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000463 /* Optimization for empty strings */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200464 if (size == 0)
465 _Py_RETURN_UNICODE_EMPTY();
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000466
467 /* Single character Unicode objects in the Latin-1 range are
468 shared when using this constructor */
469 if (size == 1 && *u < 256) {
470 unicode = unicode_latin1[*u];
471 if (!unicode) {
472 unicode = _PyUnicode_New(1);
473 if (!unicode)
474 return NULL;
475 unicode->str[0] = *u;
476 unicode_latin1[*u] = unicode;
477 }
478 Py_INCREF(unicode);
479 return (PyObject *)unicode;
480 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000481 }
Tim Petersced69f82003-09-16 20:30:58 +0000482
Guido van Rossumd57fd912000-03-10 22:53:23 +0000483 unicode = _PyUnicode_New(size);
484 if (!unicode)
485 return NULL;
486
487 /* Copy the Unicode data into the new object */
488 if (u != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000489 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000490
491 return (PyObject *)unicode;
492}
493
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000494PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
495{
496 PyUnicodeObject *unicode;
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000497
Benjamin Peterson857ce152009-01-31 16:29:18 +0000498 if (size < 0) {
499 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000500 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson857ce152009-01-31 16:29:18 +0000501 return NULL;
502 }
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000503
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000504 /* If the Unicode data is known at construction time, we can apply
505 some optimizations which share commonly used objects.
506 Also, this means the input must be UTF-8, so fall back to the
507 UTF-8 decoder at the end. */
508 if (u != NULL) {
509
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000510 /* Optimization for empty strings */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200511 if (size == 0)
512 _Py_RETURN_UNICODE_EMPTY();
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000513
514 /* Single characters are shared when using this constructor.
515 Restrict to ASCII, since the input must be UTF-8. */
516 if (size == 1 && Py_CHARMASK(*u) < 128) {
517 unicode = unicode_latin1[Py_CHARMASK(*u)];
518 if (!unicode) {
519 unicode = _PyUnicode_New(1);
520 if (!unicode)
521 return NULL;
522 unicode->str[0] = Py_CHARMASK(*u);
523 unicode_latin1[Py_CHARMASK(*u)] = unicode;
524 }
525 Py_INCREF(unicode);
526 return (PyObject *)unicode;
527 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000528
529 return PyUnicode_DecodeUTF8(u, size, NULL);
530 }
531
532 unicode = _PyUnicode_New(size);
533 if (!unicode)
534 return NULL;
535
536 return (PyObject *)unicode;
537}
538
539PyObject *PyUnicode_FromString(const char *u)
540{
541 size_t size = strlen(u);
542 if (size > PY_SSIZE_T_MAX) {
543 PyErr_SetString(PyExc_OverflowError, "input too long");
544 return NULL;
545 }
546
547 return PyUnicode_FromStringAndSize(u, size);
548}
549
/* _Py_UNICODE_NEXT is a private macro used to retrieve the character pointed
 * by 'ptr', possibly combining surrogate pairs on narrow builds.
 * 'ptr' and 'end' must be Py_UNICODE*, with 'ptr' pointing at the character
 * that should be returned and 'end' pointing to the end of the buffer
 * (one past the last character).
 * ('end' is used on narrow builds to detect a lone surrogate at the
 * end of the buffer that should be returned unchanged: a pair is only
 * combined when both of its halves lie before 'end', so nothing at or
 * beyond 'end' is ever read to complete a pair.)
 * The ptr and end arguments should be side-effect free and ptr must be an
 * lvalue.
 * The type of the returned char is always Py_UCS4.
 *
 * Note: the macro advances ptr to next char, so it might have side-effects
 *       (especially if used with other macros).
 */

/* helper macros used by _Py_UNICODE_NEXT */
#define _Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define _Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value. */
#define _Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)

#ifdef Py_UNICODE_WIDE
#define _Py_UNICODE_NEXT(ptr, end) ((Py_UCS4)*(ptr)++)
#else
#define _Py_UNICODE_NEXT(ptr, end)                                      \
     (((((end) - (ptr)) > 1) &&                                         \
       _Py_UNICODE_IS_HIGH_SURROGATE(*(ptr)) &&                         \
       _Py_UNICODE_IS_LOW_SURROGATE((ptr)[1])) ?                        \
      ((ptr) += 2,_Py_UNICODE_JOIN_SURROGATES((ptr)[-2], (ptr)[-1])) :  \
      (Py_UCS4)*(ptr)++)
#endif
580
Guido van Rossumd57fd912000-03-10 22:53:23 +0000581#ifdef HAVE_WCHAR_H
582
Mark Dickinson6b265f12009-03-18 16:07:26 +0000583#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
584# define CONVERT_WCHAR_TO_SURROGATES
585#endif
586
587#ifdef CONVERT_WCHAR_TO_SURROGATES
588
589/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
590 to convert from UTF32 to UTF16. */
591
592PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
593 Py_ssize_t size)
594{
595 PyUnicodeObject *unicode;
596 register Py_ssize_t i;
597 Py_ssize_t alloc;
598 const wchar_t *orig_w;
599
600 if (w == NULL) {
601 PyErr_BadInternalCall();
602 return NULL;
603 }
604
605 alloc = size;
606 orig_w = w;
607 for (i = size; i > 0; i--) {
608 if (*w > 0xFFFF)
609 alloc++;
610 w++;
611 }
612 w = orig_w;
613 unicode = _PyUnicode_New(alloc);
614 if (!unicode)
615 return NULL;
616
617 /* Copy the wchar_t data into the new object */
618 {
619 register Py_UNICODE *u;
620 u = PyUnicode_AS_UNICODE(unicode);
621 for (i = size; i > 0; i--) {
622 if (*w > 0xFFFF) {
623 wchar_t ordinal = *w++;
624 ordinal -= 0x10000;
625 *u++ = 0xD800 | (ordinal >> 10);
626 *u++ = 0xDC00 | (ordinal & 0x3FF);
627 }
628 else
629 *u++ = *w++;
630 }
631 }
632 return (PyObject *)unicode;
633}
634
635#else
636
Guido van Rossumd57fd912000-03-10 22:53:23 +0000637PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000638 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000639{
640 PyUnicodeObject *unicode;
641
642 if (w == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000643 PyErr_BadInternalCall();
644 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000645 }
646
647 unicode = _PyUnicode_New(size);
648 if (!unicode)
649 return NULL;
650
651 /* Copy the wchar_t data into the new object */
652#ifdef HAVE_USABLE_WCHAR_T
653 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000654#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000655 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000656 register Py_UNICODE *u;
657 register Py_ssize_t i;
658 u = PyUnicode_AS_UNICODE(unicode);
659 for (i = size; i > 0; i--)
660 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000661 }
662#endif
663
664 return (PyObject *)unicode;
665}
666
Mark Dickinson6b265f12009-03-18 16:07:26 +0000667#endif /* CONVERT_WCHAR_TO_SURROGATES */
668
669#undef CONVERT_WCHAR_TO_SURROGATES
670
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000671static void
672makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
673{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000674 *fmt++ = '%';
675 if (width) {
676 if (zeropad)
677 *fmt++ = '0';
678 fmt += sprintf(fmt, "%d", width);
679 }
680 if (precision)
681 fmt += sprintf(fmt, ".%d", precision);
682 if (longflag)
683 *fmt++ = 'l';
684 else if (size_tflag) {
685 char *f = PY_FORMAT_SIZE_T;
686 while (*f)
687 *fmt++ = *f++;
688 }
689 *fmt++ = c;
690 *fmt = '\0';
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000691}
692
/* Append the NUL-terminated char string to the output, one character
   per output slot, each byte taken as an unsigned char.  Relies on two
   variables of the enclosing scope: the scratch pointer 'copy' and the
   output cursor 's', which is advanced past the appended characters. */
#define appendstring(string)                    \
    do {                                        \
        copy = string;                          \
        while (*copy) {                         \
            *s++ = (unsigned char)*copy;        \
            copy++;                             \
        }                                       \
    } while (0)
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000699
700PyObject *
701PyUnicode_FromFormatV(const char *format, va_list vargs)
702{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000703 va_list count;
704 Py_ssize_t callcount = 0;
705 PyObject **callresults = NULL;
706 PyObject **callresult = NULL;
707 Py_ssize_t n = 0;
708 int width = 0;
709 int precision = 0;
710 int zeropad;
711 const char* f;
712 Py_UNICODE *s;
713 PyObject *string;
714 /* used by sprintf */
715 char buffer[21];
716 /* use abuffer instead of buffer, if we need more space
717 * (which can happen if there's a format specifier with width). */
718 char *abuffer = NULL;
719 char *realbuffer;
720 Py_ssize_t abuffersize = 0;
721 char fmt[60]; /* should be enough for %0width.precisionld */
722 const char *copy;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000723
724#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson857ce152009-01-31 16:29:18 +0000725 Py_MEMCPY(count, vargs, sizeof(va_list));
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000726#else
727#ifdef __va_copy
Benjamin Peterson857ce152009-01-31 16:29:18 +0000728 __va_copy(count, vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000729#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000730 count = vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000731#endif
732#endif
Walter Dörwalded960ac2009-05-03 22:36:33 +0000733 /* step 1: count the number of %S/%R/%s format specifications
734 * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
735 * objects once during step 3 and put the result in an array) */
Benjamin Peterson857ce152009-01-31 16:29:18 +0000736 for (f = format; *f; f++) {
Walter Dörwalded960ac2009-05-03 22:36:33 +0000737 if (*f == '%') {
738 if (*(f+1)=='%')
739 continue;
Walter Dörwald342c8db2009-05-03 22:46:07 +0000740 if (*(f+1)=='S' || *(f+1)=='R')
Walter Dörwalded960ac2009-05-03 22:36:33 +0000741 ++callcount;
742 while (isdigit((unsigned)*f))
743 width = (width*10) + *f++ - '0';
744 while (*++f && *f != '%' && !isalpha((unsigned)*f))
745 ;
746 if (*f == 's')
747 ++callcount;
748 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000749 }
750 /* step 2: allocate memory for the results of
Walter Dörwalded960ac2009-05-03 22:36:33 +0000751 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson857ce152009-01-31 16:29:18 +0000752 if (callcount) {
753 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
754 if (!callresults) {
755 PyErr_NoMemory();
756 return NULL;
757 }
758 callresult = callresults;
759 }
760 /* step 3: figure out how large a buffer we need */
761 for (f = format; *f; f++) {
762 if (*f == '%') {
763 const char* p = f;
764 width = 0;
765 while (isdigit((unsigned)*f))
766 width = (width*10) + *f++ - '0';
767 while (*++f && *f != '%' && !isalpha((unsigned)*f))
768 ;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000769
Benjamin Peterson857ce152009-01-31 16:29:18 +0000770 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
771 * they don't affect the amount of space we reserve.
772 */
773 if ((*f == 'l' || *f == 'z') &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000774 (f[1] == 'd' || f[1] == 'u'))
775 ++f;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000776
Benjamin Peterson857ce152009-01-31 16:29:18 +0000777 switch (*f) {
778 case 'c':
Serhiy Storchakaba908c72013-06-23 20:22:09 +0300779 {
780 int ordinal = va_arg(count, int);
781#ifdef Py_UNICODE_WIDE
782 if (ordinal < 0 || ordinal > 0x10ffff) {
783 PyErr_SetString(PyExc_OverflowError,
784 "%c arg not in range(0x110000) "
785 "(wide Python build)");
786 goto fail;
787 }
788#else
789 if (ordinal < 0 || ordinal > 0xffff) {
790 PyErr_SetString(PyExc_OverflowError,
791 "%c arg not in range(0x10000) "
792 "(narrow Python build)");
793 goto fail;
794 }
795#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +0000796 /* fall through... */
Serhiy Storchakaba908c72013-06-23 20:22:09 +0300797 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000798 case '%':
799 n++;
800 break;
801 case 'd': case 'u': case 'i': case 'x':
802 (void) va_arg(count, int);
803 /* 20 bytes is enough to hold a 64-bit
804 integer. Decimal takes the most space.
805 This isn't enough for octal.
806 If a width is specified we need more
807 (which we allocate later). */
808 if (width < 20)
809 width = 20;
810 n += width;
811 if (abuffersize < width)
812 abuffersize = width;
813 break;
814 case 's':
815 {
816 /* UTF-8 */
Georg Brandlba68a992009-05-05 09:19:43 +0000817 const char *s = va_arg(count, const char*);
Walter Dörwalded960ac2009-05-03 22:36:33 +0000818 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
819 if (!str)
820 goto fail;
821 n += PyUnicode_GET_SIZE(str);
822 /* Remember the str and switch to the next slot */
823 *callresult++ = str;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000824 break;
825 }
826 case 'U':
827 {
828 PyObject *obj = va_arg(count, PyObject *);
829 assert(obj && PyUnicode_Check(obj));
830 n += PyUnicode_GET_SIZE(obj);
831 break;
832 }
833 case 'V':
834 {
835 PyObject *obj = va_arg(count, PyObject *);
836 const char *str = va_arg(count, const char *);
837 assert(obj || str);
838 assert(!obj || PyUnicode_Check(obj));
839 if (obj)
840 n += PyUnicode_GET_SIZE(obj);
841 else
842 n += strlen(str);
843 break;
844 }
845 case 'S':
846 {
847 PyObject *obj = va_arg(count, PyObject *);
848 PyObject *str;
849 assert(obj);
850 str = PyObject_Str(obj);
851 if (!str)
852 goto fail;
Victor Stinner2af8d2f2014-07-30 00:39:05 +0200853 n += PyString_GET_SIZE(str);
Benjamin Peterson857ce152009-01-31 16:29:18 +0000854 /* Remember the str and switch to the next slot */
855 *callresult++ = str;
856 break;
857 }
858 case 'R':
859 {
860 PyObject *obj = va_arg(count, PyObject *);
861 PyObject *repr;
862 assert(obj);
863 repr = PyObject_Repr(obj);
864 if (!repr)
865 goto fail;
866 n += PyUnicode_GET_SIZE(repr);
867 /* Remember the repr and switch to the next slot */
868 *callresult++ = repr;
869 break;
870 }
871 case 'p':
872 (void) va_arg(count, int);
873 /* maximum 64-bit pointer representation:
874 * 0xffffffffffffffff
875 * so 19 characters is enough.
876 * XXX I count 18 -- what's the extra for?
877 */
878 n += 19;
879 break;
880 default:
881 /* if we stumble upon an unknown
882 formatting code, copy the rest of
883 the format string to the output
884 string. (we cannot just skip the
885 code, since there's no way to know
886 what's in the argument list) */
887 n += strlen(p);
888 goto expand;
889 }
890 } else
891 n++;
892 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000893 expand:
Benjamin Peterson857ce152009-01-31 16:29:18 +0000894 if (abuffersize > 20) {
895 abuffer = PyObject_Malloc(abuffersize);
896 if (!abuffer) {
897 PyErr_NoMemory();
898 goto fail;
899 }
900 realbuffer = abuffer;
901 }
902 else
903 realbuffer = buffer;
904 /* step 4: fill the buffer */
905 /* Since we've analyzed how much space we need for the worst case,
906 we don't have to resize the string.
907 There can be no errors beyond this point. */
908 string = PyUnicode_FromUnicode(NULL, n);
909 if (!string)
910 goto fail;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000911
Benjamin Peterson857ce152009-01-31 16:29:18 +0000912 s = PyUnicode_AS_UNICODE(string);
913 callresult = callresults;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000914
Benjamin Peterson857ce152009-01-31 16:29:18 +0000915 for (f = format; *f; f++) {
916 if (*f == '%') {
917 const char* p = f++;
918 int longflag = 0;
919 int size_tflag = 0;
920 zeropad = (*f == '0');
921 /* parse the width.precision part */
922 width = 0;
923 while (isdigit((unsigned)*f))
924 width = (width*10) + *f++ - '0';
925 precision = 0;
926 if (*f == '.') {
927 f++;
928 while (isdigit((unsigned)*f))
929 precision = (precision*10) + *f++ - '0';
930 }
931 /* handle the long flag, but only for %ld and %lu.
932 others can be added when necessary. */
933 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
934 longflag = 1;
935 ++f;
936 }
937 /* handle the size_t flag. */
938 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
939 size_tflag = 1;
940 ++f;
941 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000942
Benjamin Peterson857ce152009-01-31 16:29:18 +0000943 switch (*f) {
944 case 'c':
945 *s++ = va_arg(vargs, int);
946 break;
947 case 'd':
948 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
949 if (longflag)
950 sprintf(realbuffer, fmt, va_arg(vargs, long));
951 else if (size_tflag)
952 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
953 else
954 sprintf(realbuffer, fmt, va_arg(vargs, int));
955 appendstring(realbuffer);
956 break;
957 case 'u':
958 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
959 if (longflag)
960 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
961 else if (size_tflag)
962 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
963 else
964 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
965 appendstring(realbuffer);
966 break;
967 case 'i':
968 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
969 sprintf(realbuffer, fmt, va_arg(vargs, int));
970 appendstring(realbuffer);
971 break;
972 case 'x':
973 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
974 sprintf(realbuffer, fmt, va_arg(vargs, int));
975 appendstring(realbuffer);
976 break;
977 case 's':
978 {
Walter Dörwalded960ac2009-05-03 22:36:33 +0000979 /* unused, since we already have the result */
980 (void) va_arg(vargs, char *);
981 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
982 PyUnicode_GET_SIZE(*callresult));
983 s += PyUnicode_GET_SIZE(*callresult);
984 /* We're done with the unicode()/repr() => forget it */
985 Py_DECREF(*callresult);
986 /* switch to next unicode()/repr() result */
987 ++callresult;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000988 break;
989 }
990 case 'U':
991 {
992 PyObject *obj = va_arg(vargs, PyObject *);
993 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
994 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
995 s += size;
996 break;
997 }
998 case 'V':
999 {
1000 PyObject *obj = va_arg(vargs, PyObject *);
1001 const char *str = va_arg(vargs, const char *);
1002 if (obj) {
1003 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1004 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1005 s += size;
1006 } else {
1007 appendstring(str);
1008 }
1009 break;
1010 }
1011 case 'S':
1012 case 'R':
1013 {
Victor Stinner2af8d2f2014-07-30 00:39:05 +02001014 const char *str = PyString_AS_STRING(*callresult);
Benjamin Peterson857ce152009-01-31 16:29:18 +00001015 /* unused, since we already have the result */
1016 (void) va_arg(vargs, PyObject *);
Victor Stinner2af8d2f2014-07-30 00:39:05 +02001017 appendstring(str);
Benjamin Peterson857ce152009-01-31 16:29:18 +00001018 /* We're done with the unicode()/repr() => forget it */
1019 Py_DECREF(*callresult);
1020 /* switch to next unicode()/repr() result */
1021 ++callresult;
1022 break;
1023 }
1024 case 'p':
1025 sprintf(buffer, "%p", va_arg(vargs, void*));
1026 /* %p is ill-defined: ensure leading 0x. */
1027 if (buffer[1] == 'X')
1028 buffer[1] = 'x';
1029 else if (buffer[1] != 'x') {
1030 memmove(buffer+2, buffer, strlen(buffer)+1);
1031 buffer[0] = '0';
1032 buffer[1] = 'x';
1033 }
1034 appendstring(buffer);
1035 break;
1036 case '%':
1037 *s++ = '%';
1038 break;
1039 default:
1040 appendstring(p);
1041 goto end;
1042 }
1043 } else
1044 *s++ = *f;
1045 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001046
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001047 end:
Benjamin Peterson857ce152009-01-31 16:29:18 +00001048 if (callresults)
1049 PyObject_Free(callresults);
1050 if (abuffer)
1051 PyObject_Free(abuffer);
1052 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1053 return string;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001054 fail:
Benjamin Peterson857ce152009-01-31 16:29:18 +00001055 if (callresults) {
1056 PyObject **callresult2 = callresults;
1057 while (callresult2 < callresult) {
1058 Py_DECREF(*callresult2);
1059 ++callresult2;
1060 }
1061 PyObject_Free(callresults);
1062 }
1063 if (abuffer)
1064 PyObject_Free(abuffer);
1065 return NULL;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001066}
1067
1068#undef appendstring
1069
1070PyObject *
1071PyUnicode_FromFormat(const char *format, ...)
1072{
Benjamin Peterson857ce152009-01-31 16:29:18 +00001073 PyObject* ret;
1074 va_list vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001075
1076#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson857ce152009-01-31 16:29:18 +00001077 va_start(vargs, format);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001078#else
Benjamin Peterson857ce152009-01-31 16:29:18 +00001079 va_start(vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001080#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00001081 ret = PyUnicode_FromFormatV(format, vargs);
1082 va_end(vargs);
1083 return ret;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001084}
1085
Martin v. Löwis18e16552006-02-15 17:27:45 +00001086Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001087 wchar_t *w,
1088 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001089{
1090 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001091 PyErr_BadInternalCall();
1092 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001093 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001094
1095 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001096 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001097 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001098
Guido van Rossumd57fd912000-03-10 22:53:23 +00001099#ifdef HAVE_USABLE_WCHAR_T
1100 memcpy(w, unicode->str, size * sizeof(wchar_t));
1101#else
1102 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001103 register Py_UNICODE *u;
1104 register Py_ssize_t i;
1105 u = PyUnicode_AS_UNICODE(unicode);
1106 for (i = size; i > 0; i--)
1107 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001108 }
1109#endif
1110
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001111 if (size > PyUnicode_GET_SIZE(unicode))
1112 return PyUnicode_GET_SIZE(unicode);
1113 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001114 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001115}
1116
1117#endif
1118
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001119PyObject *PyUnicode_FromOrdinal(int ordinal)
1120{
Hye-Shik Chang40574832004-04-06 07:24:51 +00001121 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001122
1123#ifdef Py_UNICODE_WIDE
1124 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001125 PyErr_SetString(PyExc_ValueError,
1126 "unichr() arg not in range(0x110000) "
1127 "(wide Python build)");
1128 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001129 }
1130#else
1131 if (ordinal < 0 || ordinal > 0xffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001132 PyErr_SetString(PyExc_ValueError,
1133 "unichr() arg not in range(0x10000) "
1134 "(narrow Python build)");
1135 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001136 }
1137#endif
1138
Hye-Shik Chang40574832004-04-06 07:24:51 +00001139 s[0] = (Py_UNICODE)ordinal;
1140 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001141}
1142
Guido van Rossumd57fd912000-03-10 22:53:23 +00001143PyObject *PyUnicode_FromObject(register PyObject *obj)
1144{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001145 /* XXX Perhaps we should make this API an alias of
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001146 PyObject_Unicode() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001147 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001148 Py_INCREF(obj);
1149 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001150 }
1151 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001152 /* For a Unicode subtype that's not a Unicode object,
1153 return a true Unicode object with the same data. */
1154 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1155 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001156 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001157 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1158}
1159
1160PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001161 const char *encoding,
1162 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001163{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001164 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001165 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001166 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001167
Guido van Rossumd57fd912000-03-10 22:53:23 +00001168 if (obj == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001169 PyErr_BadInternalCall();
1170 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001171 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001172
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001173#if 0
1174 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001175 that no encodings is given and then redirect to
1176 PyObject_Unicode() which then applies the additional logic for
1177 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001178
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001179 NOTE: This API should really only be used for object which
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001180 represent *encoded* Unicode !
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001181
1182 */
Benjamin Peterson857ce152009-01-31 16:29:18 +00001183 if (PyUnicode_Check(obj)) {
1184 if (encoding) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001185 PyErr_SetString(PyExc_TypeError,
1186 "decoding Unicode is not supported");
1187 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001188 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001189 return PyObject_Unicode(obj);
1190 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001191#else
1192 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001193 PyErr_SetString(PyExc_TypeError,
1194 "decoding Unicode is not supported");
1195 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001196 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001197#endif
1198
1199 /* Coerce object */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001200 if (PyString_Check(obj)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001201 s = PyString_AS_STRING(obj);
1202 len = PyString_GET_SIZE(obj);
Christian Heimes1a6387e2008-03-26 12:49:49 +00001203 }
Christian Heimes3497f942008-05-26 12:29:14 +00001204 else if (PyByteArray_Check(obj)) {
Christian Heimes1a6387e2008-03-26 12:49:49 +00001205 /* Python 2.x specific */
1206 PyErr_Format(PyExc_TypeError,
1207 "decoding bytearray is not supported");
1208 return NULL;
1209 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001210 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001211 /* Overwrite the error message with something more useful in
1212 case of a TypeError. */
1213 if (PyErr_ExceptionMatches(PyExc_TypeError))
1214 PyErr_Format(PyExc_TypeError,
1215 "coercing to Unicode: need string or buffer, "
1216 "%.80s found",
1217 Py_TYPE(obj)->tp_name);
1218 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001219 }
Tim Petersced69f82003-09-16 20:30:58 +00001220
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001221 /* Convert to Unicode */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02001222 if (len == 0)
1223 _Py_RETURN_UNICODE_EMPTY();
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001224
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02001225 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001226 return v;
1227
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001228 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001229 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001230}
1231
1232PyObject *PyUnicode_Decode(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001233 Py_ssize_t size,
1234 const char *encoding,
1235 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001236{
1237 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001238
1239 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001240 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001241
1242 /* Shortcuts for common default encodings */
1243 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001244 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001245 else if (strcmp(encoding, "latin-1") == 0)
1246 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001247#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1248 else if (strcmp(encoding, "mbcs") == 0)
1249 return PyUnicode_DecodeMBCS(s, size, errors);
1250#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001251 else if (strcmp(encoding, "ascii") == 0)
1252 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001253
1254 /* Decode via the codec registry */
1255 buffer = PyBuffer_FromMemory((void *)s, size);
1256 if (buffer == NULL)
1257 goto onError;
1258 unicode = PyCodec_Decode(buffer, encoding, errors);
1259 if (unicode == NULL)
1260 goto onError;
1261 if (!PyUnicode_Check(unicode)) {
1262 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001263 "decoder did not return an unicode object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001264 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001265 Py_DECREF(unicode);
1266 goto onError;
1267 }
1268 Py_DECREF(buffer);
1269 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001270
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001271 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001272 Py_XDECREF(buffer);
1273 return NULL;
1274}
1275
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001276PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1277 const char *encoding,
1278 const char *errors)
1279{
1280 PyObject *v;
1281
1282 if (!PyUnicode_Check(unicode)) {
1283 PyErr_BadArgument();
1284 goto onError;
1285 }
1286
1287 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001288 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001289
1290 /* Decode via the codec registry */
1291 v = PyCodec_Decode(unicode, encoding, errors);
1292 if (v == NULL)
1293 goto onError;
1294 return v;
1295
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001296 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001297 return NULL;
1298}
1299
Guido van Rossumd57fd912000-03-10 22:53:23 +00001300PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001301 Py_ssize_t size,
1302 const char *encoding,
1303 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001304{
1305 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001306
Guido van Rossumd57fd912000-03-10 22:53:23 +00001307 unicode = PyUnicode_FromUnicode(s, size);
1308 if (unicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001309 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001310 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1311 Py_DECREF(unicode);
1312 return v;
1313}
1314
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001315PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1316 const char *encoding,
1317 const char *errors)
1318{
1319 PyObject *v;
1320
1321 if (!PyUnicode_Check(unicode)) {
1322 PyErr_BadArgument();
1323 goto onError;
1324 }
1325
1326 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001327 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001328
1329 /* Encode via the codec registry */
1330 v = PyCodec_Encode(unicode, encoding, errors);
1331 if (v == NULL)
1332 goto onError;
1333 return v;
1334
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001335 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001336 return NULL;
1337}
1338
Guido van Rossumd57fd912000-03-10 22:53:23 +00001339PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1340 const char *encoding,
1341 const char *errors)
1342{
1343 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001344
Guido van Rossumd57fd912000-03-10 22:53:23 +00001345 if (!PyUnicode_Check(unicode)) {
1346 PyErr_BadArgument();
1347 goto onError;
1348 }
Fred Drakee4315f52000-05-09 19:53:39 +00001349
Tim Petersced69f82003-09-16 20:30:58 +00001350 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001351 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001352
1353 /* Shortcuts for common default encodings */
1354 if (errors == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001355 if (strcmp(encoding, "utf-8") == 0)
1356 return PyUnicode_AsUTF8String(unicode);
1357 else if (strcmp(encoding, "latin-1") == 0)
1358 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001359#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001360 else if (strcmp(encoding, "mbcs") == 0)
1361 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001362#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001363 else if (strcmp(encoding, "ascii") == 0)
1364 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001365 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001366
1367 /* Encode via the codec registry */
1368 v = PyCodec_Encode(unicode, encoding, errors);
1369 if (v == NULL)
1370 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001371 if (!PyString_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001372 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001373 "encoder did not return a string object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001374 Py_TYPE(v)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001375 Py_DECREF(v);
1376 goto onError;
1377 }
1378 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001379
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001380 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001381 return NULL;
1382}
1383
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001384PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001385 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001386{
1387 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1388
1389 if (v)
1390 return v;
1391 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1392 if (v && errors == NULL)
1393 ((PyUnicodeObject *)unicode)->defenc = v;
1394 return v;
1395}
1396
Guido van Rossumd57fd912000-03-10 22:53:23 +00001397Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1398{
1399 if (!PyUnicode_Check(unicode)) {
1400 PyErr_BadArgument();
1401 goto onError;
1402 }
1403 return PyUnicode_AS_UNICODE(unicode);
1404
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001405 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001406 return NULL;
1407}
1408
Martin v. Löwis18e16552006-02-15 17:27:45 +00001409Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001410{
1411 if (!PyUnicode_Check(unicode)) {
1412 PyErr_BadArgument();
1413 goto onError;
1414 }
1415 return PyUnicode_GET_SIZE(unicode);
1416
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001417 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001418 return -1;
1419}
1420
Thomas Wouters78890102000-07-22 19:25:51 +00001421const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001422{
1423 return unicode_default_encoding;
1424}
1425
1426int PyUnicode_SetDefaultEncoding(const char *encoding)
1427{
1428 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001429
Fred Drakee4315f52000-05-09 19:53:39 +00001430 /* Make sure the encoding is valid. As side effect, this also
1431 loads the encoding into the codec registry cache. */
1432 v = _PyCodec_Lookup(encoding);
1433 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001434 goto onError;
Fred Drakee4315f52000-05-09 19:53:39 +00001435 Py_DECREF(v);
1436 strncpy(unicode_default_encoding,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001437 encoding,
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02001438 sizeof(unicode_default_encoding) - 1);
Fred Drakee4315f52000-05-09 19:53:39 +00001439 return 0;
1440
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001441 onError:
Fred Drakee4315f52000-05-09 19:53:39 +00001442 return -1;
1443}
1444
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001445/* error handling callback helper:
1446 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001447 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001448 and adjust various state variables.
1449 return 0 on success, -1 on error
1450*/
1451
1452static
1453int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001454 const char *encoding, const char *reason,
1455 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1456 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1457 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001458{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001459 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001460
1461 PyObject *restuple = NULL;
1462 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001463 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1464 Py_ssize_t requiredsize;
1465 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001466 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001467 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001468 int res = -1;
1469
1470 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001471 *errorHandler = PyCodec_LookupError(errors);
1472 if (*errorHandler == NULL)
1473 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001474 }
1475
1476 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001477 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001478 encoding, input, insize, *startinpos, *endinpos, reason);
1479 if (*exceptionObject == NULL)
1480 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001481 }
1482 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001483 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1484 goto onError;
1485 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1486 goto onError;
1487 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1488 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001489 }
1490
1491 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1492 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001493 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001494 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00001495 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001496 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001497 }
1498 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001499 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001500 if (newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001501 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001502 if (newpos<0 || newpos>insize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001503 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1504 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001505 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001506
1507 /* need more space? (at least enough for what we
1508 have+the replacement+the rest of the string (starting
1509 at the new input position), so we won't have to check space
1510 when there are no errors in the rest of the string) */
1511 repptr = PyUnicode_AS_UNICODE(repunicode);
1512 repsize = PyUnicode_GET_SIZE(repunicode);
Benjamin Petersoned4c1302014-09-29 18:18:57 -04001513 requiredsize = *outpos;
1514 if (requiredsize > PY_SSIZE_T_MAX - repsize)
1515 goto overflow;
1516 requiredsize += repsize;
1517 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
1518 goto overflow;
1519 requiredsize += insize - newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001520 if (requiredsize > outsize) {
Benjamin Petersoned4c1302014-09-29 18:18:57 -04001521 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001522 requiredsize = 2*outsize;
1523 if (_PyUnicode_Resize(output, requiredsize) < 0)
1524 goto onError;
1525 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001526 }
1527 *endinpos = newpos;
1528 *inptr = input + newpos;
1529 Py_UNICODE_COPY(*outptr, repptr, repsize);
1530 *outptr += repsize;
1531 *outpos += repsize;
1532 /* we made it! */
1533 res = 0;
1534
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001535 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001536 Py_XDECREF(restuple);
1537 return res;
Benjamin Petersoned4c1302014-09-29 18:18:57 -04001538
1539 overflow:
1540 PyErr_SetString(PyExc_OverflowError,
1541 "decoded result is too long for a Python string");
1542 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001543}
1544
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001545/* --- UTF-7 Codec -------------------------------------------------------- */
1546
Antoine Pitrou653dece2009-05-04 18:32:32 +00001547/* See RFC2152 for details. We encode conservatively and decode liberally. */
1548
/* Three simple macros defining base-64.
 *
 * All of them may evaluate their argument more than once, so only pass
 * side-effect-free expressions. */

/* Is c a base-64 character?
 *
 * The test is spelled out as explicit ASCII ranges instead of using
 * isalnum(): isalnum() is locale dependent, and the decoder passes bytes
 * in the range 0x80-0xFF here, which some locales classify as
 * alphanumeric. */

#define IS_BASE64(c)                    \
    (((c) >= 'A' && (c) <= 'Z') ||      \
     ((c) >= 'a' && (c) <= 'z') ||      \
     ((c) >= '0' && (c) <= '9') ||      \
     (c) == '+' || (c) == '/')

/* Given that c is a base-64 character, what is its base-64 value?
 * (Anything that is not a letter, a digit or '+' maps to 63, so callers
 * must check IS_BASE64(c) first.) */

#define FROM_BASE64(c)                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
1576
1577/* The UTF-7 encoder treats ASCII characters differently according to
1578 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
1579 * the above). See RFC2152. This array identifies these different
1580 * sets:
1581 * 0 : "Set D"
1582 * alphanumeric and '(),-./:?
1583 * 1 : "Set O"
1584 * !"#$%&*;<=>@[]^_`{|}
1585 * 2 : "whitespace"
1586 * ht nl cr sp
1587 * 3 : special (must be base64 encoded)
1588 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
1589 */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001590
/* Category (0 = Set D, 1 = Set O, 2 = whitespace, 3 = special) of each
 * ASCII code point, indexed by the code point itself.  The table is only
 * ever read, hence const. */
static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 *   c        - the character to classify (evaluated several times)
 *   directO  - non-zero if Set O characters are written unencoded
 *   directWS - non-zero if whitespace is written unencoded
 *
 * Set D characters are always direct.  The flag arguments are
 * parenthesized so that compound expressions such as `a || b` keep
 * their meaning inside the expansion. */

#define ENCODE_DIRECT(c, directO, directWS)                 \
    ((c) < 128 && (c) > 0 &&                                \
     ((utf7_category[(c)] == 0) ||                          \
      ((directWS) && (utf7_category[(c)] == 2)) ||          \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001622
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001623PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001624 Py_ssize_t size,
1625 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001626{
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001627 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1628}
1629
Antoine Pitrou653dece2009-05-04 18:32:32 +00001630/* The decoder. The only state we preserve is our read position,
1631 * i.e. how many characters we have consumed. So if we end in the
1632 * middle of a shift sequence we have to back off the read position
1633 * and the output to the beginning of the sequence, otherwise we lose
1634 * all the shift state (seen bits, number of bits seen, high
1635 * surrogate). */
1636
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001637PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001638 Py_ssize_t size,
1639 const char *errors,
1640 Py_ssize_t *consumed)
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001641{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001642 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001643 Py_ssize_t startinpos;
1644 Py_ssize_t endinpos;
1645 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001646 const char *e;
1647 PyUnicodeObject *unicode;
1648 Py_UNICODE *p;
1649 const char *errmsg = "";
1650 int inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001651 Py_UNICODE *shiftOutStart;
1652 unsigned int base64bits = 0;
1653 unsigned long base64buffer = 0;
1654 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001655 PyObject *errorHandler = NULL;
1656 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001657
1658 unicode = _PyUnicode_New(size);
1659 if (!unicode)
1660 return NULL;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001661 if (size == 0) {
1662 if (consumed)
1663 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001664 return (PyObject *)unicode;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001665 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001666
1667 p = unicode->str;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001668 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001669 e = s + size;
1670
1671 while (s < e) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001672 Py_UNICODE ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001673
Antoine Pitrou653dece2009-05-04 18:32:32 +00001674 if (inShift) { /* in a base-64 section */
1675 if (IS_BASE64(ch)) { /* consume a base-64 character */
1676 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1677 base64bits += 6;
1678 s++;
1679 if (base64bits >= 16) {
1680 /* we have enough bits for a UTF-16 value */
1681 Py_UNICODE outCh = (Py_UNICODE)
1682 (base64buffer >> (base64bits-16));
1683 base64bits -= 16;
1684 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchakaf1056722013-10-19 20:37:49 +03001685 assert(outCh <= 0xffff);
Antoine Pitrou653dece2009-05-04 18:32:32 +00001686 if (surrogate) {
1687 /* expecting a second surrogate */
1688 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1689#ifdef Py_UNICODE_WIDE
1690 *p++ = (((surrogate & 0x3FF)<<10)
1691 | (outCh & 0x3FF)) + 0x10000;
1692#else
1693 *p++ = surrogate;
1694 *p++ = outCh;
1695#endif
1696 surrogate = 0;
Antoine Pitrou30402542011-11-15 01:49:40 +01001697 continue;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001698 }
1699 else {
Antoine Pitrou30402542011-11-15 01:49:40 +01001700 *p++ = surrogate;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001701 surrogate = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001702 }
1703 }
Antoine Pitrou30402542011-11-15 01:49:40 +01001704 if (outCh >= 0xD800 && outCh <= 0xDBFF) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001705 /* first surrogate */
1706 surrogate = outCh;
1707 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001708 else {
1709 *p++ = outCh;
1710 }
1711 }
1712 }
1713 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001714 inShift = 0;
1715 s++;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001716 if (surrogate) {
Antoine Pitrou30402542011-11-15 01:49:40 +01001717 *p++ = surrogate;
1718 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001719 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001720 if (base64bits > 0) { /* left-over bits */
1721 if (base64bits >= 6) {
1722 /* We've seen at least one base-64 character */
1723 errmsg = "partial character in shift sequence";
1724 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001725 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001726 else {
1727 /* Some bits remain; they should be zero */
1728 if (base64buffer != 0) {
1729 errmsg = "non-zero padding bits in shift sequence";
1730 goto utf7Error;
1731 }
1732 }
1733 }
1734 if (ch != '-') {
1735 /* '-' is absorbed; other terminating
1736 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001737 *p++ = ch;
1738 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001739 }
1740 }
1741 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001742 startinpos = s-starts;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001743 s++; /* consume '+' */
1744 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001745 s++;
1746 *p++ = '+';
Antoine Pitrou653dece2009-05-04 18:32:32 +00001747 }
1748 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001749 inShift = 1;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001750 shiftOutStart = p;
1751 base64bits = 0;
Serhiy Storchakaf1056722013-10-19 20:37:49 +03001752 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001753 }
1754 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001755 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001756 *p++ = ch;
1757 s++;
1758 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001759 else {
1760 startinpos = s-starts;
1761 s++;
1762 errmsg = "unexpected special character";
1763 goto utf7Error;
1764 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001765 continue;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001766utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001767 outpos = p-PyUnicode_AS_UNICODE(unicode);
1768 endinpos = s-starts;
1769 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001770 errors, &errorHandler,
1771 "utf7", errmsg,
1772 starts, size, &startinpos, &endinpos, &exc, &s,
1773 &unicode, &outpos, &p))
1774 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001775 }
1776
Antoine Pitrou653dece2009-05-04 18:32:32 +00001777 /* end of string */
1778
1779 if (inShift && !consumed) { /* in shift sequence, no more to follow */
1780 /* if we're in an inconsistent state, that's an error */
1781 if (surrogate ||
1782 (base64bits >= 6) ||
1783 (base64bits > 0 && base64buffer != 0)) {
1784 outpos = p-PyUnicode_AS_UNICODE(unicode);
1785 endinpos = size;
1786 if (unicode_decode_call_errorhandler(
1787 errors, &errorHandler,
1788 "utf7", "unterminated shift sequence",
1789 starts, size, &startinpos, &endinpos, &exc, &s,
1790 &unicode, &outpos, &p))
1791 goto onError;
1792 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001793 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001794
1795 /* return state */
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001796 if (consumed) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001797 if (inShift) {
1798 p = shiftOutStart; /* back off output */
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001799 *consumed = startinpos;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001800 }
1801 else {
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001802 *consumed = s-starts;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001803 }
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001804 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001805
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001806 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001807 goto onError;
1808
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001809 Py_XDECREF(errorHandler);
1810 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001811 return (PyObject *)unicode;
1812
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001813 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001814 Py_XDECREF(errorHandler);
1815 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001816 Py_DECREF(unicode);
1817 return NULL;
1818}
1819
1820
1821PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001822 Py_ssize_t size,
Antoine Pitrou653dece2009-05-04 18:32:32 +00001823 int base64SetO,
1824 int base64WhiteSpace,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001825 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001826{
1827 PyObject *v;
1828 /* It might be possible to tighten this worst case */
Alexandre Vassalottifd009162009-07-07 02:17:30 +00001829 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001830 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001831 Py_ssize_t i = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001832 unsigned int base64bits = 0;
1833 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001834 char * out;
1835 char * start;
1836
Alexandre Vassalottifd009162009-07-07 02:17:30 +00001837 if (allocated / 8 != size)
Neal Norwitze7d8be82008-07-31 17:17:14 +00001838 return PyErr_NoMemory();
1839
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001840 if (size == 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00001841 return PyString_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001842
Antoine Pitrou653dece2009-05-04 18:32:32 +00001843 v = PyString_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001844 if (v == NULL)
1845 return NULL;
1846
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001847 start = out = PyString_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001848 for (;i < size; ++i) {
1849 Py_UNICODE ch = s[i];
1850
Antoine Pitrou653dece2009-05-04 18:32:32 +00001851 if (inShift) {
1852 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1853 /* shifting out */
1854 if (base64bits) { /* output remaining bits */
1855 *out++ = TO_BASE64(base64buffer << (6-base64bits));
1856 base64buffer = 0;
1857 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001858 }
1859 inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001860 /* Characters not in the BASE64 set implicitly unshift the sequence
1861 so no '-' is required, except if the character is itself a '-' */
1862 if (IS_BASE64(ch) || ch == '-') {
1863 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001864 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001865 *out++ = (char) ch;
1866 }
1867 else {
1868 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00001869 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001870 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001871 else { /* not in a shift sequence */
1872 if (ch == '+') {
1873 *out++ = '+';
1874 *out++ = '-';
1875 }
1876 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1877 *out++ = (char) ch;
1878 }
1879 else {
1880 *out++ = '+';
1881 inShift = 1;
1882 goto encode_char;
1883 }
1884 }
1885 continue;
1886encode_char:
1887#ifdef Py_UNICODE_WIDE
1888 if (ch >= 0x10000) {
1889 /* code first surrogate */
1890 base64bits += 16;
1891 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
1892 while (base64bits >= 6) {
1893 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1894 base64bits -= 6;
1895 }
1896 /* prepare second surrogate */
1897 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
1898 }
1899#endif
1900 base64bits += 16;
1901 base64buffer = (base64buffer << 16) | ch;
1902 while (base64bits >= 6) {
1903 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1904 base64bits -= 6;
1905 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001906 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001907 if (base64bits)
1908 *out++= TO_BASE64(base64buffer << (6-base64bits) );
1909 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001910 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001911
Benjamin Petersonbea424a2010-04-03 00:57:33 +00001912 if (_PyString_Resize(&v, out - start))
1913 return NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001914 return v;
1915}
1916
Antoine Pitrou653dece2009-05-04 18:32:32 +00001917#undef IS_BASE64
1918#undef FROM_BASE64
1919#undef TO_BASE64
1920#undef DECODE_DIRECT
1921#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001922
Guido van Rossumd57fd912000-03-10 22:53:23 +00001923/* --- UTF-8 Codec -------------------------------------------------------- */
1924
/* Map UTF-8 encoded prefix byte to sequence length.  Zero means
   illegal prefix.  See RFC 3629 for details.
   The table is only ever read, hence const. */
static const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
1946
Guido van Rossumd57fd912000-03-10 22:53:23 +00001947PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001948 Py_ssize_t size,
1949 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001950{
Walter Dörwald69652032004-09-07 20:24:22 +00001951 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1952}
1953
1954PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001955 Py_ssize_t size,
1956 const char *errors,
1957 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001958{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001959 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001960 int n;
Ezio Melottie57e50c2010-06-05 17:51:07 +00001961 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001962 Py_ssize_t startinpos;
1963 Py_ssize_t endinpos;
1964 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001965 const char *e;
1966 PyUnicodeObject *unicode;
1967 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001968 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001969 PyObject *errorHandler = NULL;
1970 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001971
1972 /* Note: size will always be longer than the resulting Unicode
1973 character count */
1974 unicode = _PyUnicode_New(size);
1975 if (!unicode)
1976 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001977 if (size == 0) {
1978 if (consumed)
1979 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001980 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001981 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001982
1983 /* Unpack UTF-8 encoded data */
1984 p = unicode->str;
1985 e = s + size;
1986
1987 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001988 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001989
1990 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001991 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001992 s++;
1993 continue;
1994 }
1995
1996 n = utf8_code_length[ch];
1997
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001998 if (s + n > e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001999 if (consumed)
2000 break;
2001 else {
2002 errmsg = "unexpected end of data";
2003 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00002004 endinpos = startinpos+1;
2005 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2006 endinpos++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002007 goto utf8Error;
2008 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00002009 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002010
2011 switch (n) {
2012
2013 case 0:
Ezio Melottie57e50c2010-06-05 17:51:07 +00002014 errmsg = "invalid start byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002015 startinpos = s-starts;
2016 endinpos = startinpos+1;
2017 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002018
2019 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002020 errmsg = "internal error";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002021 startinpos = s-starts;
2022 endinpos = startinpos+1;
2023 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002024
2025 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002026 if ((s[1] & 0xc0) != 0x80) {
Ezio Melottie57e50c2010-06-05 17:51:07 +00002027 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002028 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00002029 endinpos = startinpos + 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002030 goto utf8Error;
2031 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002032 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melottie57e50c2010-06-05 17:51:07 +00002033 assert ((ch > 0x007F) && (ch <= 0x07FF));
2034 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002035 break;
2036
2037 case 3:
Ezio Melottie57e50c2010-06-05 17:51:07 +00002038 /* XXX: surrogates shouldn't be valid UTF-8!
2039 see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2040 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
2041 Uncomment the 2 lines below to make them invalid,
Serhiy Storchakae8c9e142015-01-18 11:42:50 +02002042 code points: d800-dfff; UTF-8: \xed\xa0\x80-\xed\xbf\xbf. */
Tim Petersced69f82003-09-16 20:30:58 +00002043 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melottie57e50c2010-06-05 17:51:07 +00002044 (s[2] & 0xc0) != 0x80 ||
2045 ((unsigned char)s[0] == 0xE0 &&
2046 (unsigned char)s[1] < 0xA0)/* ||
2047 ((unsigned char)s[0] == 0xED &&
2048 (unsigned char)s[1] > 0x9F)*/) {
2049 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002050 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00002051 endinpos = startinpos + 1;
2052
2053 /* if s[1] first two bits are 1 and 0, then the invalid
2054 continuation byte is s[2], so increment endinpos by 1,
2055 if not, s[1] is invalid and endinpos doesn't need to
2056 be incremented. */
2057 if ((s[1] & 0xC0) == 0x80)
2058 endinpos++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002059 goto utf8Error;
2060 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002061 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melottie57e50c2010-06-05 17:51:07 +00002062 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2063 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002064 break;
2065
2066 case 4:
2067 if ((s[1] & 0xc0) != 0x80 ||
2068 (s[2] & 0xc0) != 0x80 ||
Ezio Melottie57e50c2010-06-05 17:51:07 +00002069 (s[3] & 0xc0) != 0x80 ||
2070 ((unsigned char)s[0] == 0xF0 &&
2071 (unsigned char)s[1] < 0x90) ||
2072 ((unsigned char)s[0] == 0xF4 &&
2073 (unsigned char)s[1] > 0x8F)) {
2074 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002075 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00002076 endinpos = startinpos + 1;
2077 if ((s[1] & 0xC0) == 0x80) {
2078 endinpos++;
2079 if ((s[2] & 0xC0) == 0x80)
2080 endinpos++;
2081 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002082 goto utf8Error;
2083 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002084 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melottie57e50c2010-06-05 17:51:07 +00002085 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2086 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2087
Fredrik Lundh8f455852001-06-27 18:59:43 +00002088#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002089 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002090#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002091 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002092
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002093 /* translate from 10000..10FFFF to 0..FFFF */
2094 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002095
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002096 /* high surrogate = top 10 bits added to D800 */
2097 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002098
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002099 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002100 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002101#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002102 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002103 }
2104 s += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002105 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002106
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002107 utf8Error:
2108 outpos = p-PyUnicode_AS_UNICODE(unicode);
2109 if (unicode_decode_call_errorhandler(
2110 errors, &errorHandler,
2111 "utf8", errmsg,
2112 starts, size, &startinpos, &endinpos, &exc, &s,
2113 &unicode, &outpos, &p))
2114 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002115 }
Walter Dörwald69652032004-09-07 20:24:22 +00002116 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002117 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002118
2119 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002120 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002121 goto onError;
2122
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002123 Py_XDECREF(errorHandler);
2124 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002125 return (PyObject *)unicode;
2126
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002127 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002128 Py_XDECREF(errorHandler);
2129 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002130 Py_DECREF(unicode);
2131 return NULL;
2132}
2133
Tim Peters602f7402002-04-27 18:03:26 +00002134/* Allocation strategy: if the string is short, convert into a stack buffer
2135 and allocate exactly as much space needed at the end. Else allocate the
2136 maximum possible needed (4 result bytes per Unicode character), and return
2137 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002138*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002139PyObject *
2140PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002141 Py_ssize_t size,
2142 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002143{
Tim Peters602f7402002-04-27 18:03:26 +00002144#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002145
Martin v. Löwis18e16552006-02-15 17:27:45 +00002146 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00002147 PyObject *v; /* result string object */
2148 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002149 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002150 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002151 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002152
Tim Peters602f7402002-04-27 18:03:26 +00002153 assert(s != NULL);
2154 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002155
Tim Peters602f7402002-04-27 18:03:26 +00002156 if (size <= MAX_SHORT_UNICHARS) {
2157 /* Write into the stack buffer; nallocated can't overflow.
2158 * At the end, we'll allocate exactly as much heap space as it
2159 * turns out we need.
2160 */
2161 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2162 v = NULL; /* will allocate after we're done */
2163 p = stackbuf;
2164 }
2165 else {
2166 /* Overallocate on the heap, and give the excess back at the end. */
2167 nallocated = size * 4;
2168 if (nallocated / 4 != size) /* overflow! */
2169 return PyErr_NoMemory();
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002170 v = PyString_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00002171 if (v == NULL)
2172 return NULL;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002173 p = PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002174 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002175
Tim Peters602f7402002-04-27 18:03:26 +00002176 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002177 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002178
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002179 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002180 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002181 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002182
Guido van Rossumd57fd912000-03-10 22:53:23 +00002183 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002184 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002185 *p++ = (char)(0xc0 | (ch >> 6));
2186 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002187 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002188 else {
Tim Peters602f7402002-04-27 18:03:26 +00002189 /* Encode UCS2 Unicode ordinals */
2190 if (ch < 0x10000) {
2191 /* Special case: check for high surrogate */
2192 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2193 Py_UCS4 ch2 = s[i];
2194 /* Check for low surrogate and combine the two to
2195 form a UCS4 value */
2196 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002197 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002198 i++;
2199 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002200 }
Tim Peters602f7402002-04-27 18:03:26 +00002201 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002202 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002203 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002204 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2205 *p++ = (char)(0x80 | (ch & 0x3f));
2206 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00002207 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002208 encodeUCS4:
Tim Peters602f7402002-04-27 18:03:26 +00002209 /* Encode UCS4 Unicode ordinals */
2210 *p++ = (char)(0xf0 | (ch >> 18));
2211 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2212 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2213 *p++ = (char)(0x80 | (ch & 0x3f));
2214 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002215 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002216
Tim Peters602f7402002-04-27 18:03:26 +00002217 if (v == NULL) {
2218 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002219 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002220 assert(nneeded <= nallocated);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002221 v = PyString_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002222 }
2223 else {
Benjamin Peterson857ce152009-01-31 16:29:18 +00002224 /* Cut back to size actually needed. */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002225 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002226 assert(nneeded <= nallocated);
Benjamin Petersonbea424a2010-04-03 00:57:33 +00002227 if (_PyString_Resize(&v, nneeded))
2228 return NULL;
Tim Peters602f7402002-04-27 18:03:26 +00002229 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002230 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002231
Tim Peters602f7402002-04-27 18:03:26 +00002232#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002233}
2234
Guido van Rossumd57fd912000-03-10 22:53:23 +00002235PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2236{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002237 if (!PyUnicode_Check(unicode)) {
2238 PyErr_BadArgument();
2239 return NULL;
2240 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002241 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002242 PyUnicode_GET_SIZE(unicode),
2243 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002244}
2245
Walter Dörwald6e390802007-08-17 16:41:28 +00002246/* --- UTF-32 Codec ------------------------------------------------------- */
2247
2248PyObject *
2249PyUnicode_DecodeUTF32(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002250 Py_ssize_t size,
2251 const char *errors,
2252 int *byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002253{
2254 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2255}
2256
2257PyObject *
2258PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002259 Py_ssize_t size,
2260 const char *errors,
2261 int *byteorder,
2262 Py_ssize_t *consumed)
Walter Dörwald6e390802007-08-17 16:41:28 +00002263{
2264 const char *starts = s;
2265 Py_ssize_t startinpos;
2266 Py_ssize_t endinpos;
2267 Py_ssize_t outpos;
2268 PyUnicodeObject *unicode;
2269 Py_UNICODE *p;
2270#ifndef Py_UNICODE_WIDE
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002271 int pairs = 0;
Benjamin Peterson8e5effa2010-06-12 17:47:06 +00002272 const unsigned char *qq;
Walter Dörwald6e390802007-08-17 16:41:28 +00002273#else
2274 const int pairs = 0;
2275#endif
Benjamin Peterson8e5effa2010-06-12 17:47:06 +00002276 const unsigned char *q, *e;
Walter Dörwald6e390802007-08-17 16:41:28 +00002277 int bo = 0; /* assume native ordering by default */
2278 const char *errmsg = "";
Walter Dörwald20b40d32007-08-17 16:52:50 +00002279 /* Offsets from q for retrieving bytes in the right order. */
2280#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2281 int iorder[] = {0, 1, 2, 3};
2282#else
2283 int iorder[] = {3, 2, 1, 0};
2284#endif
Walter Dörwald9ab80a92007-08-17 16:58:43 +00002285 PyObject *errorHandler = NULL;
2286 PyObject *exc = NULL;
Chris Jerdonekad4b0002012-10-07 20:37:54 -07002287
Walter Dörwald6e390802007-08-17 16:41:28 +00002288 q = (unsigned char *)s;
2289 e = q + size;
2290
2291 if (byteorder)
2292 bo = *byteorder;
2293
2294 /* Check for BOM marks (U+FEFF) in the input and adjust current
2295 byte order setting accordingly. In native mode, the leading BOM
2296 mark is skipped, in all other modes, it is copied to the output
2297 stream as-is (giving a ZWNBSP character). */
2298 if (bo == 0) {
2299 if (size >= 4) {
2300 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002301 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002302#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002303 if (bom == 0x0000FEFF) {
2304 q += 4;
2305 bo = -1;
2306 }
2307 else if (bom == 0xFFFE0000) {
2308 q += 4;
2309 bo = 1;
2310 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002311#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002312 if (bom == 0x0000FEFF) {
2313 q += 4;
2314 bo = 1;
2315 }
2316 else if (bom == 0xFFFE0000) {
2317 q += 4;
2318 bo = -1;
2319 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002320#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002321 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002322 }
2323
2324 if (bo == -1) {
2325 /* force LE */
2326 iorder[0] = 0;
2327 iorder[1] = 1;
2328 iorder[2] = 2;
2329 iorder[3] = 3;
2330 }
2331 else if (bo == 1) {
2332 /* force BE */
2333 iorder[0] = 3;
2334 iorder[1] = 2;
2335 iorder[2] = 1;
2336 iorder[3] = 0;
2337 }
2338
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002339 /* On narrow builds we split characters outside the BMP into two
Serhiy Storchakae8c9e142015-01-18 11:42:50 +02002340 code points => count how much extra space we need. */
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002341#ifndef Py_UNICODE_WIDE
Serhiy Storchakac9631a12013-01-08 22:43:18 +02002342 for (qq = q; e - qq >= 4; qq += 4)
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002343 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2344 pairs++;
2345#endif
2346
2347 /* This might be one to much, because of a BOM */
2348 unicode = _PyUnicode_New((size+3)/4+pairs);
2349 if (!unicode)
2350 return NULL;
2351 if (size == 0)
2352 return (PyObject *)unicode;
2353
2354 /* Unpack UTF-32 encoded data */
2355 p = unicode->str;
2356
Walter Dörwald6e390802007-08-17 16:41:28 +00002357 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002358 Py_UCS4 ch;
2359 /* remaining bytes at the end? (size should be divisible by 4) */
2360 if (e-q<4) {
2361 if (consumed)
2362 break;
2363 errmsg = "truncated data";
2364 startinpos = ((const char *)q)-starts;
2365 endinpos = ((const char *)e)-starts;
2366 goto utf32Error;
2367 /* The remaining input chars are ignored if the callback
2368 chooses to skip the input */
2369 }
2370 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2371 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002372
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002373 if (ch >= 0x110000)
2374 {
Serhiy Storchakae8c9e142015-01-18 11:42:50 +02002375 errmsg = "code point not in range(0x110000)";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002376 startinpos = ((const char *)q)-starts;
2377 endinpos = startinpos+4;
2378 goto utf32Error;
2379 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002380#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002381 if (ch >= 0x10000)
2382 {
2383 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2384 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2385 }
2386 else
Walter Dörwald6e390802007-08-17 16:41:28 +00002387#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002388 *p++ = ch;
2389 q += 4;
2390 continue;
2391 utf32Error:
2392 outpos = p-PyUnicode_AS_UNICODE(unicode);
2393 if (unicode_decode_call_errorhandler(
2394 errors, &errorHandler,
2395 "utf32", errmsg,
Georg Brandle9741f32009-09-17 11:28:09 +00002396 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002397 &unicode, &outpos, &p))
2398 goto onError;
Walter Dörwald6e390802007-08-17 16:41:28 +00002399 }
2400
2401 if (byteorder)
2402 *byteorder = bo;
2403
2404 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002405 *consumed = (const char *)q-starts;
Walter Dörwald6e390802007-08-17 16:41:28 +00002406
2407 /* Adjust length */
2408 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2409 goto onError;
2410
2411 Py_XDECREF(errorHandler);
2412 Py_XDECREF(exc);
2413 return (PyObject *)unicode;
2414
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002415 onError:
Walter Dörwald6e390802007-08-17 16:41:28 +00002416 Py_DECREF(unicode);
2417 Py_XDECREF(errorHandler);
2418 Py_XDECREF(exc);
2419 return NULL;
2420}
2421
2422PyObject *
2423PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002424 Py_ssize_t size,
2425 const char *errors,
2426 int byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002427{
2428 PyObject *v;
2429 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002430 Py_ssize_t nsize, bytesize;
Walter Dörwald6e390802007-08-17 16:41:28 +00002431#ifndef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002432 Py_ssize_t i, pairs;
Walter Dörwald6e390802007-08-17 16:41:28 +00002433#else
2434 const int pairs = 0;
2435#endif
2436 /* Offsets from p for storing byte pairs in the right order. */
2437#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2438 int iorder[] = {0, 1, 2, 3};
2439#else
2440 int iorder[] = {3, 2, 1, 0};
2441#endif
2442
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002443#define STORECHAR(CH) \
2444 do { \
2445 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2446 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2447 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2448 p[iorder[0]] = (CH) & 0xff; \
2449 p += 4; \
Walter Dörwald6e390802007-08-17 16:41:28 +00002450 } while(0)
2451
Serhiy Storchakae8c9e142015-01-18 11:42:50 +02002452 /* In narrow builds we can output surrogate pairs as one code point,
Walter Dörwald6e390802007-08-17 16:41:28 +00002453 so we need less space. */
2454#ifndef Py_UNICODE_WIDE
2455 for (i = pairs = 0; i < size-1; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002456 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2457 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2458 pairs++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002459#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002460 nsize = (size - pairs + (byteorder == 0));
2461 bytesize = nsize * 4;
2462 if (bytesize / 4 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002463 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002464 v = PyString_FromStringAndSize(NULL, bytesize);
Walter Dörwald6e390802007-08-17 16:41:28 +00002465 if (v == NULL)
2466 return NULL;
2467
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002468 p = (unsigned char *)PyString_AS_STRING(v);
Walter Dörwald6e390802007-08-17 16:41:28 +00002469 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002470 STORECHAR(0xFEFF);
Walter Dörwald6e390802007-08-17 16:41:28 +00002471 if (size == 0)
2472 return v;
2473
2474 if (byteorder == -1) {
2475 /* force LE */
2476 iorder[0] = 0;
2477 iorder[1] = 1;
2478 iorder[2] = 2;
2479 iorder[3] = 3;
2480 }
2481 else if (byteorder == 1) {
2482 /* force BE */
2483 iorder[0] = 3;
2484 iorder[1] = 2;
2485 iorder[2] = 1;
2486 iorder[3] = 0;
2487 }
2488
2489 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002490 Py_UCS4 ch = *s++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002491#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002492 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2493 Py_UCS4 ch2 = *s;
2494 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2495 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2496 s++;
2497 size--;
2498 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00002499 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002500#endif
2501 STORECHAR(ch);
2502 }
2503 return v;
2504#undef STORECHAR
2505}
2506
2507PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2508{
2509 if (!PyUnicode_Check(unicode)) {
2510 PyErr_BadArgument();
2511 return NULL;
2512 }
2513 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002514 PyUnicode_GET_SIZE(unicode),
2515 NULL,
2516 0);
Walter Dörwald6e390802007-08-17 16:41:28 +00002517}
2518
Guido van Rossumd57fd912000-03-10 22:53:23 +00002519/* --- UTF-16 Codec ------------------------------------------------------- */
2520
Tim Peters772747b2001-08-09 22:21:55 +00002521PyObject *
2522PyUnicode_DecodeUTF16(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002523 Py_ssize_t size,
2524 const char *errors,
2525 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002526{
Walter Dörwald69652032004-09-07 20:24:22 +00002527 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2528}
2529
2530PyObject *
2531PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002532 Py_ssize_t size,
2533 const char *errors,
2534 int *byteorder,
2535 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002536{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002537 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002538 Py_ssize_t startinpos;
2539 Py_ssize_t endinpos;
2540 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002541 PyUnicodeObject *unicode;
2542 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002543 const unsigned char *q, *e;
2544 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002545 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002546 /* Offsets from q for retrieving byte pairs in the right order. */
2547#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2548 int ihi = 1, ilo = 0;
2549#else
2550 int ihi = 0, ilo = 1;
2551#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002552 PyObject *errorHandler = NULL;
2553 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002554
2555 /* Note: size will always be longer than the resulting Unicode
2556 character count */
2557 unicode = _PyUnicode_New(size);
2558 if (!unicode)
2559 return NULL;
2560 if (size == 0)
2561 return (PyObject *)unicode;
2562
2563 /* Unpack UTF-16 encoded data */
2564 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002565 q = (unsigned char *)s;
2566 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002567
2568 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002569 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002570
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002571 /* Check for BOM marks (U+FEFF) in the input and adjust current
2572 byte order setting accordingly. In native mode, the leading BOM
2573 mark is skipped, in all other modes, it is copied to the output
2574 stream as-is (giving a ZWNBSP character). */
2575 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002576 if (size >= 2) {
2577 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002578#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002579 if (bom == 0xFEFF) {
2580 q += 2;
2581 bo = -1;
2582 }
2583 else if (bom == 0xFFFE) {
2584 q += 2;
2585 bo = 1;
2586 }
Tim Petersced69f82003-09-16 20:30:58 +00002587#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002588 if (bom == 0xFEFF) {
2589 q += 2;
2590 bo = 1;
2591 }
2592 else if (bom == 0xFFFE) {
2593 q += 2;
2594 bo = -1;
2595 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002596#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002597 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002598 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002599
Tim Peters772747b2001-08-09 22:21:55 +00002600 if (bo == -1) {
2601 /* force LE */
2602 ihi = 1;
2603 ilo = 0;
2604 }
2605 else if (bo == 1) {
2606 /* force BE */
2607 ihi = 0;
2608 ilo = 1;
2609 }
2610
2611 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002612 Py_UNICODE ch;
2613 /* remaining bytes at the end? (size should be even) */
2614 if (e-q<2) {
2615 if (consumed)
2616 break;
2617 errmsg = "truncated data";
2618 startinpos = ((const char *)q)-starts;
2619 endinpos = ((const char *)e)-starts;
2620 goto utf16Error;
2621 /* The remaining input chars are ignored if the callback
2622 chooses to skip the input */
2623 }
2624 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002625
Benjamin Peterson857ce152009-01-31 16:29:18 +00002626 q += 2;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002627
2628 if (ch < 0xD800 || ch > 0xDFFF) {
2629 *p++ = ch;
2630 continue;
2631 }
2632
2633 /* UTF-16 code pair: */
Antoine Pitrou715a63b2012-07-21 00:52:06 +02002634 if (e - q < 2) {
Serhiy Storchakac4b82c02013-01-08 23:12:00 +02002635 q -= 2;
2636 if (consumed)
2637 break;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002638 errmsg = "unexpected end of data";
Serhiy Storchakac4b82c02013-01-08 23:12:00 +02002639 startinpos = ((const char *)q)-starts;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002640 endinpos = ((const char *)e)-starts;
2641 goto utf16Error;
2642 }
2643 if (0xD800 <= ch && ch <= 0xDBFF) {
2644 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2645 q += 2;
2646 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002647#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002648 *p++ = ch;
2649 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002650#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002651 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002652#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002653 continue;
2654 }
2655 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002656 errmsg = "illegal UTF-16 surrogate";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002657 startinpos = (((const char *)q)-4)-starts;
2658 endinpos = startinpos+2;
2659 goto utf16Error;
2660 }
2661
Benjamin Peterson857ce152009-01-31 16:29:18 +00002662 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002663 errmsg = "illegal encoding";
2664 startinpos = (((const char *)q)-2)-starts;
2665 endinpos = startinpos+2;
2666 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002667
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002668 utf16Error:
2669 outpos = p-PyUnicode_AS_UNICODE(unicode);
2670 if (unicode_decode_call_errorhandler(
2671 errors, &errorHandler,
2672 "utf16", errmsg,
2673 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2674 &unicode, &outpos, &p))
2675 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002676 }
2677
2678 if (byteorder)
2679 *byteorder = bo;
2680
Walter Dörwald69652032004-09-07 20:24:22 +00002681 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002682 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00002683
Guido van Rossumd57fd912000-03-10 22:53:23 +00002684 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002685 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002686 goto onError;
2687
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002688 Py_XDECREF(errorHandler);
2689 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002690 return (PyObject *)unicode;
2691
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002692 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002693 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002694 Py_XDECREF(errorHandler);
2695 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002696 return NULL;
2697}
2698
Tim Peters772747b2001-08-09 22:21:55 +00002699PyObject *
2700PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002701 Py_ssize_t size,
2702 const char *errors,
2703 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002704{
2705 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002706 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002707 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002708#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002709 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002710#else
2711 const int pairs = 0;
2712#endif
Tim Peters772747b2001-08-09 22:21:55 +00002713 /* Offsets from p for storing byte pairs in the right order. */
2714#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2715 int ihi = 1, ilo = 0;
2716#else
2717 int ihi = 0, ilo = 1;
2718#endif
2719
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002720#define STORECHAR(CH) \
2721 do { \
2722 p[ihi] = ((CH) >> 8) & 0xff; \
2723 p[ilo] = (CH) & 0xff; \
2724 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00002725 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002726
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002727#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002728 for (i = pairs = 0; i < size; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002729 if (s[i] >= 0x10000)
2730 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002731#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002732 /* 2 * (size + pairs + (byteorder == 0)) */
2733 if (size > PY_SSIZE_T_MAX ||
2734 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002735 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002736 nsize = size + pairs + (byteorder == 0);
2737 bytesize = nsize * 2;
2738 if (bytesize / 2 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002739 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002740 v = PyString_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002741 if (v == NULL)
2742 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002743
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002744 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002745 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002746 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002747 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002748 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002749
2750 if (byteorder == -1) {
2751 /* force LE */
2752 ihi = 1;
2753 ilo = 0;
2754 }
2755 else if (byteorder == 1) {
2756 /* force BE */
2757 ihi = 0;
2758 ilo = 1;
2759 }
2760
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002761 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002762 Py_UNICODE ch = *s++;
2763 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002764#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002765 if (ch >= 0x10000) {
2766 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2767 ch = 0xD800 | ((ch-0x10000) >> 10);
2768 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002769#endif
Tim Peters772747b2001-08-09 22:21:55 +00002770 STORECHAR(ch);
2771 if (ch2)
2772 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002773 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002774 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002775#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002776}
2777
2778PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2779{
2780 if (!PyUnicode_Check(unicode)) {
2781 PyErr_BadArgument();
2782 return NULL;
2783 }
2784 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002785 PyUnicode_GET_SIZE(unicode),
2786 NULL,
2787 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002788}
2789
2790/* --- Unicode Escape Codec ----------------------------------------------- */
2791
/* Name-lookup C API used for \N{name} escapes.  Starts out NULL and is
   filled in by PyUnicode_DecodeUnicodeEscape() the first time such an
   escape is seen, by importing the capsule PyUnicodeData_CAPSULE_NAME. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00002792static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002793
Guido van Rossumd57fd912000-03-10 22:53:23 +00002794PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002795 Py_ssize_t size,
2796 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002797{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002798 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002799 Py_ssize_t startinpos;
2800 Py_ssize_t endinpos;
2801 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002802 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002803 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002804 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002805 char* message;
2806 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002807 PyObject *errorHandler = NULL;
2808 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002809
Guido van Rossumd57fd912000-03-10 22:53:23 +00002810 /* Escaped strings will always be longer than the resulting
2811 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002812 length after conversion to the true value.
2813 (but if the error callback returns a long replacement string
2814 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002815 v = _PyUnicode_New(size);
2816 if (v == NULL)
2817 goto onError;
2818 if (size == 0)
2819 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002820
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002821 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002822 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002823
Guido van Rossumd57fd912000-03-10 22:53:23 +00002824 while (s < end) {
2825 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002826 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002827 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002828
2829 /* Non-escape characters are interpreted as Unicode ordinals */
2830 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002831 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002832 continue;
2833 }
2834
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002835 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002836 /* \ - Escapes */
2837 s++;
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002838 c = *s++;
2839 if (s > end)
2840 c = '\0'; /* Invalid after \ */
2841 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002842
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002843 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002844 case '\n': break;
2845 case '\\': *p++ = '\\'; break;
2846 case '\'': *p++ = '\''; break;
2847 case '\"': *p++ = '\"'; break;
2848 case 'b': *p++ = '\b'; break;
2849 case 'f': *p++ = '\014'; break; /* FF */
2850 case 't': *p++ = '\t'; break;
2851 case 'n': *p++ = '\n'; break;
2852 case 'r': *p++ = '\r'; break;
2853 case 'v': *p++ = '\013'; break; /* VT */
2854 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2855
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002856 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002857 case '0': case '1': case '2': case '3':
2858 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002859 x = s[-1] - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002860 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002861 x = (x<<3) + *s++ - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002862 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002863 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002864 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002865 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002866 break;
2867
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002868 /* hex escapes */
2869 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002870 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002871 digits = 2;
2872 message = "truncated \\xXX escape";
2873 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002874
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002875 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002876 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002877 digits = 4;
2878 message = "truncated \\uXXXX escape";
2879 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002880
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002881 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002882 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002883 digits = 8;
2884 message = "truncated \\UXXXXXXXX escape";
2885 hexescape:
2886 chr = 0;
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002887 if (end - s < digits) {
2888 /* count only hex digits */
2889 for (; s < end; ++s) {
2890 c = (unsigned char)*s;
2891 if (!Py_ISXDIGIT(c))
2892 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002893 }
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002894 goto error;
2895 }
2896 for (; digits--; ++s) {
2897 c = (unsigned char)*s;
2898 if (!Py_ISXDIGIT(c))
2899 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002900 chr = (chr<<4) & ~0xF;
2901 if (c >= '0' && c <= '9')
2902 chr += c - '0';
2903 else if (c >= 'a' && c <= 'f')
2904 chr += 10 + c - 'a';
2905 else
2906 chr += 10 + c - 'A';
2907 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002908 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002909 /* _decoding_error will have already written into the
2910 target buffer. */
2911 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002912 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002913 /* when we get here, chr is a 32-bit unicode character */
2914 if (chr <= 0xffff)
2915 /* UCS-2 character */
2916 *p++ = (Py_UNICODE) chr;
2917 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002918 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002919 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002920#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002921 *p++ = chr;
2922#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002923 chr -= 0x10000L;
2924 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002925 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002926#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002927 } else {
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002928 message = "illegal Unicode character";
2929 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002930 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002931 break;
2932
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002933 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00002934 case 'N':
2935 message = "malformed \\N character escape";
2936 if (ucnhash_CAPI == NULL) {
2937 /* load the unicode data module */
Larry Hastings402b73f2010-03-25 00:54:54 +00002938 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002939 if (ucnhash_CAPI == NULL)
2940 goto ucnhashError;
2941 }
2942 if (*s == '{') {
2943 const char *start = s+1;
2944 /* look for the closing brace */
2945 while (*s != '}' && s < end)
2946 s++;
2947 if (s > start && s < end && *s == '}') {
2948 /* found a name. look it up in the unicode database */
2949 message = "unknown Unicode character name";
2950 s++;
Serhiy Storchaka1d3acd42013-01-21 11:48:24 +02002951 if (s - start - 1 <= INT_MAX &&
2952 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002953 goto store;
2954 }
2955 }
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002956 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002957
2958 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002959 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002960 message = "\\ at end of string";
2961 s--;
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002962 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00002963 }
2964 else {
2965 *p++ = '\\';
2966 *p++ = (unsigned char)s[-1];
2967 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002968 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002969 }
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002970 continue;
2971
2972 error:
2973 endinpos = s-starts;
2974 outpos = p-PyUnicode_AS_UNICODE(v);
2975 if (unicode_decode_call_errorhandler(
2976 errors, &errorHandler,
2977 "unicodeescape", message,
2978 starts, size, &startinpos, &endinpos, &exc, &s,
2979 &v, &outpos, &p))
2980 goto onError;
2981 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002982 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002983 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002984 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002985 Py_XDECREF(errorHandler);
2986 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002987 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002988
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002989 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002990 PyErr_SetString(
2991 PyExc_UnicodeError,
2992 "\\N escapes not supported (can't load unicodedata module)"
2993 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002994 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002995 Py_XDECREF(errorHandler);
2996 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002997 return NULL;
2998
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002999 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003000 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003001 Py_XDECREF(errorHandler);
3002 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003003 return NULL;
3004}
3005
3006/* Return a Unicode-Escape string version of the Unicode object.
3007
3008 If quotes is true, the string is enclosed in u"" or u'' quotes as
3009 appropriate.
3010
3011*/
3012
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00003013Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003014 Py_ssize_t size,
3015 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00003016{
3017 /* like wcschr, but doesn't stop at NULL characters */
3018
3019 while (size-- > 0) {
3020 if (*s == ch)
3021 return s;
3022 s++;
3023 }
3024
3025 return NULL;
3026}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003027
Guido van Rossumd57fd912000-03-10 22:53:23 +00003028static
3029PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003030 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003031 int quotes)
3032{
3033 PyObject *repr;
3034 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003035
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003036 static const char *hexdigit = "0123456789abcdef";
Neal Norwitze7d8be82008-07-31 17:17:14 +00003037#ifdef Py_UNICODE_WIDE
3038 const Py_ssize_t expandsize = 10;
3039#else
3040 const Py_ssize_t expandsize = 6;
3041#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003042
Neal Norwitz17753ec2006-08-21 22:21:19 +00003043 /* XXX(nnorwitz): rather than over-allocating, it would be
3044 better to choose a different scheme. Perhaps scan the
3045 first N-chars of the string and allocate based on that size.
3046 */
3047 /* Initial allocation is based on the longest-possible unichr
3048 escape.
3049
3050 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3051 unichr, so in this case it's the longest unichr escape. In
3052 narrow (UTF-16) builds this is five chars per source unichr
3053 since there are two unichrs in the surrogate pair, so in narrow
3054 (UTF-16) builds it's not the longest unichr escape.
3055
3056 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3057 so in the narrow (UTF-16) build case it's the longest unichr
3058 escape.
3059 */
3060
Neal Norwitze7d8be82008-07-31 17:17:14 +00003061 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003062 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00003063
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003064 repr = PyString_FromStringAndSize(NULL,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003065 2
3066 + expandsize*size
3067 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003068 if (repr == NULL)
3069 return NULL;
3070
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003071 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003072
3073 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003074 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00003075 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00003076 !findchar(s, size, '"')) ? '"' : '\'';
3077 }
3078 while (size-- > 0) {
3079 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003080
Hye-Shik Chang835b2432005-12-17 04:38:31 +00003081 /* Escape quotes and backslashes */
3082 if ((quotes &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003083 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003084 *p++ = '\\';
3085 *p++ = (char) ch;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003086 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003087 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003088
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003089#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003090 /* Map 21-bit characters to '\U00xxxxxx' */
3091 else if (ch >= 0x10000) {
3092 *p++ = '\\';
3093 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003094 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
3095 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
3096 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
3097 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
3098 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
3099 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
3100 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003101 *p++ = hexdigit[ch & 0x0000000F];
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003102 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003103 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003104#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003105 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3106 else if (ch >= 0xD800 && ch < 0xDC00) {
3107 Py_UNICODE ch2;
3108 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003109
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003110 ch2 = *s++;
3111 size--;
Georg Brandle27d0442010-08-01 20:54:30 +00003112 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003113 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3114 *p++ = '\\';
3115 *p++ = 'U';
3116 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
3117 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
3118 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
3119 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
3120 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
3121 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
3122 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
3123 *p++ = hexdigit[ucs & 0x0000000F];
3124 continue;
3125 }
3126 /* Fall through: isolated surrogates are copied as-is */
3127 s--;
3128 size++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003129 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003130#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003131
Guido van Rossumd57fd912000-03-10 22:53:23 +00003132 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003133 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003134 *p++ = '\\';
3135 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003136 *p++ = hexdigit[(ch >> 12) & 0x000F];
3137 *p++ = hexdigit[(ch >> 8) & 0x000F];
3138 *p++ = hexdigit[(ch >> 4) & 0x000F];
3139 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003140 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003141
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003142 /* Map special whitespace to '\t', \n', '\r' */
3143 else if (ch == '\t') {
3144 *p++ = '\\';
3145 *p++ = 't';
3146 }
3147 else if (ch == '\n') {
3148 *p++ = '\\';
3149 *p++ = 'n';
3150 }
3151 else if (ch == '\r') {
3152 *p++ = '\\';
3153 *p++ = 'r';
3154 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003155
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003156 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003157 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003158 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003159 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003160 *p++ = hexdigit[(ch >> 4) & 0x000F];
3161 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003162 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003163
Guido van Rossumd57fd912000-03-10 22:53:23 +00003164 /* Copy everything else as-is */
3165 else
3166 *p++ = (char) ch;
3167 }
3168 if (quotes)
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003169 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003170
3171 *p = '\0';
Benjamin Petersonbea424a2010-04-03 00:57:33 +00003172 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
3173 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003174 return repr;
3175}
3176
3177PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003178 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003179{
3180 return unicodeescape_string(s, size, 0);
3181}
3182
3183PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3184{
3185 if (!PyUnicode_Check(unicode)) {
3186 PyErr_BadArgument();
3187 return NULL;
3188 }
3189 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003190 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003191}
3192
3193/* --- Raw Unicode Escape Codec ------------------------------------------- */
3194
3195PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003196 Py_ssize_t size,
3197 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003198{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003199 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003200 Py_ssize_t startinpos;
3201 Py_ssize_t endinpos;
3202 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003203 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003204 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003205 const char *end;
3206 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003207 PyObject *errorHandler = NULL;
3208 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003209
Guido van Rossumd57fd912000-03-10 22:53:23 +00003210 /* Escaped strings will always be longer than the resulting
3211 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003212 length after conversion to the true value. (But decoding error
3213 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003214 v = _PyUnicode_New(size);
3215 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003216 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003217 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003218 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003219 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003220 end = s + size;
3221 while (s < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003222 unsigned char c;
3223 Py_UCS4 x;
3224 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003225 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003226
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003227 /* Non-escape characters are interpreted as Unicode ordinals */
3228 if (*s != '\\') {
3229 *p++ = (unsigned char)*s++;
3230 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003231 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003232 startinpos = s-starts;
3233
3234 /* \u-escapes are only interpreted iff the number of leading
3235 backslashes if odd */
3236 bs = s;
3237 for (;s < end;) {
3238 if (*s != '\\')
3239 break;
3240 *p++ = (unsigned char)*s++;
3241 }
3242 if (((s - bs) & 1) == 0 ||
3243 s >= end ||
3244 (*s != 'u' && *s != 'U')) {
3245 continue;
3246 }
3247 p--;
3248 count = *s=='u' ? 4 : 8;
3249 s++;
3250
3251 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3252 outpos = p-PyUnicode_AS_UNICODE(v);
3253 for (x = 0, i = 0; i < count; ++i, ++s) {
3254 c = (unsigned char)*s;
3255 if (!isxdigit(c)) {
3256 endinpos = s-starts;
3257 if (unicode_decode_call_errorhandler(
3258 errors, &errorHandler,
3259 "rawunicodeescape", "truncated \\uXXXX",
3260 starts, size, &startinpos, &endinpos, &exc, &s,
3261 &v, &outpos, &p))
3262 goto onError;
3263 goto nextByte;
3264 }
3265 x = (x<<4) & ~0xF;
3266 if (c >= '0' && c <= '9')
3267 x += c - '0';
3268 else if (c >= 'a' && c <= 'f')
3269 x += 10 + c - 'a';
3270 else
3271 x += 10 + c - 'A';
3272 }
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003273 if (x <= 0xffff)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003274 /* UCS-2 character */
3275 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003276 else if (x <= 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003277 /* UCS-4 character. Either store directly, or as
3278 surrogate pair. */
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003279#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003280 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003281#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003282 x -= 0x10000L;
3283 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3284 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003285#endif
3286 } else {
3287 endinpos = s-starts;
3288 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003289 if (unicode_decode_call_errorhandler(
3290 errors, &errorHandler,
3291 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003292 starts, size, &startinpos, &endinpos, &exc, &s,
3293 &v, &outpos, &p))
3294 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003295 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003296 nextByte:
3297 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003298 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00003299 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003300 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003301 Py_XDECREF(errorHandler);
3302 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003303 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003304
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003305 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003306 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003307 Py_XDECREF(errorHandler);
3308 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003309 return NULL;
3310}
3311
3312PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003313 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003314{
3315 PyObject *repr;
3316 char *p;
3317 char *q;
3318
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003319 static const char *hexdigit = "0123456789abcdef";
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003320#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00003321 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003322#else
Neal Norwitze7d8be82008-07-31 17:17:14 +00003323 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003324#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00003325
Neal Norwitze7d8be82008-07-31 17:17:14 +00003326 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003327 return PyErr_NoMemory();
Benjamin Peterson857ce152009-01-31 16:29:18 +00003328
Neal Norwitze7d8be82008-07-31 17:17:14 +00003329 repr = PyString_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003330 if (repr == NULL)
3331 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003332 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003333 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003334
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003335 p = q = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003336 while (size-- > 0) {
3337 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003338#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003339 /* Map 32-bit characters to '\Uxxxxxxxx' */
3340 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003341 *p++ = '\\';
3342 *p++ = 'U';
3343 *p++ = hexdigit[(ch >> 28) & 0xf];
3344 *p++ = hexdigit[(ch >> 24) & 0xf];
3345 *p++ = hexdigit[(ch >> 20) & 0xf];
3346 *p++ = hexdigit[(ch >> 16) & 0xf];
3347 *p++ = hexdigit[(ch >> 12) & 0xf];
3348 *p++ = hexdigit[(ch >> 8) & 0xf];
3349 *p++ = hexdigit[(ch >> 4) & 0xf];
3350 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003351 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003352 else
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003353#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003354 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3355 if (ch >= 0xD800 && ch < 0xDC00) {
3356 Py_UNICODE ch2;
3357 Py_UCS4 ucs;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003358
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003359 ch2 = *s++;
3360 size--;
Georg Brandle27d0442010-08-01 20:54:30 +00003361 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003362 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3363 *p++ = '\\';
3364 *p++ = 'U';
3365 *p++ = hexdigit[(ucs >> 28) & 0xf];
3366 *p++ = hexdigit[(ucs >> 24) & 0xf];
3367 *p++ = hexdigit[(ucs >> 20) & 0xf];
3368 *p++ = hexdigit[(ucs >> 16) & 0xf];
3369 *p++ = hexdigit[(ucs >> 12) & 0xf];
3370 *p++ = hexdigit[(ucs >> 8) & 0xf];
3371 *p++ = hexdigit[(ucs >> 4) & 0xf];
3372 *p++ = hexdigit[ucs & 0xf];
3373 continue;
3374 }
3375 /* Fall through: isolated surrogates are copied as-is */
3376 s--;
3377 size++;
3378 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003379#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003380 /* Map 16-bit characters to '\uxxxx' */
3381 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003382 *p++ = '\\';
3383 *p++ = 'u';
3384 *p++ = hexdigit[(ch >> 12) & 0xf];
3385 *p++ = hexdigit[(ch >> 8) & 0xf];
3386 *p++ = hexdigit[(ch >> 4) & 0xf];
3387 *p++ = hexdigit[ch & 15];
3388 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003389 /* Copy everything else as-is */
3390 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003391 *p++ = (char) ch;
3392 }
3393 *p = '\0';
Benjamin Petersonbea424a2010-04-03 00:57:33 +00003394 if (_PyString_Resize(&repr, p - q))
3395 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003396 return repr;
3397}
3398
3399PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3400{
3401 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003402 PyErr_BadArgument();
3403 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003404 }
3405 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003406 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003407}
3408
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003409/* --- Unicode Internal Codec ------------------------------------------- */
3410
3411PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003412 Py_ssize_t size,
3413 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003414{
3415 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003416 Py_ssize_t startinpos;
3417 Py_ssize_t endinpos;
3418 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003419 PyUnicodeObject *v;
3420 Py_UNICODE *p;
3421 const char *end;
3422 const char *reason;
3423 PyObject *errorHandler = NULL;
3424 PyObject *exc = NULL;
3425
Neal Norwitzd43069c2006-01-08 01:12:10 +00003426#ifdef Py_UNICODE_WIDE
3427 Py_UNICODE unimax = PyUnicode_GetMax();
3428#endif
3429
Armin Rigo7ccbca92006-10-04 12:17:45 +00003430 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003431 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3432 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003433 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003434 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003435 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003436 p = PyUnicode_AS_UNICODE(v);
3437 end = s + size;
3438
3439 while (s < end) {
Serhiy Storchakad5327d92013-02-07 16:23:11 +02003440 if (end-s < Py_UNICODE_SIZE) {
3441 endinpos = end-starts;
3442 reason = "truncated input";
3443 goto error;
3444 }
Neal Norwitz1004a532006-05-15 07:17:23 +00003445 memcpy(p, s, sizeof(Py_UNICODE));
Serhiy Storchakad5327d92013-02-07 16:23:11 +02003446#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003447 /* We have to sanity check the raw data, otherwise doom looms for
3448 some malformed UCS-4 data. */
Serhiy Storchakad5327d92013-02-07 16:23:11 +02003449 if (*p > unimax || *p < 0) {
3450 endinpos = s - starts + Py_UNICODE_SIZE;
3451 reason = "illegal code point (> 0x10FFFF)";
3452 goto error;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003453 }
Serhiy Storchakad5327d92013-02-07 16:23:11 +02003454#endif
3455 p++;
3456 s += Py_UNICODE_SIZE;
3457 continue;
3458
3459 error:
3460 startinpos = s - starts;
3461 outpos = p - PyUnicode_AS_UNICODE(v);
3462 if (unicode_decode_call_errorhandler(
3463 errors, &errorHandler,
3464 "unicode_internal", reason,
3465 starts, size, &startinpos, &endinpos, &exc, &s,
3466 &v, &outpos, &p)) {
3467 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003468 }
3469 }
3470
Martin v. Löwis412fb672006-04-13 06:34:32 +00003471 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003472 goto onError;
3473 Py_XDECREF(errorHandler);
3474 Py_XDECREF(exc);
3475 return (PyObject *)v;
3476
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003477 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003478 Py_XDECREF(v);
3479 Py_XDECREF(errorHandler);
3480 Py_XDECREF(exc);
3481 return NULL;
3482}
3483
Guido van Rossumd57fd912000-03-10 22:53:23 +00003484/* --- Latin-1 Codec ------------------------------------------------------ */
3485
3486PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003487 Py_ssize_t size,
3488 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003489{
3490 PyUnicodeObject *v;
3491 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003492
Guido van Rossumd57fd912000-03-10 22:53:23 +00003493 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003494 if (size == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003495 Py_UNICODE r = *(unsigned char*)s;
3496 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003497 }
3498
Guido van Rossumd57fd912000-03-10 22:53:23 +00003499 v = _PyUnicode_New(size);
3500 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003501 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003502 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003503 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003504 p = PyUnicode_AS_UNICODE(v);
3505 while (size-- > 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003506 *p++ = (unsigned char)*s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003507 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003508
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003509 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003510 Py_XDECREF(v);
3511 return NULL;
3512}
3513
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003514/* create or adjust a UnicodeEncodeError */
3515static void make_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003516 const char *encoding,
3517 const Py_UNICODE *unicode, Py_ssize_t size,
3518 Py_ssize_t startpos, Py_ssize_t endpos,
3519 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003520{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003521 if (*exceptionObject == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003522 *exceptionObject = PyUnicodeEncodeError_Create(
3523 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003524 }
3525 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003526 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3527 goto onError;
3528 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3529 goto onError;
3530 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3531 goto onError;
3532 return;
3533 onError:
Serhiy Storchaka98a97222014-02-09 13:14:04 +02003534 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003535 }
3536}
3537
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003538/* raises a UnicodeEncodeError */
3539static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003540 const char *encoding,
3541 const Py_UNICODE *unicode, Py_ssize_t size,
3542 Py_ssize_t startpos, Py_ssize_t endpos,
3543 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003544{
3545 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003546 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003547 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003548 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003549}
3550
3551/* error handling callback helper:
3552 build arguments, call the callback and check the arguments,
3553 put the result into newpos and return the replacement string, which
3554 has to be freed by the caller */
3555static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003556 PyObject **errorHandler,
3557 const char *encoding, const char *reason,
3558 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3559 Py_ssize_t startpos, Py_ssize_t endpos,
3560 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003561{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003562 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003563
3564 PyObject *restuple;
3565 PyObject *resunicode;
3566
3567 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003568 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003569 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003570 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003571 }
3572
3573 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003574 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003575 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003576 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003577
3578 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003579 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003580 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003581 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003582 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00003583 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003584 Py_DECREF(restuple);
3585 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003586 }
3587 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003588 &resunicode, newpos)) {
3589 Py_DECREF(restuple);
3590 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003591 }
3592 if (*newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003593 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003594 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003595 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3596 Py_DECREF(restuple);
3597 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003598 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003599 Py_INCREF(resunicode);
3600 Py_DECREF(restuple);
3601 return resunicode;
3602}
3603
3604static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003605 Py_ssize_t size,
3606 const char *errors,
3607 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003608{
3609 /* output object */
3610 PyObject *res;
3611 /* pointers to the beginning and end+1 of input */
3612 const Py_UNICODE *startp = p;
3613 const Py_UNICODE *endp = p + size;
3614 /* pointer to the beginning of the unencodable characters */
3615 /* const Py_UNICODE *badp = NULL; */
3616 /* pointer into the output */
3617 char *str;
3618 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003619 Py_ssize_t respos = 0;
3620 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00003621 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3622 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003623 PyObject *errorHandler = NULL;
3624 PyObject *exc = NULL;
3625 /* the following variable is used for caching string comparisons
3626 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3627 int known_errorHandler = -1;
3628
3629 /* allocate enough for a simple encoding without
3630 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003631 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003632 if (res == NULL)
3633 goto onError;
3634 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003635 return res;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003636 str = PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003637 ressize = size;
3638
3639 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003640 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003641
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003642 /* can we encode this? */
3643 if (c<limit) {
3644 /* no overflow check, because we know that the space is enough */
3645 *str++ = (char)c;
3646 ++p;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003647 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003648 else {
3649 Py_ssize_t unicodepos = p-startp;
3650 Py_ssize_t requiredsize;
3651 PyObject *repunicode;
3652 Py_ssize_t repsize;
3653 Py_ssize_t newpos;
3654 Py_ssize_t respos;
3655 Py_UNICODE *uni2;
3656 /* startpos for collecting unencodable chars */
3657 const Py_UNICODE *collstart = p;
3658 const Py_UNICODE *collend = p;
3659 /* find all unecodable characters */
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003660 while ((collend < endp) && ((*collend) >= limit))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003661 ++collend;
3662 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3663 if (known_errorHandler==-1) {
3664 if ((errors==NULL) || (!strcmp(errors, "strict")))
3665 known_errorHandler = 1;
3666 else if (!strcmp(errors, "replace"))
3667 known_errorHandler = 2;
3668 else if (!strcmp(errors, "ignore"))
3669 known_errorHandler = 3;
3670 else if (!strcmp(errors, "xmlcharrefreplace"))
3671 known_errorHandler = 4;
3672 else
3673 known_errorHandler = 0;
3674 }
3675 switch (known_errorHandler) {
3676 case 1: /* strict */
3677 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3678 goto onError;
3679 case 2: /* replace */
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003680 while (collstart++ < collend)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003681 *str++ = '?'; /* fall through */
3682 case 3: /* ignore */
3683 p = collend;
3684 break;
3685 case 4: /* xmlcharrefreplace */
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003686 respos = str - PyString_AS_STRING(res);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003687 /* determine replacement size (temporarily (mis)uses p) */
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003688 requiredsize = respos;
3689 for (p = collstart; p < collend;) {
Serhiy Storchakae822b032013-08-06 16:56:26 +03003690 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003691 Py_ssize_t incr;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003692 if (ch < 10)
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003693 incr = 2+1+1;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003694 else if (ch < 100)
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003695 incr = 2+2+1;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003696 else if (ch < 1000)
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003697 incr = 2+3+1;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003698 else if (ch < 10000)
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003699 incr = 2+4+1;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003700 else if (ch < 100000)
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003701 incr = 2+5+1;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003702 else if (ch < 1000000)
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003703 incr = 2+6+1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003704 else
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003705 incr = 2+7+1;
3706 if (requiredsize > PY_SSIZE_T_MAX - incr)
3707 goto overflow;
3708 requiredsize += incr;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003709 }
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003710 if (requiredsize > PY_SSIZE_T_MAX - (endp - collend))
3711 goto overflow;
3712 requiredsize += endp - collend;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003713 if (requiredsize > ressize) {
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003714 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003715 requiredsize = 2*ressize;
3716 if (_PyString_Resize(&res, requiredsize))
3717 goto onError;
3718 str = PyString_AS_STRING(res) + respos;
3719 ressize = requiredsize;
3720 }
3721 /* generate replacement (temporarily (mis)uses p) */
Serhiy Storchakae822b032013-08-06 16:56:26 +03003722 for (p = collstart; p < collend;) {
3723 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
3724 str += sprintf(str, "&#%d;", (int)ch);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003725 }
3726 p = collend;
3727 break;
3728 default:
3729 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3730 encoding, reason, startp, size, &exc,
3731 collstart-startp, collend-startp, &newpos);
3732 if (repunicode == NULL)
3733 goto onError;
3734 /* need more space? (at least enough for what we have+the
3735 replacement+the rest of the string, so we won't have to
3736 check space for encodable characters) */
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003737 respos = str - PyString_AS_STRING(res);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003738 repsize = PyUnicode_GET_SIZE(repunicode);
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003739 if (respos > PY_SSIZE_T_MAX - repsize)
3740 goto overflow;
3741 requiredsize = respos + repsize;
3742 if (requiredsize > PY_SSIZE_T_MAX - (endp - collend))
3743 goto overflow;
3744 requiredsize += endp - collend;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003745 if (requiredsize > ressize) {
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003746 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003747 requiredsize = 2*ressize;
3748 if (_PyString_Resize(&res, requiredsize)) {
3749 Py_DECREF(repunicode);
3750 goto onError;
3751 }
3752 str = PyString_AS_STRING(res) + respos;
3753 ressize = requiredsize;
3754 }
3755 /* check if there is anything unencodable in the replacement
3756 and copy it to the output */
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003757 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2, ++str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003758 c = *uni2;
3759 if (c >= limit) {
3760 raise_encode_exception(&exc, encoding, startp, size,
3761 unicodepos, unicodepos+1, reason);
3762 Py_DECREF(repunicode);
3763 goto onError;
3764 }
3765 *str = (char)c;
3766 }
3767 p = startp + newpos;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003768 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00003769 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00003770 }
3771 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003772 /* Resize if we allocated to much */
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003773 respos = str - PyString_AS_STRING(res);
3774 if (respos < ressize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003775 /* If this falls res will be NULL */
3776 _PyString_Resize(&res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003777 Py_XDECREF(errorHandler);
3778 Py_XDECREF(exc);
3779 return res;
3780
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003781 overflow:
3782 PyErr_SetString(PyExc_OverflowError,
3783 "encoded result is too long for a Python string");
3784
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003785 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003786 Py_XDECREF(res);
3787 Py_XDECREF(errorHandler);
3788 Py_XDECREF(exc);
3789 return NULL;
3790}
3791
Guido van Rossumd57fd912000-03-10 22:53:23 +00003792PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003793 Py_ssize_t size,
3794 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003795{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003796 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003797}
3798
3799PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3800{
3801 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003802 PyErr_BadArgument();
3803 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003804 }
3805 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003806 PyUnicode_GET_SIZE(unicode),
3807 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003808}
3809
3810/* --- 7-bit ASCII Codec -------------------------------------------------- */
3811
Guido van Rossumd57fd912000-03-10 22:53:23 +00003812PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003813 Py_ssize_t size,
3814 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003815{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003816 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003817 PyUnicodeObject *v;
3818 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003819 Py_ssize_t startinpos;
3820 Py_ssize_t endinpos;
3821 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003822 const char *e;
3823 PyObject *errorHandler = NULL;
3824 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003825
Guido van Rossumd57fd912000-03-10 22:53:23 +00003826 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003827 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003828 Py_UNICODE r = *(unsigned char*)s;
3829 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003830 }
Tim Petersced69f82003-09-16 20:30:58 +00003831
Guido van Rossumd57fd912000-03-10 22:53:23 +00003832 v = _PyUnicode_New(size);
3833 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003834 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003835 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003836 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003837 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003838 e = s + size;
3839 while (s < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003840 register unsigned char c = (unsigned char)*s;
3841 if (c < 128) {
3842 *p++ = c;
3843 ++s;
3844 }
3845 else {
3846 startinpos = s-starts;
3847 endinpos = startinpos + 1;
3848 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3849 if (unicode_decode_call_errorhandler(
3850 errors, &errorHandler,
3851 "ascii", "ordinal not in range(128)",
3852 starts, size, &startinpos, &endinpos, &exc, &s,
3853 &v, &outpos, &p))
3854 goto onError;
3855 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003856 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003857 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003858 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3859 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003860 Py_XDECREF(errorHandler);
3861 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003862 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003863
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003864 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003865 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003866 Py_XDECREF(errorHandler);
3867 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003868 return NULL;
3869}
3870
Guido van Rossumd57fd912000-03-10 22:53:23 +00003871PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003872 Py_ssize_t size,
3873 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003874{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003875 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003876}
3877
3878PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3879{
3880 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003881 PyErr_BadArgument();
3882 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003883 }
3884 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003885 PyUnicode_GET_SIZE(unicode),
3886 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003887}
3888
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003889#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003890
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003891/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003892
Hirokazu Yamamoto52a34922009-03-21 10:32:52 +00003893#if SIZEOF_INT < SIZEOF_SIZE_T
Martin v. Löwisd8251432006-06-14 05:21:04 +00003894#define NEED_RETRY
3895#endif
3896
3897/* XXX This code is limited to "true" double-byte encodings, as
3898 a) it assumes an incomplete character consists of a single byte, and
3899 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003900 encodings, see IsDBCSLeadByteEx documentation. */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003901
/* Decide whether the byte at s[offset] should be treated as a DBCS lead
   byte.  Returns 0 when IsDBCSLeadByte() rejects the byte.  Otherwise the
   position returned by CharPrev(s, ...) is examined and 1 is returned when
   that position is the byte itself, when the byte found there is not a
   lead byte, or when it lies exactly two bytes back; 0 in every other
   case. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *candidate = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*candidate))
        return 0;

    before = CharPrev(s, candidate);
    if (before == candidate)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return (candidate - before == 2);
}
3912
3913/*
3914 * Decode MBCS string into unicode object. If 'final' is set, converts
3915 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3916 */
3917static int decode_mbcs(PyUnicodeObject **v,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003918 const char *s, /* MBCS string */
3919 int size, /* sizeof MBCS string */
3920 int final)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003921{
3922 Py_UNICODE *p;
3923 Py_ssize_t n = 0;
3924 int usize = 0;
3925
3926 assert(size >= 0);
3927
3928 /* Skip trailing lead-byte unless 'final' is set */
3929 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003930 --size;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003931
3932 /* First get the size of the result */
3933 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003934 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3935 if (usize == 0) {
3936 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3937 return -1;
3938 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003939 }
3940
3941 if (*v == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003942 /* Create unicode object */
3943 *v = _PyUnicode_New(usize);
3944 if (*v == NULL)
3945 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003946 }
3947 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003948 /* Extend unicode object */
3949 n = PyUnicode_GET_SIZE(*v);
3950 if (_PyUnicode_Resize(v, n + usize) < 0)
3951 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003952 }
3953
3954 /* Do the conversion */
3955 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003956 p = PyUnicode_AS_UNICODE(*v) + n;
3957 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3958 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3959 return -1;
3960 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003961 }
3962
3963 return size;
3964}
3965
3966PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003967 Py_ssize_t size,
3968 const char *errors,
3969 Py_ssize_t *consumed)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003970{
3971 PyUnicodeObject *v = NULL;
3972 int done;
3973
3974 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003975 *consumed = 0;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003976
3977#ifdef NEED_RETRY
3978 retry:
3979 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003980 done = decode_mbcs(&v, s, INT_MAX, 0);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003981 else
3982#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003983 done = decode_mbcs(&v, s, (int)size, !consumed);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003984
3985 if (done < 0) {
3986 Py_XDECREF(v);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003987 return NULL;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003988 }
3989
3990 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003991 *consumed += done;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003992
3993#ifdef NEED_RETRY
3994 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003995 s += done;
3996 size -= done;
3997 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003998 }
3999#endif
4000
4001 return (PyObject *)v;
4002}
4003
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004004PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004005 Py_ssize_t size,
4006 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004007{
Martin v. Löwisd8251432006-06-14 05:21:04 +00004008 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4009}
4010
4011/*
4012 * Convert unicode into string object (MBCS).
4013 * Returns 0 if succeed, -1 otherwise.
4014 */
4015static int encode_mbcs(PyObject **repr,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004016 const Py_UNICODE *p, /* unicode */
4017 int size) /* size of unicode */
Martin v. Löwisd8251432006-06-14 05:21:04 +00004018{
4019 int mbcssize = 0;
4020 Py_ssize_t n = 0;
4021
4022 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004023
4024 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00004025 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004026 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4027 if (mbcssize == 0) {
4028 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4029 return -1;
4030 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004031 }
4032
Martin v. Löwisd8251432006-06-14 05:21:04 +00004033 if (*repr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004034 /* Create string object */
4035 *repr = PyString_FromStringAndSize(NULL, mbcssize);
4036 if (*repr == NULL)
4037 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004038 }
4039 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004040 /* Extend string object */
4041 n = PyString_Size(*repr);
4042 if (_PyString_Resize(repr, n + mbcssize) < 0)
4043 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004044 }
4045
4046 /* Do the conversion */
4047 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004048 char *s = PyString_AS_STRING(*repr) + n;
4049 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4050 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4051 return -1;
4052 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00004053 }
4054
4055 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004056}
4057
4058PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004059 Py_ssize_t size,
4060 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004061{
Martin v. Löwisd8251432006-06-14 05:21:04 +00004062 PyObject *repr = NULL;
4063 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004064
Martin v. Löwisd8251432006-06-14 05:21:04 +00004065#ifdef NEED_RETRY
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004066 retry:
Martin v. Löwisd8251432006-06-14 05:21:04 +00004067 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004068 ret = encode_mbcs(&repr, p, INT_MAX);
Martin v. Löwisd8251432006-06-14 05:21:04 +00004069 else
4070#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004071 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004072
Martin v. Löwisd8251432006-06-14 05:21:04 +00004073 if (ret < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004074 Py_XDECREF(repr);
4075 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004076 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00004077
4078#ifdef NEED_RETRY
4079 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004080 p += INT_MAX;
4081 size -= INT_MAX;
4082 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004083 }
4084#endif
4085
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004086 return repr;
4087}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004088
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004089PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4090{
4091 if (!PyUnicode_Check(unicode)) {
4092 PyErr_BadArgument();
4093 return NULL;
4094 }
4095 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004096 PyUnicode_GET_SIZE(unicode),
4097 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004098}
4099
Martin v. Löwisd8251432006-06-14 05:21:04 +00004100#undef NEED_RETRY
4101
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004102#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004103
Guido van Rossumd57fd912000-03-10 22:53:23 +00004104/* --- Character Mapping Codec -------------------------------------------- */
4105
Guido van Rossumd57fd912000-03-10 22:53:23 +00004106PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004107 Py_ssize_t size,
4108 PyObject *mapping,
4109 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004110{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004111 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004112 Py_ssize_t startinpos;
4113 Py_ssize_t endinpos;
4114 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004115 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004116 PyUnicodeObject *v;
4117 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004118 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004119 PyObject *errorHandler = NULL;
4120 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004121 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004122 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004123
Guido van Rossumd57fd912000-03-10 22:53:23 +00004124 /* Default to Latin-1 */
4125 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004126 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004127
4128 v = _PyUnicode_New(size);
4129 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004130 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004131 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004132 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004133 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004134 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004135 if (PyUnicode_CheckExact(mapping)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004136 mapstring = PyUnicode_AS_UNICODE(mapping);
4137 maplen = PyUnicode_GET_SIZE(mapping);
4138 while (s < e) {
4139 unsigned char ch = *s;
4140 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004141
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004142 if (ch < maplen)
4143 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004144
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004145 if (x == 0xfffe) {
4146 /* undefined mapping */
4147 outpos = p-PyUnicode_AS_UNICODE(v);
4148 startinpos = s-starts;
4149 endinpos = startinpos+1;
4150 if (unicode_decode_call_errorhandler(
4151 errors, &errorHandler,
4152 "charmap", "character maps to <undefined>",
4153 starts, size, &startinpos, &endinpos, &exc, &s,
4154 &v, &outpos, &p)) {
4155 goto onError;
4156 }
4157 continue;
4158 }
4159 *p++ = x;
4160 ++s;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004161 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004162 }
4163 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004164 while (s < e) {
4165 unsigned char ch = *s;
4166 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004167
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004168 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4169 w = PyInt_FromLong((long)ch);
4170 if (w == NULL)
4171 goto onError;
4172 x = PyObject_GetItem(mapping, w);
4173 Py_DECREF(w);
4174 if (x == NULL) {
4175 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4176 /* No mapping found means: mapping is undefined. */
4177 PyErr_Clear();
Serhiy Storchaka95997452013-01-15 14:42:59 +02004178 goto Undefined;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004179 } else
4180 goto onError;
4181 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004182
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004183 /* Apply mapping */
Serhiy Storchaka95997452013-01-15 14:42:59 +02004184 if (x == Py_None)
4185 goto Undefined;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004186 if (PyInt_Check(x)) {
4187 long value = PyInt_AS_LONG(x);
Serhiy Storchaka95997452013-01-15 14:42:59 +02004188 if (value == 0xFFFE)
4189 goto Undefined;
Antoine Pitroue3ae3212012-11-17 21:14:58 +01004190 if (value < 0 || value > 0x10FFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004191 PyErr_SetString(PyExc_TypeError,
Antoine Pitroue3ae3212012-11-17 21:14:58 +01004192 "character mapping must be in range(0x110000)");
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004193 Py_DECREF(x);
4194 goto onError;
4195 }
Antoine Pitroue3ae3212012-11-17 21:14:58 +01004196
4197#ifndef Py_UNICODE_WIDE
4198 if (value > 0xFFFF) {
4199 /* see the code for 1-n mapping below */
4200 if (extrachars < 2) {
4201 /* resize first */
4202 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4203 Py_ssize_t needed = 10 - extrachars;
4204 extrachars += needed;
4205 /* XXX overflow detection missing */
4206 if (_PyUnicode_Resize(&v,
4207 PyUnicode_GET_SIZE(v) + needed) < 0) {
4208 Py_DECREF(x);
4209 goto onError;
4210 }
4211 p = PyUnicode_AS_UNICODE(v) + oldpos;
4212 }
4213 value -= 0x10000;
4214 *p++ = 0xD800 | (value >> 10);
4215 *p++ = 0xDC00 | (value & 0x3FF);
4216 extrachars -= 2;
4217 }
4218 else
4219#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004220 *p++ = (Py_UNICODE)value;
4221 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004222 else if (PyUnicode_Check(x)) {
4223 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004224
Serhiy Storchaka95997452013-01-15 14:42:59 +02004225 if (targetsize == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004226 /* 1-1 mapping */
Serhiy Storchaka95997452013-01-15 14:42:59 +02004227 Py_UNICODE value = *PyUnicode_AS_UNICODE(x);
4228 if (value == 0xFFFE)
4229 goto Undefined;
4230 *p++ = value;
4231 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004232 else if (targetsize > 1) {
4233 /* 1-n mapping */
4234 if (targetsize > extrachars) {
4235 /* resize first */
4236 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4237 Py_ssize_t needed = (targetsize - extrachars) + \
4238 (targetsize << 2);
4239 extrachars += needed;
4240 /* XXX overflow detection missing */
4241 if (_PyUnicode_Resize(&v,
4242 PyUnicode_GET_SIZE(v) + needed) < 0) {
4243 Py_DECREF(x);
4244 goto onError;
4245 }
4246 p = PyUnicode_AS_UNICODE(v) + oldpos;
4247 }
4248 Py_UNICODE_COPY(p,
4249 PyUnicode_AS_UNICODE(x),
4250 targetsize);
4251 p += targetsize;
4252 extrachars -= targetsize;
4253 }
4254 /* 1-0 mapping: skip the character */
4255 }
4256 else {
4257 /* wrong return value */
4258 PyErr_SetString(PyExc_TypeError,
4259 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004260 Py_DECREF(x);
4261 goto onError;
4262 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004263 Py_DECREF(x);
4264 ++s;
Serhiy Storchaka95997452013-01-15 14:42:59 +02004265 continue;
4266Undefined:
4267 /* undefined mapping */
4268 Py_XDECREF(x);
4269 outpos = p-PyUnicode_AS_UNICODE(v);
4270 startinpos = s-starts;
4271 endinpos = startinpos+1;
4272 if (unicode_decode_call_errorhandler(
4273 errors, &errorHandler,
4274 "charmap", "character maps to <undefined>",
4275 starts, size, &startinpos, &endinpos, &exc, &s,
4276 &v, &outpos, &p)) {
4277 goto onError;
4278 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004279 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004280 }
4281 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004282 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4283 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004284 Py_XDECREF(errorHandler);
4285 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004286 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004287
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004288 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004289 Py_XDECREF(errorHandler);
4290 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004291 Py_XDECREF(v);
4292 return NULL;
4293}
4294
Martin v. Löwis3f767792006-06-04 19:36:28 +00004295/* Charmap encoding: the lookup table */
4296
4297struct encoding_map{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004298 PyObject_HEAD
4299 unsigned char level1[32];
4300 int count2, count3;
4301 unsigned char level23[1];
Martin v. Löwis3f767792006-06-04 19:36:28 +00004302};
4303
4304static PyObject*
4305encoding_map_size(PyObject *obj, PyObject* args)
4306{
4307 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004308 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Martin v. Löwis3f767792006-06-04 19:36:28 +00004309 128*map->count3);
4310}
4311
4312static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004313 {"size", encoding_map_size, METH_NOARGS,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004314 PyDoc_STR("Return the size (in bytes) of this object") },
4315 { 0 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004316};
4317
4318static void
4319encoding_map_dealloc(PyObject* o)
4320{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004321 PyObject_FREE(o);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004322}
4323
4324static PyTypeObject EncodingMapType = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004325 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004326 "EncodingMap", /*tp_name*/
4327 sizeof(struct encoding_map), /*tp_basicsize*/
4328 0, /*tp_itemsize*/
4329 /* methods */
4330 encoding_map_dealloc, /*tp_dealloc*/
4331 0, /*tp_print*/
4332 0, /*tp_getattr*/
4333 0, /*tp_setattr*/
4334 0, /*tp_compare*/
4335 0, /*tp_repr*/
4336 0, /*tp_as_number*/
4337 0, /*tp_as_sequence*/
4338 0, /*tp_as_mapping*/
4339 0, /*tp_hash*/
4340 0, /*tp_call*/
4341 0, /*tp_str*/
4342 0, /*tp_getattro*/
4343 0, /*tp_setattro*/
4344 0, /*tp_as_buffer*/
4345 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4346 0, /*tp_doc*/
4347 0, /*tp_traverse*/
4348 0, /*tp_clear*/
4349 0, /*tp_richcompare*/
4350 0, /*tp_weaklistoffset*/
4351 0, /*tp_iter*/
4352 0, /*tp_iternext*/
4353 encoding_map_methods, /*tp_methods*/
4354 0, /*tp_members*/
4355 0, /*tp_getset*/
4356 0, /*tp_base*/
4357 0, /*tp_dict*/
4358 0, /*tp_descr_get*/
4359 0, /*tp_descr_set*/
4360 0, /*tp_dictoffset*/
4361 0, /*tp_init*/
4362 0, /*tp_alloc*/
4363 0, /*tp_new*/
4364 0, /*tp_free*/
4365 0, /*tp_is_gc*/
Martin v. Löwis3f767792006-06-04 19:36:28 +00004366};
4367
4368PyObject*
4369PyUnicode_BuildEncodingMap(PyObject* string)
4370{
4371 Py_UNICODE *decode;
4372 PyObject *result;
4373 struct encoding_map *mresult;
4374 int i;
4375 int need_dict = 0;
4376 unsigned char level1[32];
4377 unsigned char level2[512];
4378 unsigned char *mlevel1, *mlevel2, *mlevel3;
4379 int count2 = 0, count3 = 0;
4380
4381 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4382 PyErr_BadArgument();
4383 return NULL;
4384 }
4385 decode = PyUnicode_AS_UNICODE(string);
4386 memset(level1, 0xFF, sizeof level1);
4387 memset(level2, 0xFF, sizeof level2);
4388
4389 /* If there isn't a one-to-one mapping of NULL to \0,
4390 or if there are non-BMP characters, we need to use
4391 a mapping dictionary. */
4392 if (decode[0] != 0)
4393 need_dict = 1;
4394 for (i = 1; i < 256; i++) {
4395 int l1, l2;
4396 if (decode[i] == 0
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004397#ifdef Py_UNICODE_WIDE
Martin v. Löwis3f767792006-06-04 19:36:28 +00004398 || decode[i] > 0xFFFF
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004399#endif
4400 ) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004401 need_dict = 1;
4402 break;
4403 }
4404 if (decode[i] == 0xFFFE)
4405 /* unmapped character */
4406 continue;
4407 l1 = decode[i] >> 11;
4408 l2 = decode[i] >> 7;
4409 if (level1[l1] == 0xFF)
4410 level1[l1] = count2++;
4411 if (level2[l2] == 0xFF)
Benjamin Peterson857ce152009-01-31 16:29:18 +00004412 level2[l2] = count3++;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004413 }
4414
4415 if (count2 >= 0xFF || count3 >= 0xFF)
4416 need_dict = 1;
4417
4418 if (need_dict) {
4419 PyObject *result = PyDict_New();
4420 PyObject *key, *value;
4421 if (!result)
4422 return NULL;
4423 for (i = 0; i < 256; i++) {
Brett Cannona7f13ee2010-05-04 01:16:51 +00004424 value = NULL;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004425 key = PyInt_FromLong(decode[i]);
4426 value = PyInt_FromLong(i);
4427 if (!key || !value)
4428 goto failed1;
4429 if (PyDict_SetItem(result, key, value) == -1)
4430 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00004431 Py_DECREF(key);
4432 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004433 }
4434 return result;
4435 failed1:
4436 Py_XDECREF(key);
4437 Py_XDECREF(value);
4438 Py_DECREF(result);
4439 return NULL;
4440 }
4441
4442 /* Create a three-level trie */
4443 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4444 16*count2 + 128*count3 - 1);
4445 if (!result)
4446 return PyErr_NoMemory();
4447 PyObject_Init(result, &EncodingMapType);
4448 mresult = (struct encoding_map*)result;
4449 mresult->count2 = count2;
4450 mresult->count3 = count3;
4451 mlevel1 = mresult->level1;
4452 mlevel2 = mresult->level23;
4453 mlevel3 = mresult->level23 + 16*count2;
4454 memcpy(mlevel1, level1, 32);
4455 memset(mlevel2, 0xFF, 16*count2);
4456 memset(mlevel3, 0, 128*count3);
4457 count3 = 0;
4458 for (i = 1; i < 256; i++) {
4459 int o1, o2, o3, i2, i3;
4460 if (decode[i] == 0xFFFE)
4461 /* unmapped character */
4462 continue;
4463 o1 = decode[i]>>11;
4464 o2 = (decode[i]>>7) & 0xF;
4465 i2 = 16*mlevel1[o1] + o2;
4466 if (mlevel2[i2] == 0xFF)
4467 mlevel2[i2] = count3++;
4468 o3 = decode[i] & 0x7F;
4469 i3 = 128*mlevel2[i2] + o3;
4470 mlevel3[i3] = i;
4471 }
4472 return result;
4473}
4474
4475static int
4476encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4477{
4478 struct encoding_map *map = (struct encoding_map*)mapping;
4479 int l1 = c>>11;
4480 int l2 = (c>>7) & 0xF;
4481 int l3 = c & 0x7F;
4482 int i;
4483
4484#ifdef Py_UNICODE_WIDE
4485 if (c > 0xFFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004486 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004487 }
4488#endif
4489 if (c == 0)
4490 return 0;
4491 /* level 1*/
4492 i = map->level1[l1];
4493 if (i == 0xFF) {
4494 return -1;
4495 }
4496 /* level 2*/
4497 i = map->level23[16*i+l2];
4498 if (i == 0xFF) {
4499 return -1;
4500 }
4501 /* level 3 */
4502 i = map->level23[16*map->count2 + 128*i + l3];
4503 if (i == 0) {
4504 return -1;
4505 }
4506 return i;
4507}
4508
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004509/* Lookup the character ch in the mapping. If the character
4510 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004511 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004512static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004513{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004514 PyObject *w = PyInt_FromLong((long)c);
4515 PyObject *x;
4516
4517 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004518 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004519 x = PyObject_GetItem(mapping, w);
4520 Py_DECREF(w);
4521 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004522 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4523 /* No mapping found means: mapping is undefined. */
4524 PyErr_Clear();
4525 x = Py_None;
4526 Py_INCREF(x);
4527 return x;
4528 } else
4529 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004530 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004531 else if (x == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004532 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004533 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004534 long value = PyInt_AS_LONG(x);
4535 if (value < 0 || value > 255) {
4536 PyErr_SetString(PyExc_TypeError,
4537 "character mapping must be in range(256)");
4538 Py_DECREF(x);
4539 return NULL;
4540 }
4541 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004542 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004543 else if (PyString_Check(x))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004544 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004545 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004546 /* wrong return value */
4547 PyErr_SetString(PyExc_TypeError,
4548 "character mapping must return integer, None or str");
4549 Py_DECREF(x);
4550 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004551 }
4552}
4553
Martin v. Löwis3f767792006-06-04 19:36:28 +00004554static int
4555charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4556{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004557 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4558 /* exponentially overallocate to minimize reallocations */
4559 if (requiredsize < 2*outsize)
4560 requiredsize = 2*outsize;
4561 if (_PyString_Resize(outobj, requiredsize)) {
4562 return 0;
4563 }
4564 return 1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004565}
4566
/* Outcome of trying to encode a single character with a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the replacement was written to the output */
    enc_FAILED,     /* the mapping is undefined for this character */
    enc_EXCEPTION   /* an error occurred (lookup or reallocation) */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004570/* lookup the character, put the result in the output string and adjust
4571 various state variables. Reallocate the output string if not enough
4572 space is available. Return a new reference to the object that
4573 was put in the output buffer, or Py_None, if the mapping was undefined
4574 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004575 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004576static
Martin v. Löwis3f767792006-06-04 19:36:28 +00004577charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004578 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004579{
Martin v. Löwis3f767792006-06-04 19:36:28 +00004580 PyObject *rep;
4581 char *outstart;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004582 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004583
Christian Heimese93237d2007-12-19 02:37:44 +00004584 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004585 int res = encoding_map_lookup(c, mapping);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004586 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004587 if (res == -1)
4588 return enc_FAILED;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004589 if (outsize<requiredsize)
4590 if (!charmapencode_resize(outobj, outpos, requiredsize))
4591 return enc_EXCEPTION;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004592 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004593 outstart[(*outpos)++] = (char)res;
4594 return enc_SUCCESS;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004595 }
4596
4597 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004598 if (rep==NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004599 return enc_EXCEPTION;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004600 else if (rep==Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004601 Py_DECREF(rep);
4602 return enc_FAILED;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004603 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004604 if (PyInt_Check(rep)) {
4605 Py_ssize_t requiredsize = *outpos+1;
4606 if (outsize<requiredsize)
4607 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4608 Py_DECREF(rep);
4609 return enc_EXCEPTION;
4610 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004611 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004612 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004613 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004614 else {
4615 const char *repchars = PyString_AS_STRING(rep);
4616 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4617 Py_ssize_t requiredsize = *outpos+repsize;
4618 if (outsize<requiredsize)
4619 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4620 Py_DECREF(rep);
4621 return enc_EXCEPTION;
4622 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004623 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004624 memcpy(outstart + *outpos, repchars, repsize);
4625 *outpos += repsize;
4626 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004627 }
Georg Brandl9f167602006-06-04 21:46:16 +00004628 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004629 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004630}
4631
4632/* handle an error in PyUnicode_EncodeCharmap
4633 Return 0 on success, -1 on error */
4634static
4635int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004636 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004637 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004638 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004639 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004640{
4641 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004642 Py_ssize_t repsize;
4643 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004644 Py_UNICODE *uni2;
4645 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004646 Py_ssize_t collstartpos = *inpos;
4647 Py_ssize_t collendpos = *inpos+1;
4648 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004649 char *encoding = "charmap";
4650 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00004651 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004652
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004653 /* find all unencodable characters */
4654 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004655 PyObject *rep;
Christian Heimese93237d2007-12-19 02:37:44 +00004656 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004657 int res = encoding_map_lookup(p[collendpos], mapping);
4658 if (res != -1)
4659 break;
4660 ++collendpos;
4661 continue;
4662 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004663
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004664 rep = charmapencode_lookup(p[collendpos], mapping);
4665 if (rep==NULL)
4666 return -1;
4667 else if (rep!=Py_None) {
4668 Py_DECREF(rep);
4669 break;
4670 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004671 Py_DECREF(rep);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004672 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004673 }
4674 /* cache callback name lookup
4675 * (if not done yet, i.e. it's the first error) */
4676 if (*known_errorHandler==-1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004677 if ((errors==NULL) || (!strcmp(errors, "strict")))
4678 *known_errorHandler = 1;
4679 else if (!strcmp(errors, "replace"))
4680 *known_errorHandler = 2;
4681 else if (!strcmp(errors, "ignore"))
4682 *known_errorHandler = 3;
4683 else if (!strcmp(errors, "xmlcharrefreplace"))
4684 *known_errorHandler = 4;
4685 else
4686 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004687 }
4688 switch (*known_errorHandler) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004689 case 1: /* strict */
4690 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4691 return -1;
4692 case 2: /* replace */
4693 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004694 x = charmapencode_output('?', mapping, res, respos);
4695 if (x==enc_EXCEPTION) {
4696 return -1;
4697 }
4698 else if (x==enc_FAILED) {
4699 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4700 return -1;
4701 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004702 }
4703 /* fall through */
4704 case 3: /* ignore */
4705 *inpos = collendpos;
4706 break;
4707 case 4: /* xmlcharrefreplace */
Serhiy Storchakae822b032013-08-06 16:56:26 +03004708 /* generate replacement */
4709 for (collpos = collstartpos; collpos < collendpos;) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004710 char buffer[2+29+1+1];
4711 char *cp;
Serhiy Storchakae822b032013-08-06 16:56:26 +03004712 Py_UCS4 ch = p[collpos++];
4713#ifndef Py_UNICODE_WIDE
4714 if ((0xD800 <= ch && ch <= 0xDBFF) &&
4715 (collpos < collendpos) &&
4716 (0xDC00 <= p[collpos] && p[collpos] <= 0xDFFF)) {
4717 ch = ((((ch & 0x03FF) << 10) |
4718 ((Py_UCS4)p[collpos++] & 0x03FF)) + 0x10000);
4719 }
4720#endif
4721 sprintf(buffer, "&#%d;", (int)ch);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004722 for (cp = buffer; *cp; ++cp) {
4723 x = charmapencode_output(*cp, mapping, res, respos);
4724 if (x==enc_EXCEPTION)
4725 return -1;
4726 else if (x==enc_FAILED) {
4727 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4728 return -1;
4729 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004730 }
4731 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004732 *inpos = collendpos;
4733 break;
4734 default:
4735 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004736 encoding, reason, p, size, exceptionObject,
4737 collstartpos, collendpos, &newpos);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004738 if (repunicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004739 return -1;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004740 /* generate replacement */
4741 repsize = PyUnicode_GET_SIZE(repunicode);
4742 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004743 x = charmapencode_output(*uni2, mapping, res, respos);
4744 if (x==enc_EXCEPTION) {
4745 return -1;
4746 }
4747 else if (x==enc_FAILED) {
4748 Py_DECREF(repunicode);
4749 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4750 return -1;
4751 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004752 }
4753 *inpos = newpos;
4754 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004755 }
4756 return 0;
4757}
4758
Guido van Rossumd57fd912000-03-10 22:53:23 +00004759PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004760 Py_ssize_t size,
4761 PyObject *mapping,
4762 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004763{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004764 /* output object */
4765 PyObject *res = NULL;
4766 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004767 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004768 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004769 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004770 PyObject *errorHandler = NULL;
4771 PyObject *exc = NULL;
4772 /* the following variable is used for caching string comparisons
4773 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4774 * 3=ignore, 4=xmlcharrefreplace */
4775 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004776
4777 /* Default to Latin-1 */
4778 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004779 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004780
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004781 /* allocate enough for a simple encoding without
4782 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004783 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004784 if (res == NULL)
4785 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004786 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004787 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004788
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004789 while (inpos<size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004790 /* try to encode it */
4791 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4792 if (x==enc_EXCEPTION) /* error */
4793 goto onError;
4794 if (x==enc_FAILED) { /* unencodable character */
4795 if (charmap_encoding_error(p, size, &inpos, mapping,
4796 &exc,
4797 &known_errorHandler, &errorHandler, errors,
4798 &res, &respos)) {
4799 goto onError;
4800 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004801 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004802 else
4803 /* done with this character => adjust input position */
4804 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004805 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004806
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004807 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004808 if (respos<PyString_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004809 if (_PyString_Resize(&res, respos))
4810 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004811 }
4812 Py_XDECREF(exc);
4813 Py_XDECREF(errorHandler);
4814 return res;
4815
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004816 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004817 Py_XDECREF(res);
4818 Py_XDECREF(exc);
4819 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004820 return NULL;
4821}
4822
4823PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004824 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004825{
4826 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004827 PyErr_BadArgument();
4828 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004829 }
4830 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004831 PyUnicode_GET_SIZE(unicode),
4832 mapping,
4833 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004834}
4835
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004836/* create or adjust a UnicodeTranslateError */
4837static void make_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004838 const Py_UNICODE *unicode, Py_ssize_t size,
4839 Py_ssize_t startpos, Py_ssize_t endpos,
4840 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004841{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004842 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004843 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004844 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004845 }
4846 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004847 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4848 goto onError;
4849 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4850 goto onError;
4851 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4852 goto onError;
4853 return;
4854 onError:
Serhiy Storchaka98a97222014-02-09 13:14:04 +02004855 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004856 }
4857}
4858
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004859/* raises a UnicodeTranslateError */
4860static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004861 const Py_UNICODE *unicode, Py_ssize_t size,
4862 Py_ssize_t startpos, Py_ssize_t endpos,
4863 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004864{
4865 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004866 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004867 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004868 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004869}
4870
4871/* error handling callback helper:
4872 build arguments, call the callback and check the arguments,
4873 put the result into newpos and return the replacement string, which
4874 has to be freed by the caller */
4875static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004876 PyObject **errorHandler,
4877 const char *reason,
4878 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4879 Py_ssize_t startpos, Py_ssize_t endpos,
4880 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004881{
Martin v. Löwis412fb672006-04-13 06:34:32 +00004882 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004883
Martin v. Löwis412fb672006-04-13 06:34:32 +00004884 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004885 PyObject *restuple;
4886 PyObject *resunicode;
4887
4888 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004889 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004890 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004891 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004892 }
4893
4894 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004895 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004896 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004897 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004898
4899 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004900 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004901 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004902 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004903 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00004904 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004905 Py_DECREF(restuple);
4906 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004907 }
4908 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004909 &resunicode, &i_newpos)) {
4910 Py_DECREF(restuple);
4911 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004912 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004913 if (i_newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004914 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004915 else
4916 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004917 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004918 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4919 Py_DECREF(restuple);
4920 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004921 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004922 Py_INCREF(resunicode);
4923 Py_DECREF(restuple);
4924 return resunicode;
4925}
4926
4927/* Lookup the character ch in the mapping and put the result in result,
4928 which must be decrefed by the caller.
4929 Return 0 on success, -1 on error */
4930static
4931int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4932{
4933 PyObject *w = PyInt_FromLong((long)c);
4934 PyObject *x;
4935
4936 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004937 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004938 x = PyObject_GetItem(mapping, w);
4939 Py_DECREF(w);
4940 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004941 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4942 /* No mapping found means: use 1:1 mapping. */
4943 PyErr_Clear();
4944 *result = NULL;
4945 return 0;
4946 } else
4947 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004948 }
4949 else if (x == Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004950 *result = x;
4951 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004952 }
4953 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004954 long value = PyInt_AS_LONG(x);
4955 long max = PyUnicode_GetMax();
4956 if (value < 0 || value > max) {
4957 PyErr_Format(PyExc_TypeError,
4958 "character mapping must be in range(0x%lx)", max+1);
4959 Py_DECREF(x);
4960 return -1;
4961 }
4962 *result = x;
4963 return 0;
4964 }
4965 else if (PyUnicode_Check(x)) {
4966 *result = x;
4967 return 0;
4968 }
4969 else {
4970 /* wrong return value */
4971 PyErr_SetString(PyExc_TypeError,
4972 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004973 Py_DECREF(x);
4974 return -1;
4975 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004976}
4977/* ensure that *outobj is at least requiredsize characters long,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004978 if not reallocate and adjust various state variables.
4979 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004980static
Walter Dörwald4894c302003-10-24 14:25:28 +00004981int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004982 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004983{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004984 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004985 if (requiredsize > oldsize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004986 /* remember old output position */
4987 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4988 /* exponentially overallocate to minimize reallocations */
4989 if (requiredsize < 2 * oldsize)
4990 requiredsize = 2 * oldsize;
4991 if (PyUnicode_Resize(outobj, requiredsize) < 0)
4992 return -1;
4993 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004994 }
4995 return 0;
4996}
4997/* lookup the character, put the result in the output string and adjust
4998 various state variables. Return a new reference to the object that
4999 was put in the output buffer in *result, or Py_None, if the mapping was
5000 undefined (in which case no character was written).
5001 The called must decref result.
5002 Return 0 on success, -1 on error. */
5003static
Walter Dörwald4894c302003-10-24 14:25:28 +00005004int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005005 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5006 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005007{
Walter Dörwald4894c302003-10-24 14:25:28 +00005008 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005009 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005010 if (*res==NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005011 /* not found => default to 1:1 mapping */
5012 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005013 }
5014 else if (*res==Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005015 ;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005016 else if (PyInt_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005017 /* no overflow check, because we know that the space is enough */
5018 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005019 }
5020 else if (PyUnicode_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005021 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5022 if (repsize==1) {
5023 /* no overflow check, because we know that the space is enough */
5024 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5025 }
5026 else if (repsize!=0) {
5027 /* more than one character */
5028 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5029 (insize - (curinp-startinp)) +
5030 repsize - 1;
5031 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5032 return -1;
5033 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5034 *outp += repsize;
5035 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005036 }
5037 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005038 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005039 return 0;
5040}
5041
5042PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005043 Py_ssize_t size,
5044 PyObject *mapping,
5045 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005046{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005047 /* output object */
5048 PyObject *res = NULL;
5049 /* pointers to the beginning and end+1 of input */
5050 const Py_UNICODE *startp = p;
5051 const Py_UNICODE *endp = p + size;
5052 /* pointer into the output */
5053 Py_UNICODE *str;
5054 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005055 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005056 char *reason = "character maps to <undefined>";
5057 PyObject *errorHandler = NULL;
5058 PyObject *exc = NULL;
5059 /* the following variable is used for caching string comparisons
5060 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5061 * 3=ignore, 4=xmlcharrefreplace */
5062 int known_errorHandler = -1;
5063
Guido van Rossumd57fd912000-03-10 22:53:23 +00005064 if (mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005065 PyErr_BadArgument();
5066 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005067 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005068
5069 /* allocate enough for a simple 1:1 translation without
5070 replacements, if we need more, we'll resize */
5071 res = PyUnicode_FromUnicode(NULL, size);
5072 if (res == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005073 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005074 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005075 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005076 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005077
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005078 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005079 /* try to encode it */
5080 PyObject *x = NULL;
5081 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5082 Py_XDECREF(x);
5083 goto onError;
5084 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005085 Py_XDECREF(x);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005086 if (x!=Py_None) /* it worked => adjust input pointer */
5087 ++p;
5088 else { /* untranslatable character */
5089 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5090 Py_ssize_t repsize;
5091 Py_ssize_t newpos;
5092 Py_UNICODE *uni2;
5093 /* startpos for collecting untranslatable chars */
5094 const Py_UNICODE *collstart = p;
5095 const Py_UNICODE *collend = p+1;
5096 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005097
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005098 /* find all untranslatable characters */
5099 while (collend < endp) {
5100 if (charmaptranslate_lookup(*collend, mapping, &x))
5101 goto onError;
5102 Py_XDECREF(x);
5103 if (x!=Py_None)
5104 break;
5105 ++collend;
5106 }
5107 /* cache callback name lookup
5108 * (if not done yet, i.e. it's the first error) */
5109 if (known_errorHandler==-1) {
5110 if ((errors==NULL) || (!strcmp(errors, "strict")))
5111 known_errorHandler = 1;
5112 else if (!strcmp(errors, "replace"))
5113 known_errorHandler = 2;
5114 else if (!strcmp(errors, "ignore"))
5115 known_errorHandler = 3;
5116 else if (!strcmp(errors, "xmlcharrefreplace"))
5117 known_errorHandler = 4;
5118 else
5119 known_errorHandler = 0;
5120 }
5121 switch (known_errorHandler) {
5122 case 1: /* strict */
5123 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson857ce152009-01-31 16:29:18 +00005124 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005125 case 2: /* replace */
5126 /* No need to check for space, this is a 1:1 replacement */
5127 for (coll = collstart; coll<collend; ++coll)
5128 *str++ = '?';
5129 /* fall through */
5130 case 3: /* ignore */
5131 p = collend;
5132 break;
5133 case 4: /* xmlcharrefreplace */
5134 /* generate replacement (temporarily (mis)uses p) */
Serhiy Storchakae822b032013-08-06 16:56:26 +03005135 for (p = collstart; p < collend;) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005136 char buffer[2+29+1+1];
5137 char *cp;
Serhiy Storchakae822b032013-08-06 16:56:26 +03005138 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
5139 sprintf(buffer, "&#%d;", (int)ch);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005140 if (charmaptranslate_makespace(&res, &str,
5141 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5142 goto onError;
5143 for (cp = buffer; *cp; ++cp)
5144 *str++ = *cp;
5145 }
5146 p = collend;
5147 break;
5148 default:
5149 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5150 reason, startp, size, &exc,
5151 collstart-startp, collend-startp, &newpos);
5152 if (repunicode == NULL)
5153 goto onError;
5154 /* generate replacement */
5155 repsize = PyUnicode_GET_SIZE(repunicode);
5156 if (charmaptranslate_makespace(&res, &str,
5157 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5158 Py_DECREF(repunicode);
5159 goto onError;
5160 }
5161 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5162 *str++ = *uni2;
5163 p = startp + newpos;
5164 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00005165 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005166 }
5167 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005168 /* Resize if we allocated to much */
5169 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005170 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005171 if (PyUnicode_Resize(&res, respos) < 0)
5172 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005173 }
5174 Py_XDECREF(exc);
5175 Py_XDECREF(errorHandler);
5176 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005177
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005178 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005179 Py_XDECREF(res);
5180 Py_XDECREF(exc);
5181 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005182 return NULL;
5183}
5184
5185PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005186 PyObject *mapping,
5187 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005188{
5189 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005190
Guido van Rossumd57fd912000-03-10 22:53:23 +00005191 str = PyUnicode_FromObject(str);
5192 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005193 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005194 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005195 PyUnicode_GET_SIZE(str),
5196 mapping,
5197 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005198 Py_DECREF(str);
5199 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005200
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005201 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005202 Py_XDECREF(str);
5203 return NULL;
5204}
Tim Petersced69f82003-09-16 20:30:58 +00005205
Guido van Rossum9e896b32000-04-05 20:11:21 +00005206/* --- Decimal Encoder ---------------------------------------------------- */
5207
5208int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005209 Py_ssize_t length,
5210 char *output,
5211 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005212{
5213 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005214 PyObject *errorHandler = NULL;
5215 PyObject *exc = NULL;
5216 const char *encoding = "decimal";
5217 const char *reason = "invalid decimal Unicode string";
5218 /* the following variable is used for caching string comparisons
5219 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5220 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005221
5222 if (output == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005223 PyErr_BadArgument();
5224 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005225 }
5226
5227 p = s;
5228 end = s + length;
5229 while (p < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005230 register Py_UNICODE ch = *p;
5231 int decimal;
5232 PyObject *repunicode;
5233 Py_ssize_t repsize;
5234 Py_ssize_t newpos;
5235 Py_UNICODE *uni2;
5236 Py_UNICODE *collstart;
5237 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005238
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005239 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005240 *output++ = ' ';
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005241 ++p;
5242 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005243 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005244 decimal = Py_UNICODE_TODECIMAL(ch);
5245 if (decimal >= 0) {
5246 *output++ = '0' + decimal;
5247 ++p;
5248 continue;
5249 }
5250 if (0 < ch && ch < 256) {
5251 *output++ = (char)ch;
5252 ++p;
5253 continue;
5254 }
5255 /* All other characters are considered unencodable */
5256 collstart = p;
Victor Stinner975134e2011-11-22 01:54:19 +01005257 for (collend = p+1; collend < end; collend++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005258 if ((0 < *collend && *collend < 256) ||
Victor Stinner975134e2011-11-22 01:54:19 +01005259 Py_UNICODE_ISSPACE(*collend) ||
5260 0 <= Py_UNICODE_TODECIMAL(*collend))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005261 break;
5262 }
5263 /* cache callback name lookup
5264 * (if not done yet, i.e. it's the first error) */
5265 if (known_errorHandler==-1) {
5266 if ((errors==NULL) || (!strcmp(errors, "strict")))
5267 known_errorHandler = 1;
5268 else if (!strcmp(errors, "replace"))
5269 known_errorHandler = 2;
5270 else if (!strcmp(errors, "ignore"))
5271 known_errorHandler = 3;
5272 else if (!strcmp(errors, "xmlcharrefreplace"))
5273 known_errorHandler = 4;
5274 else
5275 known_errorHandler = 0;
5276 }
5277 switch (known_errorHandler) {
5278 case 1: /* strict */
5279 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5280 goto onError;
5281 case 2: /* replace */
5282 for (p = collstart; p < collend; ++p)
5283 *output++ = '?';
5284 /* fall through */
5285 case 3: /* ignore */
5286 p = collend;
5287 break;
5288 case 4: /* xmlcharrefreplace */
5289 /* generate replacement (temporarily (mis)uses p) */
Serhiy Storchakae822b032013-08-06 16:56:26 +03005290 for (p = collstart; p < collend;) {
5291 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
5292 output += sprintf(output, "&#%d;", ch);
5293 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005294 p = collend;
5295 break;
5296 default:
5297 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5298 encoding, reason, s, length, &exc,
5299 collstart-s, collend-s, &newpos);
5300 if (repunicode == NULL)
5301 goto onError;
5302 /* generate replacement */
5303 repsize = PyUnicode_GET_SIZE(repunicode);
5304 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5305 Py_UNICODE ch = *uni2;
5306 if (Py_UNICODE_ISSPACE(ch))
5307 *output++ = ' ';
5308 else {
5309 decimal = Py_UNICODE_TODECIMAL(ch);
5310 if (decimal >= 0)
5311 *output++ = '0' + decimal;
5312 else if (0 < ch && ch < 256)
5313 *output++ = (char)ch;
5314 else {
5315 Py_DECREF(repunicode);
5316 raise_encode_exception(&exc, encoding,
5317 s, length, collstart-s, collend-s, reason);
5318 goto onError;
5319 }
5320 }
5321 }
5322 p = s + newpos;
5323 Py_DECREF(repunicode);
5324 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005325 }
5326 /* 0-terminate the output string */
5327 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005328 Py_XDECREF(exc);
5329 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005330 return 0;
5331
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005332 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005333 Py_XDECREF(exc);
5334 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005335 return -1;
5336}
5337
Guido van Rossumd57fd912000-03-10 22:53:23 +00005338/* --- Helpers ------------------------------------------------------------ */
5339
Eric Smitha9f7d622008-02-17 19:46:49 +00005340#include "stringlib/unicodedefs.h"
Fredrik Lundha50d2012006-05-26 17:04:58 +00005341#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005342
5343#include "stringlib/count.h"
5344#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005345#include "stringlib/partition.h"
Antoine Pitrou64672132010-01-13 07:55:48 +00005346#include "stringlib/split.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005347
Fredrik Lundhc8162812006-05-26 19:33:03 +00005348/* helper macro to fixup start/end slice values
 *
 * Normalises a (start, end) slice pair against a sequence length `len`:
 *   - end greater than len is clamped to len;
 *   - a negative end has len added and is then clamped to 0;
 *   - a negative start has len added and is then clamped to 0.
 * start is NOT clamped against len or against end; callers must cope
 * with start > end themselves.
 *
 * Usage notes:
 *   - start and end are assigned to, so they must be modifiable lvalues;
 *   - every argument is evaluated more than once and is not parenthesised
 *     in the expansion, so pass only simple, side-effect-free expressions;
 *   - the expansion is a bare sequence of `if` statements (no
 *     do { } while (0) wrapper), so use it only as a full statement
 *     inside a braced block.
 */
Antoine Pitrou64672132010-01-13 07:55:48 +00005349#define ADJUST_INDICES(start, end, len) \
5350 if (end > len) \
5351 end = len; \
5352 else if (end < 0) { \
5353 end += len; \
5354 if (end < 0) \
5355 end = 0; \
5356 } \
5357 if (start < 0) { \
5358 start += len; \
5359 if (start < 0) \
5360 start = 0; \
5361 }
Fredrik Lundhc8162812006-05-26 19:33:03 +00005362
Martin v. Löwis18e16552006-02-15 17:27:45 +00005363Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005364 PyObject *substr,
5365 Py_ssize_t start,
5366 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005367{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005368 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005369 PyUnicodeObject* str_obj;
5370 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005371
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005372 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5373 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005374 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005375 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5376 if (!sub_obj) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005377 Py_DECREF(str_obj);
5378 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005379 }
Tim Petersced69f82003-09-16 20:30:58 +00005380
Antoine Pitrou64672132010-01-13 07:55:48 +00005381 ADJUST_INDICES(start, end, str_obj->length);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005382 result = stringlib_count(
Antoine Pitrou64672132010-01-13 07:55:48 +00005383 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
5384 PY_SSIZE_T_MAX
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005385 );
5386
5387 Py_DECREF(sub_obj);
5388 Py_DECREF(str_obj);
5389
Guido van Rossumd57fd912000-03-10 22:53:23 +00005390 return result;
5391}
5392
Martin v. Löwis18e16552006-02-15 17:27:45 +00005393Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005394 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005395 Py_ssize_t start,
5396 Py_ssize_t end,
5397 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005398{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005399 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005400
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005401 str = PyUnicode_FromObject(str);
5402 if (!str)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005403 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005404 sub = PyUnicode_FromObject(sub);
5405 if (!sub) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005406 Py_DECREF(str);
5407 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005408 }
Tim Petersced69f82003-09-16 20:30:58 +00005409
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005410 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005411 result = stringlib_find_slice(
5412 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5413 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5414 start, end
5415 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005416 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005417 result = stringlib_rfind_slice(
5418 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5419 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5420 start, end
5421 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005422
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005423 Py_DECREF(str);
5424 Py_DECREF(sub);
5425
Guido van Rossumd57fd912000-03-10 22:53:23 +00005426 return result;
5427}
5428
Tim Petersced69f82003-09-16 20:30:58 +00005429static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005430int tailmatch(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005431 PyUnicodeObject *substring,
5432 Py_ssize_t start,
5433 Py_ssize_t end,
5434 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005435{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005436 if (substring->length == 0)
5437 return 1;
5438
Antoine Pitrou64672132010-01-13 07:55:48 +00005439 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005440 end -= substring->length;
5441 if (end < start)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005442 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005443
5444 if (direction > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005445 if (Py_UNICODE_MATCH(self, end, substring))
5446 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005447 } else {
5448 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005449 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005450 }
5451
5452 return 0;
5453}
5454
Martin v. Löwis18e16552006-02-15 17:27:45 +00005455Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005456 PyObject *substr,
5457 Py_ssize_t start,
5458 Py_ssize_t end,
5459 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005460{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005461 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005462
Guido van Rossumd57fd912000-03-10 22:53:23 +00005463 str = PyUnicode_FromObject(str);
5464 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005465 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005466 substr = PyUnicode_FromObject(substr);
5467 if (substr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005468 Py_DECREF(str);
5469 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005470 }
Tim Petersced69f82003-09-16 20:30:58 +00005471
Guido van Rossumd57fd912000-03-10 22:53:23 +00005472 result = tailmatch((PyUnicodeObject *)str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005473 (PyUnicodeObject *)substr,
5474 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005475 Py_DECREF(str);
5476 Py_DECREF(substr);
5477 return result;
5478}
5479
Guido van Rossumd57fd912000-03-10 22:53:23 +00005480/* Apply fixfct filter to the Unicode object self and return a
5481 reference to the modified object */
5482
Tim Petersced69f82003-09-16 20:30:58 +00005483static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005484PyObject *fixup(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005485 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005486{
5487
5488 PyUnicodeObject *u;
5489
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005490 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005491 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005492 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005493
5494 Py_UNICODE_COPY(u->str, self->str, self->length);
5495
Tim Peters7a29bd52001-09-12 03:03:31 +00005496 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005497 /* fixfct should return TRUE if it modified the buffer. If
5498 FALSE, return a reference to the original buffer instead
5499 (to save space, not time) */
5500 Py_INCREF(self);
5501 Py_DECREF(u);
5502 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005503 }
5504 return (PyObject*) u;
5505}
5506
Tim Petersced69f82003-09-16 20:30:58 +00005507static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005508int fixupper(PyUnicodeObject *self)
5509{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005510 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005511 Py_UNICODE *s = self->str;
5512 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005513
Guido van Rossumd57fd912000-03-10 22:53:23 +00005514 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005515 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005516
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005517 ch = Py_UNICODE_TOUPPER(*s);
5518 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005519 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005520 *s = ch;
5521 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005522 s++;
5523 }
5524
5525 return status;
5526}
5527
Tim Petersced69f82003-09-16 20:30:58 +00005528static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005529int fixlower(PyUnicodeObject *self)
5530{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005531 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005532 Py_UNICODE *s = self->str;
5533 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005534
Guido van Rossumd57fd912000-03-10 22:53:23 +00005535 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005536 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005537
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005538 ch = Py_UNICODE_TOLOWER(*s);
5539 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005540 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005541 *s = ch;
5542 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005543 s++;
5544 }
5545
5546 return status;
5547}
5548
Tim Petersced69f82003-09-16 20:30:58 +00005549static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005550int fixswapcase(PyUnicodeObject *self)
5551{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005552 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005553 Py_UNICODE *s = self->str;
5554 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005555
Guido van Rossumd57fd912000-03-10 22:53:23 +00005556 while (len-- > 0) {
5557 if (Py_UNICODE_ISUPPER(*s)) {
5558 *s = Py_UNICODE_TOLOWER(*s);
5559 status = 1;
5560 } else if (Py_UNICODE_ISLOWER(*s)) {
5561 *s = Py_UNICODE_TOUPPER(*s);
5562 status = 1;
5563 }
5564 s++;
5565 }
5566
5567 return status;
5568}
5569
Tim Petersced69f82003-09-16 20:30:58 +00005570static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005571int fixcapitalize(PyUnicodeObject *self)
5572{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005573 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005574 Py_UNICODE *s = self->str;
5575 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005576
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005577 if (len == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005578 return 0;
Ezio Melotti15d6b652011-08-15 09:22:24 +03005579 if (!Py_UNICODE_ISUPPER(*s)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005580 *s = Py_UNICODE_TOUPPER(*s);
5581 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005582 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005583 s++;
5584 while (--len > 0) {
Ezio Melotti15d6b652011-08-15 09:22:24 +03005585 if (!Py_UNICODE_ISLOWER(*s)) {
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005586 *s = Py_UNICODE_TOLOWER(*s);
5587 status = 1;
5588 }
5589 s++;
5590 }
5591 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005592}
5593
5594static
5595int fixtitle(PyUnicodeObject *self)
5596{
5597 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5598 register Py_UNICODE *e;
5599 int previous_is_cased;
5600
5601 /* Shortcut for single character strings */
5602 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005603 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5604 if (*p != ch) {
5605 *p = ch;
5606 return 1;
5607 }
5608 else
5609 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005610 }
Tim Petersced69f82003-09-16 20:30:58 +00005611
Guido van Rossumd57fd912000-03-10 22:53:23 +00005612 e = p + PyUnicode_GET_SIZE(self);
5613 previous_is_cased = 0;
5614 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005615 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005616
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005617 if (previous_is_cased)
5618 *p = Py_UNICODE_TOLOWER(ch);
5619 else
5620 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005621
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005622 if (Py_UNICODE_ISLOWER(ch) ||
5623 Py_UNICODE_ISUPPER(ch) ||
5624 Py_UNICODE_ISTITLE(ch))
5625 previous_is_cased = 1;
5626 else
5627 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005628 }
5629 return 1;
5630}
5631
Tim Peters8ce9f162004-08-27 01:49:32 +00005632PyObject *
5633PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005634{
Tim Peters8ce9f162004-08-27 01:49:32 +00005635 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005636 const Py_UNICODE blank = ' ';
5637 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00005638 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005639 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00005640 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5641 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005642 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5643 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005644 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005645 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005646 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005647
Benjamin Peterson7a91bf82014-02-15 13:02:52 -05005648 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00005649 if (fseq == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005650 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005651 }
5652
Tim Peters91879ab2004-08-27 22:35:44 +00005653 /* Grrrr. A codec may be invoked to convert str objects to
5654 * Unicode, and so it's possible to call back into Python code
5655 * during PyUnicode_FromObject(), and so it's possible for a sick
5656 * codec to change the size of fseq (if seq is a list). Therefore
5657 * we have to keep refetching the size -- can't assume seqlen
5658 * is invariant.
5659 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005660 seqlen = PySequence_Fast_GET_SIZE(fseq);
5661 /* If empty sequence, return u"". */
5662 if (seqlen == 0) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005663 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5664 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00005665 }
5666 /* If singleton sequence with an exact Unicode, return that. */
5667 if (seqlen == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005668 item = PySequence_Fast_GET_ITEM(fseq, 0);
5669 if (PyUnicode_CheckExact(item)) {
5670 Py_INCREF(item);
5671 res = (PyUnicodeObject *)item;
5672 goto Done;
5673 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005674 }
5675
Tim Peters05eba1f2004-08-27 21:32:02 +00005676 /* At least two items to join, or one that isn't exact Unicode. */
5677 if (seqlen > 1) {
5678 /* Set up sep and seplen -- they're needed. */
Benjamin Peterson857ce152009-01-31 16:29:18 +00005679 if (separator == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005680 sep = &blank;
5681 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005682 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005683 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005684 internal_separator = PyUnicode_FromObject(separator);
5685 if (internal_separator == NULL)
5686 goto onError;
5687 sep = PyUnicode_AS_UNICODE(internal_separator);
5688 seplen = PyUnicode_GET_SIZE(internal_separator);
5689 /* In case PyUnicode_FromObject() mutated seq. */
5690 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005691 }
5692 }
5693
5694 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005695 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005696 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005697 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005698 res_p = PyUnicode_AS_UNICODE(res);
5699 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005700
Tim Peters05eba1f2004-08-27 21:32:02 +00005701 for (i = 0; i < seqlen; ++i) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005702 Py_ssize_t itemlen;
5703 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005704
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005705 item = PySequence_Fast_GET_ITEM(fseq, i);
5706 /* Convert item to Unicode. */
5707 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5708 PyErr_Format(PyExc_TypeError,
5709 "sequence item %zd: expected string or Unicode,"
5710 " %.80s found",
5711 i, Py_TYPE(item)->tp_name);
5712 goto onError;
5713 }
5714 item = PyUnicode_FromObject(item);
5715 if (item == NULL)
5716 goto onError;
5717 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005718
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005719 /* In case PyUnicode_FromObject() mutated seq. */
5720 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters91879ab2004-08-27 22:35:44 +00005721
Tim Peters8ce9f162004-08-27 01:49:32 +00005722 /* Make sure we have enough space for the separator and the item. */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005723 itemlen = PyUnicode_GET_SIZE(item);
5724 new_res_used = res_used + itemlen;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005725 if (new_res_used < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005726 goto Overflow;
5727 if (i < seqlen - 1) {
5728 new_res_used += seplen;
5729 if (new_res_used < 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00005730 goto Overflow;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005731 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005732 if (new_res_used > res_alloc) {
5733 /* double allocated size until it's big enough */
5734 do {
5735 res_alloc += res_alloc;
5736 if (res_alloc <= 0)
5737 goto Overflow;
5738 } while (new_res_used > res_alloc);
5739 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5740 Py_DECREF(item);
5741 goto onError;
5742 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005743 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005744 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005745
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005746 /* Copy item, and maybe the separator. */
5747 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5748 res_p += itemlen;
5749 if (i < seqlen - 1) {
5750 Py_UNICODE_COPY(res_p, sep, seplen);
5751 res_p += seplen;
5752 }
5753 Py_DECREF(item);
5754 res_used = new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005755 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005756
Tim Peters05eba1f2004-08-27 21:32:02 +00005757 /* Shrink res to match the used area; this probably can't fail,
5758 * but it's cheap to check.
5759 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005760 if (_PyUnicode_Resize(&res, res_used) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005761 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005762
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005763 Done:
Tim Peters8ce9f162004-08-27 01:49:32 +00005764 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005765 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005766 return (PyObject *)res;
5767
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005768 Overflow:
Tim Peters8ce9f162004-08-27 01:49:32 +00005769 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00005770 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005771 Py_DECREF(item);
5772 /* fall through */
5773
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005774 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005775 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005776 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005777 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005778 return NULL;
5779}
5780
Tim Petersced69f82003-09-16 20:30:58 +00005781static
5782PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005783 Py_ssize_t left,
5784 Py_ssize_t right,
5785 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005786{
5787 PyUnicodeObject *u;
5788
5789 if (left < 0)
5790 left = 0;
5791 if (right < 0)
5792 right = 0;
5793
Tim Peters7a29bd52001-09-12 03:03:31 +00005794 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005795 Py_INCREF(self);
5796 return self;
5797 }
5798
Neal Norwitze7d8be82008-07-31 17:17:14 +00005799 if (left > PY_SSIZE_T_MAX - self->length ||
5800 right > PY_SSIZE_T_MAX - (left + self->length)) {
5801 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5802 return NULL;
5803 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005804 u = _PyUnicode_New(left + self->length + right);
5805 if (u) {
5806 if (left)
5807 Py_UNICODE_FILL(u->str, fill, left);
5808 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5809 if (right)
5810 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5811 }
5812
5813 return u;
5814}
5815
Antoine Pitrou64672132010-01-13 07:55:48 +00005816PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005817{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005818 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005819
5820 string = PyUnicode_FromObject(string);
5821 if (string == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005822 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005823
Antoine Pitrou64672132010-01-13 07:55:48 +00005824 list = stringlib_splitlines(
5825 (PyObject*) string, PyUnicode_AS_UNICODE(string),
5826 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005827
5828 Py_DECREF(string);
5829 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005830}
5831
Tim Petersced69f82003-09-16 20:30:58 +00005832static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005833PyObject *split(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005834 PyUnicodeObject *substring,
5835 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005836{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005837 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005838 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005839
Guido van Rossumd57fd912000-03-10 22:53:23 +00005840 if (substring == NULL)
Antoine Pitrou64672132010-01-13 07:55:48 +00005841 return stringlib_split_whitespace(
5842 (PyObject*) self, self->str, self->length, maxcount
5843 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005844
Antoine Pitrou64672132010-01-13 07:55:48 +00005845 return stringlib_split(
5846 (PyObject*) self, self->str, self->length,
5847 substring->str, substring->length,
5848 maxcount
5849 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005850}
5851
Tim Petersced69f82003-09-16 20:30:58 +00005852static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005853PyObject *rsplit(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005854 PyUnicodeObject *substring,
5855 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005856{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005857 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005858 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005859
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005860 if (substring == NULL)
Antoine Pitrou64672132010-01-13 07:55:48 +00005861 return stringlib_rsplit_whitespace(
5862 (PyObject*) self, self->str, self->length, maxcount
5863 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005864
Antoine Pitrou64672132010-01-13 07:55:48 +00005865 return stringlib_rsplit(
5866 (PyObject*) self, self->str, self->length,
5867 substring->str, substring->length,
5868 maxcount
5869 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005870}
5871
5872static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005873PyObject *replace(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005874 PyUnicodeObject *str1,
5875 PyUnicodeObject *str2,
5876 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005877{
5878 PyUnicodeObject *u;
5879
5880 if (maxcount < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005881 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrou64672132010-01-13 07:55:48 +00005882 else if (maxcount == 0 || self->length == 0)
5883 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005884
Fredrik Lundh347ee272006-05-24 16:35:18 +00005885 if (str1->length == str2->length) {
Antoine Pitrou5c767c22010-01-13 08:55:20 +00005886 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005887 /* same length */
Antoine Pitrou64672132010-01-13 07:55:48 +00005888 if (str1->length == 0)
5889 goto nothing;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005890 if (str1->length == 1) {
5891 /* replace characters */
5892 Py_UNICODE u1, u2;
5893 if (!findchar(self->str, self->length, str1->str[0]))
5894 goto nothing;
5895 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5896 if (!u)
5897 return NULL;
5898 Py_UNICODE_COPY(u->str, self->str, self->length);
5899 u1 = str1->str[0];
5900 u2 = str2->str[0];
5901 for (i = 0; i < u->length; i++)
5902 if (u->str[i] == u1) {
5903 if (--maxcount < 0)
5904 break;
5905 u->str[i] = u2;
5906 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005907 } else {
Antoine Pitrou64672132010-01-13 07:55:48 +00005908 i = stringlib_find(
5909 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00005910 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005911 if (i < 0)
5912 goto nothing;
5913 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5914 if (!u)
5915 return NULL;
5916 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrou64672132010-01-13 07:55:48 +00005917
5918 /* change everything in-place, starting with this one */
5919 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5920 i += str1->length;
5921
5922 while ( --maxcount > 0) {
5923 i = stringlib_find(self->str+i, self->length-i,
5924 str1->str, str1->length,
5925 i);
5926 if (i == -1)
5927 break;
5928 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5929 i += str1->length;
5930 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005931 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005932 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005933
Brett Cannona7f13ee2010-05-04 01:16:51 +00005934 Py_ssize_t n, i, j;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005935 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005936 Py_UNICODE *p;
5937
5938 /* replace strings */
Antoine Pitrou64672132010-01-13 07:55:48 +00005939 n = stringlib_count(self->str, self->length, str1->str, str1->length,
5940 maxcount);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005941 if (n == 0)
5942 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005943 /* new_size = self->length + n * (str2->length - str1->length)); */
5944 delta = (str2->length - str1->length);
5945 if (delta == 0) {
5946 new_size = self->length;
5947 } else {
5948 product = n * (str2->length - str1->length);
5949 if ((product / (str2->length - str1->length)) != n) {
5950 PyErr_SetString(PyExc_OverflowError,
5951 "replace string is too long");
5952 return NULL;
5953 }
5954 new_size = self->length + product;
5955 if (new_size < 0) {
5956 PyErr_SetString(PyExc_OverflowError,
5957 "replace string is too long");
5958 return NULL;
5959 }
5960 }
5961 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005962 if (!u)
5963 return NULL;
5964 i = 0;
5965 p = u->str;
5966 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005967 while (n-- > 0) {
5968 /* look for next match */
Antoine Pitrou64672132010-01-13 07:55:48 +00005969 j = stringlib_find(self->str+i, self->length-i,
5970 str1->str, str1->length,
5971 i);
5972 if (j == -1)
5973 break;
5974 else if (j > i) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005975 /* copy unchanged part [i:j] */
5976 Py_UNICODE_COPY(p, self->str+i, j-i);
5977 p += j - i;
5978 }
5979 /* copy substitution string */
5980 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005981 Py_UNICODE_COPY(p, str2->str, str2->length);
5982 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005983 }
5984 i = j + str1->length;
5985 }
5986 if (i < self->length)
5987 /* copy tail [i:] */
5988 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005989 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005990 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00005991 while (n > 0) {
5992 Py_UNICODE_COPY(p, str2->str, str2->length);
5993 p += str2->length;
5994 if (--n <= 0)
5995 break;
5996 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005997 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00005998 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999 }
6000 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006001 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00006002
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006003 nothing:
Fredrik Lundh347ee272006-05-24 16:35:18 +00006004 /* nothing to replace; return original string (when possible) */
6005 if (PyUnicode_CheckExact(self)) {
6006 Py_INCREF(self);
6007 return (PyObject *) self;
6008 }
6009 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006010}
6011
6012/* --- Unicode Object Methods --------------------------------------------- */
6013
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006014PyDoc_STRVAR(title__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006015 "S.title() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006016\n\
6017Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006018characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006019
6020static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006021unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006022{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023 return fixup(self, fixtitle);
6024}
6025
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006026PyDoc_STRVAR(capitalize__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006027 "S.capitalize() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028\n\
6029Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumaran5261b102010-07-05 12:04:07 +00006030have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006031
6032static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006033unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006034{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006035 return fixup(self, fixcapitalize);
6036}
6037
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled code (compiled out by the surrounding #if 0).
   Splits `self` with split(self, NULL, -1), runs fixup(..., fixcapitalize)
   on every list item, and joins the list with PyUnicode_Join(NULL, ...).
   Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list slot */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized = fixup(
            (PyUnicodeObject *)PyList_GET_ITEM(words, idx),
            fixcapitalize);

        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
6075
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006076/* Argument converter. Coerces to a single unicode character */
6077
6078static int
6079convert_uc(PyObject *obj, void *addr)
6080{
Benjamin Peterson857ce152009-01-31 16:29:18 +00006081 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6082 PyObject *uniobj;
6083 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006084
Benjamin Peterson857ce152009-01-31 16:29:18 +00006085 uniobj = PyUnicode_FromObject(obj);
6086 if (uniobj == NULL) {
6087 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006088 "The fill character cannot be converted to Unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00006089 return 0;
6090 }
6091 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6092 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006093 "The fill character must be exactly one character long");
Benjamin Peterson857ce152009-01-31 16:29:18 +00006094 Py_DECREF(uniobj);
6095 return 0;
6096 }
6097 unistr = PyUnicode_AS_UNICODE(uniobj);
6098 *fillcharloc = unistr[0];
6099 Py_DECREF(uniobj);
6100 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006101}
6102
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006103PyDoc_STRVAR(center__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006104 "S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006105\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006106Return S centered in a Unicode string of length width. Padding is\n\
6107done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006108
6109static PyObject *
6110unicode_center(PyUnicodeObject *self, PyObject *args)
6111{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006112 Py_ssize_t marg, left;
6113 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006114 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006115
Thomas Woutersde017742006-02-16 19:34:37 +00006116 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006117 return NULL;
6118
Tim Peters7a29bd52001-09-12 03:03:31 +00006119 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006120 Py_INCREF(self);
6121 return (PyObject*) self;
6122 }
6123
6124 marg = width - self->length;
6125 left = marg / 2 + (marg & width & 1);
6126
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006127 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006128}
6129
Marc-André Lemburge5034372000-08-08 08:04:29 +00006130#if 0
6131
6132/* This code should go into some future Unicode collation support
6133 module. The basic comparison should compare ordinals on a naive
Georg Brandl18187e22009-06-06 18:21:58 +00006134 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006135
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006136/* speedy UTF-16 code point order comparison */
6137/* gleaned from: */
6138/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6139
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006140static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006141{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006142 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006143 0, 0, 0, 0, 0, 0, 0, 0,
6144 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006145 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006146};
6147
Guido van Rossumd57fd912000-03-10 22:53:23 +00006148static int
6149unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6150{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006151 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006152
Guido van Rossumd57fd912000-03-10 22:53:23 +00006153 Py_UNICODE *s1 = str1->str;
6154 Py_UNICODE *s2 = str2->str;
6155
6156 len1 = str1->length;
6157 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006158
Guido van Rossumd57fd912000-03-10 22:53:23 +00006159 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006160 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006161
6162 c1 = *s1++;
6163 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006164
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006165 if (c1 > (1<<11) * 26)
6166 c1 += utf16Fixup[c1>>11];
6167 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006168 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006169 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006170
6171 if (c1 != c2)
6172 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006173
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006174 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006175 }
6176
6177 return (len1 < len2) ? -1 : (len1 != len2);
6178}
6179
Marc-André Lemburge5034372000-08-08 08:04:29 +00006180#else
6181
6182static int
6183unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6184{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006185 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006186
6187 Py_UNICODE *s1 = str1->str;
6188 Py_UNICODE *s2 = str2->str;
6189
6190 len1 = str1->length;
6191 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006192
Marc-André Lemburge5034372000-08-08 08:04:29 +00006193 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006194 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006195
Fredrik Lundh45714e92001-06-26 16:39:36 +00006196 c1 = *s1++;
6197 c2 = *s2++;
6198
6199 if (c1 != c2)
6200 return (c1 < c2) ? -1 : 1;
6201
Marc-André Lemburge5034372000-08-08 08:04:29 +00006202 len1--; len2--;
6203 }
6204
6205 return (len1 < len2) ? -1 : (len1 != len2);
6206}
6207
6208#endif
6209
Guido van Rossumd57fd912000-03-10 22:53:23 +00006210int PyUnicode_Compare(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006211 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006212{
6213 PyUnicodeObject *u = NULL, *v = NULL;
6214 int result;
6215
6216 /* Coerce the two arguments */
6217 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6218 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006219 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006220 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6221 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006222 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006223
Thomas Wouters7e474022000-07-16 12:04:32 +00006224 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006225 if (v == u) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006226 Py_DECREF(u);
6227 Py_DECREF(v);
6228 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006229 }
6230
6231 result = unicode_compare(u, v);
6232
6233 Py_DECREF(u);
6234 Py_DECREF(v);
6235 return result;
6236
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006237 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006238 Py_XDECREF(u);
6239 Py_XDECREF(v);
6240 return -1;
6241}
6242
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006243PyObject *PyUnicode_RichCompare(PyObject *left,
6244 PyObject *right,
6245 int op)
6246{
6247 int result;
6248
6249 result = PyUnicode_Compare(left, right);
6250 if (result == -1 && PyErr_Occurred())
6251 goto onError;
6252
6253 /* Convert the return value to a Boolean */
6254 switch (op) {
6255 case Py_EQ:
6256 result = (result == 0);
6257 break;
6258 case Py_NE:
6259 result = (result != 0);
6260 break;
6261 case Py_LE:
6262 result = (result <= 0);
6263 break;
6264 case Py_GE:
6265 result = (result >= 0);
6266 break;
6267 case Py_LT:
6268 result = (result == -1);
6269 break;
6270 case Py_GT:
6271 result = (result == 1);
6272 break;
6273 }
6274 return PyBool_FromLong(result);
6275
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006276 onError:
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006277
6278 /* Standard case
6279
6280 Type errors mean that PyUnicode_FromObject() could not convert
6281 one of the arguments (usually the right hand side) to Unicode,
6282 ie. we can't handle the comparison request. However, it is
6283 possible that the other object knows a comparison method, which
6284 is why we return Py_NotImplemented to give the other object a
6285 chance.
6286
6287 */
6288 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6289 PyErr_Clear();
6290 Py_INCREF(Py_NotImplemented);
6291 return Py_NotImplemented;
6292 }
6293 if (op != Py_EQ && op != Py_NE)
6294 return NULL;
6295
6296 /* Equality comparison.
6297
6298 This is a special case: we silence any PyExc_UnicodeDecodeError
6299 and instead turn it into a PyErr_UnicodeWarning.
6300
6301 */
6302 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6303 return NULL;
6304 PyErr_Clear();
Benjamin Peterson857ce152009-01-31 16:29:18 +00006305 if (PyErr_Warn(PyExc_UnicodeWarning,
6306 (op == Py_EQ) ?
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006307 "Unicode equal comparison "
6308 "failed to convert both arguments to Unicode - "
6309 "interpreting them as being unequal" :
6310 "Unicode unequal comparison "
6311 "failed to convert both arguments to Unicode - "
6312 "interpreting them as being unequal"
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006313 ) < 0)
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006314 return NULL;
6315 result = (op == Py_NE);
6316 return PyBool_FromLong(result);
6317}
6318
Guido van Rossum403d68b2000-03-13 15:55:09 +00006319int PyUnicode_Contains(PyObject *container,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006320 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006321{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006322 PyObject *str, *sub;
6323 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006324
6325 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006326 sub = PyUnicode_FromObject(element);
6327 if (!sub) {
Fredrik Lundh833bf942006-05-23 10:12:21 +00006328 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006329 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00006330
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006331 str = PyUnicode_FromObject(container);
6332 if (!str) {
6333 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00006334 return -1;
6335 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00006336
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006337 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00006338
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006339 Py_DECREF(str);
6340 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00006341
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006342 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006343}
6344
Guido van Rossumd57fd912000-03-10 22:53:23 +00006345/* Concat to string or Unicode object giving a new Unicode object. */
6346
6347PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006348 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006349{
6350 PyUnicodeObject *u = NULL, *v = NULL, *w;
6351
6352 /* Coerce the two arguments */
6353 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6354 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006355 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006356 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6357 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006358 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006359
6360 /* Shortcuts */
6361 if (v == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006362 Py_DECREF(v);
6363 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006364 }
6365 if (u == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006366 Py_DECREF(u);
6367 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006368 }
6369
6370 /* Concat the two Unicode strings */
6371 w = _PyUnicode_New(u->length + v->length);
6372 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006373 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006374 Py_UNICODE_COPY(w->str, u->str, u->length);
6375 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6376
6377 Py_DECREF(u);
6378 Py_DECREF(v);
6379 return (PyObject *)w;
6380
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006381 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006382 Py_XDECREF(u);
6383 Py_XDECREF(v);
6384 return NULL;
6385}
6386
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006387PyDoc_STRVAR(count__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006388 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006389\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00006390Return the number of non-overlapping occurrences of substring sub in\n\
6391Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006392interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006393
6394static PyObject *
6395unicode_count(PyUnicodeObject *self, PyObject *args)
6396{
6397 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006398 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006399 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006400 PyObject *result;
6401
Jesus Cea44e81682011-04-20 16:39:15 +02006402 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
6403 &start, &end))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006404 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006405
Antoine Pitrou64672132010-01-13 07:55:48 +00006406 ADJUST_INDICES(start, end, self->length);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006407 result = PyInt_FromSsize_t(
6408 stringlib_count(self->str + start, end - start,
Antoine Pitrou64672132010-01-13 07:55:48 +00006409 substring->str, substring->length,
6410 PY_SSIZE_T_MAX)
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006411 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006412
6413 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006414
Guido van Rossumd57fd912000-03-10 22:53:23 +00006415 return result;
6416}
6417
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006418PyDoc_STRVAR(encode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006419 "S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006420\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006421Encodes S using the codec registered for encoding. encoding defaults\n\
6422to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006423handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006424a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6425'xmlcharrefreplace' as well as any other name registered with\n\
6426codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006427
6428static PyObject *
Benjamin Peterson332d7212009-09-18 21:14:55 +00006429unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006430{
Benjamin Peterson332d7212009-09-18 21:14:55 +00006431 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00006432 char *encoding = NULL;
6433 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006434 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006435
Benjamin Peterson332d7212009-09-18 21:14:55 +00006436 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
6437 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006438 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006439 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006440 if (v == NULL)
6441 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006442 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006443 PyErr_Format(PyExc_TypeError,
6444 "encoder did not return a string/unicode object "
6445 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006446 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006447 Py_DECREF(v);
6448 return NULL;
6449 }
6450 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006451
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006452 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006453 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006454}
6455
6456PyDoc_STRVAR(decode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006457 "S.decode([encoding[,errors]]) -> string or unicode\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006458\n\
6459Decodes S using the codec registered for encoding. encoding defaults\n\
6460to the default encoding. errors may be given to set a different error\n\
6461handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6462a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
Éric Araujoa4c81b02012-02-20 02:07:31 +01006463as well as any other name registered with codecs.register_error that is\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006464able to handle UnicodeDecodeErrors.");
6465
6466static PyObject *
Benjamin Peterson332d7212009-09-18 21:14:55 +00006467unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006468{
Benjamin Peterson332d7212009-09-18 21:14:55 +00006469 static char *kwlist[] = {"encoding", "errors", 0};
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006470 char *encoding = NULL;
6471 char *errors = NULL;
6472 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006473
Benjamin Peterson332d7212009-09-18 21:14:55 +00006474 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",
6475 kwlist, &encoding, &errors))
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006476 return NULL;
6477 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006478 if (v == NULL)
6479 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006480 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006481 PyErr_Format(PyExc_TypeError,
6482 "decoder did not return a string/unicode object "
6483 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006484 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006485 Py_DECREF(v);
6486 return NULL;
6487 }
6488 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006489
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006490 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006491 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006492}
6493
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006494PyDoc_STRVAR(expandtabs__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006495 "S.expandtabs([tabsize]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006496\n\
6497Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006498If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006499
6500static PyObject*
6501unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6502{
6503 Py_UNICODE *e;
6504 Py_UNICODE *p;
6505 Py_UNICODE *q;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006506 Py_UNICODE *qe;
6507 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006508 PyUnicodeObject *u;
6509 int tabsize = 8;
6510
6511 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006512 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006513
Thomas Wouters7e474022000-07-16 12:04:32 +00006514 /* First pass: determine size of output string */
Guido van Rossum5bdff602008-03-11 21:18:06 +00006515 i = 0; /* chars up to and including most recent \n or \r */
6516 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6517 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006518 for (p = self->str; p < e; p++)
6519 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006520 if (tabsize > 0) {
6521 incr = tabsize - (j % tabsize); /* cannot overflow */
6522 if (j > PY_SSIZE_T_MAX - incr)
6523 goto overflow1;
6524 j += incr;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006525 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006526 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006527 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006528 if (j > PY_SSIZE_T_MAX - 1)
6529 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006530 j++;
6531 if (*p == '\n' || *p == '\r') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006532 if (i > PY_SSIZE_T_MAX - j)
6533 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006534 i += j;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006535 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006536 }
6537 }
6538
Guido van Rossum5bdff602008-03-11 21:18:06 +00006539 if (i > PY_SSIZE_T_MAX - j)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006540 goto overflow1;
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006541
Guido van Rossumd57fd912000-03-10 22:53:23 +00006542 /* Second pass: create output string and fill it */
6543 u = _PyUnicode_New(i + j);
6544 if (!u)
6545 return NULL;
6546
Guido van Rossum5bdff602008-03-11 21:18:06 +00006547 j = 0; /* same as in first pass */
6548 q = u->str; /* next output char */
6549 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006550
6551 for (p = self->str; p < e; p++)
6552 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006553 if (tabsize > 0) {
6554 i = tabsize - (j % tabsize);
6555 j += i;
6556 while (i--) {
6557 if (q >= qe)
6558 goto overflow2;
6559 *q++ = ' ';
Guido van Rossum5bdff602008-03-11 21:18:06 +00006560 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006561 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00006562 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006563 else {
6564 if (q >= qe)
6565 goto overflow2;
6566 *q++ = *p;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006567 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006568 if (*p == '\n' || *p == '\r')
6569 j = 0;
6570 }
6571
6572 return (PyObject*) u;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006573
6574 overflow2:
6575 Py_DECREF(u);
6576 overflow1:
6577 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6578 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006579}
6580
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006581PyDoc_STRVAR(find__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006582 "S.find(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006583\n\
6584Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran5e3a19d2011-07-27 23:36:51 +08006585such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006586arguments start and end are interpreted as in slice notation.\n\
6587\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006588Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006589
6590static PyObject *
6591unicode_find(PyUnicodeObject *self, PyObject *args)
6592{
Jesus Cea44e81682011-04-20 16:39:15 +02006593 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006594 Py_ssize_t start;
6595 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006596 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006597
Jesus Cea44e81682011-04-20 16:39:15 +02006598 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
6599 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006600 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006601
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006602 result = stringlib_find_slice(
6603 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6604 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6605 start, end
6606 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006607
6608 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006609
6610 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006611}
6612
6613static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006614unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006615{
6616 if (index < 0 || index >= self->length) {
6617 PyErr_SetString(PyExc_IndexError, "string index out of range");
6618 return NULL;
6619 }
6620
6621 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6622}
6623
6624static long
6625unicode_hash(PyUnicodeObject *self)
6626{
Fredrik Lundhdde61642000-07-10 18:27:47 +00006627 /* Since Unicode objects compare equal to their ASCII string
6628 counterparts, they should use the individual character values
6629 as basis for their hash value. This is needed to assure that
6630 strings and Unicode objects behave in the same way as
6631 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006632
Martin v. Löwis18e16552006-02-15 17:27:45 +00006633 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006634 register Py_UNICODE *p;
6635 register long x;
6636
Benjamin Petersonf51c3842012-04-09 14:53:07 -04006637#ifdef Py_DEBUG
Benjamin Peterson26da9202012-02-21 11:08:50 -05006638 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf51c3842012-04-09 14:53:07 -04006639#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006640 if (self->hash != -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006641 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006642 len = PyUnicode_GET_SIZE(self);
Barry Warsaw1e13eb02012-02-20 20:42:21 -05006643 /*
6644 We make the hash of the empty string be 0, rather than using
6645 (prefix ^ suffix), since this slightly obfuscates the hash secret
6646 */
6647 if (len == 0) {
6648 self->hash = 0;
6649 return 0;
6650 }
Fredrik Lundhdde61642000-07-10 18:27:47 +00006651 p = PyUnicode_AS_UNICODE(self);
Barry Warsaw1e13eb02012-02-20 20:42:21 -05006652 x = _Py_HashSecret.prefix;
6653 x ^= *p << 7;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006654 while (--len >= 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006655 x = (1000003*x) ^ *p++;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006656 x ^= PyUnicode_GET_SIZE(self);
Barry Warsaw1e13eb02012-02-20 20:42:21 -05006657 x ^= _Py_HashSecret.suffix;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006658 if (x == -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006659 x = -2;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006660 self->hash = x;
6661 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006662}
6663
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006664PyDoc_STRVAR(index__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006665 "S.index(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006666\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006667Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006668
6669static PyObject *
6670unicode_index(PyUnicodeObject *self, PyObject *args)
6671{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006672 Py_ssize_t result;
Jesus Cea44e81682011-04-20 16:39:15 +02006673 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006674 Py_ssize_t start;
6675 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006676
Jesus Cea44e81682011-04-20 16:39:15 +02006677 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
6678 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006679 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006680
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006681 result = stringlib_find_slice(
6682 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6683 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6684 start, end
6685 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006686
6687 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006688
Guido van Rossumd57fd912000-03-10 22:53:23 +00006689 if (result < 0) {
6690 PyErr_SetString(PyExc_ValueError, "substring not found");
6691 return NULL;
6692 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006693
Martin v. Löwis18e16552006-02-15 17:27:45 +00006694 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006695}
6696
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006697PyDoc_STRVAR(islower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006698 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006699\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006700Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006701at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006702
6703static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006704unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006705{
6706 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6707 register const Py_UNICODE *e;
6708 int cased;
6709
Guido van Rossumd57fd912000-03-10 22:53:23 +00006710 /* Shortcut for single character strings */
6711 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006712 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006713
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006714 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006715 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006716 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006717
Guido van Rossumd57fd912000-03-10 22:53:23 +00006718 e = p + PyUnicode_GET_SIZE(self);
6719 cased = 0;
6720 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006721 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006722
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006723 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6724 return PyBool_FromLong(0);
6725 else if (!cased && Py_UNICODE_ISLOWER(ch))
6726 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006727 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006728 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006729}
6730
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006731PyDoc_STRVAR(isupper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006732 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006733\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006734Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006735at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006736
6737static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006738unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006739{
6740 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6741 register const Py_UNICODE *e;
6742 int cased;
6743
Guido van Rossumd57fd912000-03-10 22:53:23 +00006744 /* Shortcut for single character strings */
6745 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006746 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006747
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006748 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006749 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006750 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006751
Guido van Rossumd57fd912000-03-10 22:53:23 +00006752 e = p + PyUnicode_GET_SIZE(self);
6753 cased = 0;
6754 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006755 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006756
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006757 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6758 return PyBool_FromLong(0);
6759 else if (!cased && Py_UNICODE_ISUPPER(ch))
6760 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006761 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006762 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006763}
6764
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006765PyDoc_STRVAR(istitle__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006766 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006767\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006768Return True if S is a titlecased string and there is at least one\n\
6769character in S, i.e. upper- and titlecase characters may only\n\
6770follow uncased characters and lowercase characters only cased ones.\n\
6771Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006772
6773static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006774unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006775{
6776 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6777 register const Py_UNICODE *e;
6778 int cased, previous_is_cased;
6779
Guido van Rossumd57fd912000-03-10 22:53:23 +00006780 /* Shortcut for single character strings */
6781 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006782 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6783 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006784
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006785 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006786 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006787 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006788
Guido van Rossumd57fd912000-03-10 22:53:23 +00006789 e = p + PyUnicode_GET_SIZE(self);
6790 cased = 0;
6791 previous_is_cased = 0;
6792 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006793 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006794
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006795 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6796 if (previous_is_cased)
6797 return PyBool_FromLong(0);
6798 previous_is_cased = 1;
6799 cased = 1;
6800 }
6801 else if (Py_UNICODE_ISLOWER(ch)) {
6802 if (!previous_is_cased)
6803 return PyBool_FromLong(0);
6804 previous_is_cased = 1;
6805 cased = 1;
6806 }
6807 else
6808 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006809 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006810 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006811}
6812
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006813PyDoc_STRVAR(isspace__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006814 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006815\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006816Return True if all characters in S are whitespace\n\
6817and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006818
6819static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006820unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006821{
6822 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6823 register const Py_UNICODE *e;
6824
Guido van Rossumd57fd912000-03-10 22:53:23 +00006825 /* Shortcut for single character strings */
6826 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006827 Py_UNICODE_ISSPACE(*p))
6828 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006829
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006830 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006831 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006832 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006833
Guido van Rossumd57fd912000-03-10 22:53:23 +00006834 e = p + PyUnicode_GET_SIZE(self);
6835 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006836 if (!Py_UNICODE_ISSPACE(*p))
6837 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006838 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006839 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006840}
6841
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006842PyDoc_STRVAR(isalpha__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006843 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006844\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006845Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006846and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006847
6848static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006849unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006850{
6851 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6852 register const Py_UNICODE *e;
6853
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006854 /* Shortcut for single character strings */
6855 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006856 Py_UNICODE_ISALPHA(*p))
6857 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006858
6859 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006860 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006861 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006862
6863 e = p + PyUnicode_GET_SIZE(self);
6864 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006865 if (!Py_UNICODE_ISALPHA(*p))
6866 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006867 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006868 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006869}
6870
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006871PyDoc_STRVAR(isalnum__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006872 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006873\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006874Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006875and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006876
6877static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006878unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006879{
6880 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6881 register const Py_UNICODE *e;
6882
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006883 /* Shortcut for single character strings */
6884 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006885 Py_UNICODE_ISALNUM(*p))
6886 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006887
6888 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006889 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006890 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006891
6892 e = p + PyUnicode_GET_SIZE(self);
6893 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006894 if (!Py_UNICODE_ISALNUM(*p))
6895 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006896 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006897 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006898}
6899
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006900PyDoc_STRVAR(isdecimal__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006901 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006902\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006903Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006904False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006905
6906static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006907unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006908{
6909 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6910 register const Py_UNICODE *e;
6911
Guido van Rossumd57fd912000-03-10 22:53:23 +00006912 /* Shortcut for single character strings */
6913 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006914 Py_UNICODE_ISDECIMAL(*p))
6915 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006916
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006917 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006918 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006919 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006920
Guido van Rossumd57fd912000-03-10 22:53:23 +00006921 e = p + PyUnicode_GET_SIZE(self);
6922 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006923 if (!Py_UNICODE_ISDECIMAL(*p))
6924 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006925 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006926 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006927}
6928
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006929PyDoc_STRVAR(isdigit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006930 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006931\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006932Return True if all characters in S are digits\n\
6933and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006934
6935static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006936unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006937{
6938 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6939 register const Py_UNICODE *e;
6940
Guido van Rossumd57fd912000-03-10 22:53:23 +00006941 /* Shortcut for single character strings */
6942 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006943 Py_UNICODE_ISDIGIT(*p))
6944 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006945
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006946 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006947 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006948 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006949
Guido van Rossumd57fd912000-03-10 22:53:23 +00006950 e = p + PyUnicode_GET_SIZE(self);
6951 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006952 if (!Py_UNICODE_ISDIGIT(*p))
6953 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006954 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006955 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006956}
6957
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006958PyDoc_STRVAR(isnumeric__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006959 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006960\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006961Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006962False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006963
6964static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006965unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006966{
6967 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6968 register const Py_UNICODE *e;
6969
Guido van Rossumd57fd912000-03-10 22:53:23 +00006970 /* Shortcut for single character strings */
6971 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006972 Py_UNICODE_ISNUMERIC(*p))
6973 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006974
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006975 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006976 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006977 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006978
Guido van Rossumd57fd912000-03-10 22:53:23 +00006979 e = p + PyUnicode_GET_SIZE(self);
6980 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006981 if (!Py_UNICODE_ISNUMERIC(*p))
6982 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006983 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006984 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006985}
6986
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006987PyDoc_STRVAR(join__doc__,
Georg Brandl9b4e5822009-10-14 18:48:32 +00006988 "S.join(iterable) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006989\n\
6990Return a string which is the concatenation of the strings in the\n\
Georg Brandl9b4e5822009-10-14 18:48:32 +00006991iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006992
6993static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006994unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006995{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006996 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006997}
6998
Martin v. Löwis18e16552006-02-15 17:27:45 +00006999static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007000unicode_length(PyUnicodeObject *self)
7001{
7002 return self->length;
7003}
7004
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007005PyDoc_STRVAR(ljust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007006 "S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007007\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007008Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007009done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007010
7011static PyObject *
7012unicode_ljust(PyUnicodeObject *self, PyObject *args)
7013{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007014 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007015 Py_UNICODE fillchar = ' ';
7016
Martin v. Löwis412fb672006-04-13 06:34:32 +00007017 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007018 return NULL;
7019
Tim Peters7a29bd52001-09-12 03:03:31 +00007020 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007021 Py_INCREF(self);
7022 return (PyObject*) self;
7023 }
7024
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007025 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007026}
7027
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007028PyDoc_STRVAR(lower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007029 "S.lower() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007030\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007031Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007032
7033static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007034unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007035{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007036 return fixup(self, fixlower);
7037}
7038
/* Strip modes shared by the strip helpers; also used as indices into
   stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, one per strip mode (indexed by the
   constants above). */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: the format string with its leading
   "|O:" (three characters) skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
7047
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007048/* externally visible for str.strip(unicode) */
7049PyObject *
7050_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7051{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007052 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7053 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7054 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7055 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7056 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007057
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007058 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007059
Benjamin Peterson857ce152009-01-31 16:29:18 +00007060 i = 0;
7061 if (striptype != RIGHTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007062 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7063 i++;
7064 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00007065 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007066
Benjamin Peterson857ce152009-01-31 16:29:18 +00007067 j = len;
7068 if (striptype != LEFTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007069 do {
7070 j--;
7071 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7072 j++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007073 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007074
Benjamin Peterson857ce152009-01-31 16:29:18 +00007075 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007076 Py_INCREF(self);
7077 return (PyObject*)self;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007078 }
7079 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007080 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007081}
7082
Guido van Rossumd57fd912000-03-10 22:53:23 +00007083
7084static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007085do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007086{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007087 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7088 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007089
Benjamin Peterson857ce152009-01-31 16:29:18 +00007090 i = 0;
7091 if (striptype != RIGHTSTRIP) {
7092 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7093 i++;
7094 }
7095 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007096
Benjamin Peterson857ce152009-01-31 16:29:18 +00007097 j = len;
7098 if (striptype != LEFTSTRIP) {
7099 do {
7100 j--;
7101 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7102 j++;
7103 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007104
Benjamin Peterson857ce152009-01-31 16:29:18 +00007105 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7106 Py_INCREF(self);
7107 return (PyObject*)self;
7108 }
7109 else
7110 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007111}
7112
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007113
7114static PyObject *
7115do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7116{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007117 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007118
Benjamin Peterson857ce152009-01-31 16:29:18 +00007119 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7120 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007121
Benjamin Peterson857ce152009-01-31 16:29:18 +00007122 if (sep != NULL && sep != Py_None) {
7123 if (PyUnicode_Check(sep))
7124 return _PyUnicode_XStrip(self, striptype, sep);
7125 else if (PyString_Check(sep)) {
7126 PyObject *res;
7127 sep = PyUnicode_FromObject(sep);
7128 if (sep==NULL)
7129 return NULL;
7130 res = _PyUnicode_XStrip(self, striptype, sep);
7131 Py_DECREF(sep);
7132 return res;
7133 }
7134 else {
7135 PyErr_Format(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007136 "%s arg must be None, unicode or str",
7137 STRIPNAME(striptype));
Benjamin Peterson857ce152009-01-31 16:29:18 +00007138 return NULL;
7139 }
7140 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007141
Benjamin Peterson857ce152009-01-31 16:29:18 +00007142 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007143}
7144
7145
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007146PyDoc_STRVAR(strip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007147 "S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007148\n\
7149Return a copy of the string S with leading and trailing\n\
7150whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007151If chars is given and not None, remove characters in chars instead.\n\
7152If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007153
7154static PyObject *
7155unicode_strip(PyUnicodeObject *self, PyObject *args)
7156{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007157 if (PyTuple_GET_SIZE(args) == 0)
7158 return do_strip(self, BOTHSTRIP); /* Common case */
7159 else
7160 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007161}
7162
7163
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007164PyDoc_STRVAR(lstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007165 "S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007166\n\
7167Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007168If chars is given and not None, remove characters in chars instead.\n\
7169If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007170
7171static PyObject *
7172unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7173{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007174 if (PyTuple_GET_SIZE(args) == 0)
7175 return do_strip(self, LEFTSTRIP); /* Common case */
7176 else
7177 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007178}
7179
7180
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007181PyDoc_STRVAR(rstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007182 "S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007183\n\
7184Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007185If chars is given and not None, remove characters in chars instead.\n\
7186If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007187
7188static PyObject *
7189unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7190{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007191 if (PyTuple_GET_SIZE(args) == 0)
7192 return do_strip(self, RIGHTSTRIP); /* Common case */
7193 else
7194 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007195}
7196
7197
Guido van Rossumd57fd912000-03-10 22:53:23 +00007198static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007199unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007200{
7201 PyUnicodeObject *u;
7202 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007203 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007204 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007205
7206 if (len < 0)
7207 len = 0;
7208
Tim Peters7a29bd52001-09-12 03:03:31 +00007209 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007210 /* no repeat, return original string */
7211 Py_INCREF(str);
7212 return (PyObject*) str;
7213 }
Tim Peters8f422462000-09-09 06:13:41 +00007214
7215 /* ensure # of chars needed doesn't overflow int and # of bytes
7216 * needed doesn't overflow size_t
7217 */
7218 nchars = len * str->length;
7219 if (len && nchars / len != str->length) {
7220 PyErr_SetString(PyExc_OverflowError,
7221 "repeated string is too long");
7222 return NULL;
7223 }
7224 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7225 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7226 PyErr_SetString(PyExc_OverflowError,
7227 "repeated string is too long");
7228 return NULL;
7229 }
7230 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007231 if (!u)
7232 return NULL;
7233
7234 p = u->str;
7235
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007236 if (str->length == 1 && len > 0) {
7237 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007238 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007239 Py_ssize_t done = 0; /* number of characters copied this far */
7240 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007241 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007242 done = str->length;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007243 }
7244 while (done < nchars) {
Neal Norwitz4677fbf72008-03-25 04:18:18 +00007245 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007246 Py_UNICODE_COPY(p+done, p, n);
7247 done += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007248 }
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007249 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007250
7251 return (PyObject*) u;
7252}
7253
7254PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007255 PyObject *subobj,
7256 PyObject *replobj,
7257 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007258{
7259 PyObject *self;
7260 PyObject *str1;
7261 PyObject *str2;
7262 PyObject *result;
7263
7264 self = PyUnicode_FromObject(obj);
7265 if (self == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007266 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007267 str1 = PyUnicode_FromObject(subobj);
7268 if (str1 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007269 Py_DECREF(self);
7270 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007271 }
7272 str2 = PyUnicode_FromObject(replobj);
7273 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007274 Py_DECREF(self);
7275 Py_DECREF(str1);
7276 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007277 }
Tim Petersced69f82003-09-16 20:30:58 +00007278 result = replace((PyUnicodeObject *)self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007279 (PyUnicodeObject *)str1,
7280 (PyUnicodeObject *)str2,
7281 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007282 Py_DECREF(self);
7283 Py_DECREF(str1);
7284 Py_DECREF(str2);
7285 return result;
7286}
7287
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007288PyDoc_STRVAR(replace__doc__,
Ezio Melotti2f06b782010-06-26 18:44:42 +00007289 "S.replace(old, new[, count]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007290\n\
7291Return a copy of S with all occurrences of substring\n\
Georg Brandl30fadc12008-05-30 07:54:16 +00007292old replaced by new. If the optional argument count is\n\
7293given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007294
7295static PyObject*
7296unicode_replace(PyUnicodeObject *self, PyObject *args)
7297{
7298 PyUnicodeObject *str1;
7299 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007300 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007301 PyObject *result;
7302
Martin v. Löwis18e16552006-02-15 17:27:45 +00007303 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007304 return NULL;
7305 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7306 if (str1 == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007307 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007308 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007309 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007310 Py_DECREF(str1);
7311 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007312 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007313
7314 result = replace(self, str1, str2, maxcount);
7315
7316 Py_DECREF(str1);
7317 Py_DECREF(str2);
7318 return result;
7319}
7320
7321static
7322PyObject *unicode_repr(PyObject *unicode)
7323{
7324 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007325 PyUnicode_GET_SIZE(unicode),
7326 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007327}
7328
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007329PyDoc_STRVAR(rfind__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007330 "S.rfind(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007331\n\
7332Return the highest index in S where substring sub is found,\n\
Senthil Kumaran5e3a19d2011-07-27 23:36:51 +08007333such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007334arguments start and end are interpreted as in slice notation.\n\
7335\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007336Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007337
7338static PyObject *
7339unicode_rfind(PyUnicodeObject *self, PyObject *args)
7340{
Jesus Cea44e81682011-04-20 16:39:15 +02007341 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007342 Py_ssize_t start;
7343 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007344 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007345
Jesus Cea44e81682011-04-20 16:39:15 +02007346 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
7347 &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007348 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007349
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007350 result = stringlib_rfind_slice(
7351 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7352 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7353 start, end
7354 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007355
7356 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007357
7358 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007359}
7360
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007361PyDoc_STRVAR(rindex__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007362 "S.rindex(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007363\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007364Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007365
7366static PyObject *
7367unicode_rindex(PyUnicodeObject *self, PyObject *args)
7368{
Jesus Cea44e81682011-04-20 16:39:15 +02007369 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007370 Py_ssize_t start;
7371 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007372 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007373
Jesus Cea44e81682011-04-20 16:39:15 +02007374 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
7375 &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007376 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007377
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007378 result = stringlib_rfind_slice(
7379 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7380 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7381 start, end
7382 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007383
7384 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007385
Guido van Rossumd57fd912000-03-10 22:53:23 +00007386 if (result < 0) {
7387 PyErr_SetString(PyExc_ValueError, "substring not found");
7388 return NULL;
7389 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007390 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007391}
7392
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007393PyDoc_STRVAR(rjust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007394 "S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007395\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007396Return S right-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007397done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007398
7399static PyObject *
7400unicode_rjust(PyUnicodeObject *self, PyObject *args)
7401{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007402 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007403 Py_UNICODE fillchar = ' ';
7404
Martin v. Löwis412fb672006-04-13 06:34:32 +00007405 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007406 return NULL;
7407
Tim Peters7a29bd52001-09-12 03:03:31 +00007408 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007409 Py_INCREF(self);
7410 return (PyObject*) self;
7411 }
7412
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007413 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007414}
7415
Guido van Rossumd57fd912000-03-10 22:53:23 +00007416static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007417unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007418{
7419 /* standard clamping */
7420 if (start < 0)
7421 start = 0;
7422 if (end < 0)
7423 end = 0;
7424 if (end > self->length)
7425 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007426 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007427 /* full slice, return original string */
7428 Py_INCREF(self);
7429 return (PyObject*) self;
7430 }
7431 if (start > end)
7432 start = end;
7433 /* copy slice */
7434 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007435 end - start);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007436}
7437
7438PyObject *PyUnicode_Split(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007439 PyObject *sep,
7440 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007441{
7442 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007443
Guido van Rossumd57fd912000-03-10 22:53:23 +00007444 s = PyUnicode_FromObject(s);
7445 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007446 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007447 if (sep != NULL) {
7448 sep = PyUnicode_FromObject(sep);
7449 if (sep == NULL) {
7450 Py_DECREF(s);
7451 return NULL;
7452 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007453 }
7454
7455 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7456
7457 Py_DECREF(s);
7458 Py_XDECREF(sep);
7459 return result;
7460}
7461
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007462PyDoc_STRVAR(split__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007463 "S.split([sep [,maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007464\n\
7465Return a list of the words in S, using sep as the\n\
7466delimiter string. If maxsplit is given, at most maxsplit\n\
Georg Brandldfb77db2008-05-11 09:11:40 +00007467splits are done. If sep is not specified or is None, any\n\
Georg Brandlecbbd942008-05-11 20:53:55 +00007468whitespace string is a separator and empty strings are\n\
7469removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007470
7471static PyObject*
7472unicode_split(PyUnicodeObject *self, PyObject *args)
7473{
7474 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007475 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007476
Martin v. Löwis18e16552006-02-15 17:27:45 +00007477 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007478 return NULL;
7479
7480 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007481 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007482 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007483 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007484 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007485 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007486}
7487
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007488PyObject *
7489PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7490{
7491 PyObject* str_obj;
7492 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007493 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00007494
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007495 str_obj = PyUnicode_FromObject(str_in);
7496 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007497 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007498 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00007499 if (!sep_obj) {
7500 Py_DECREF(str_obj);
7501 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007502 }
7503
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007504 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00007505 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7506 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7507 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007508
Fredrik Lundhb9479482006-05-26 17:22:38 +00007509 Py_DECREF(sep_obj);
7510 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007511
7512 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007513}
7514
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007515
7516PyObject *
7517PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7518{
7519 PyObject* str_obj;
7520 PyObject* sep_obj;
7521 PyObject* out;
7522
7523 str_obj = PyUnicode_FromObject(str_in);
7524 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007525 return NULL;
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007526 sep_obj = PyUnicode_FromObject(sep_in);
7527 if (!sep_obj) {
7528 Py_DECREF(str_obj);
7529 return NULL;
7530 }
7531
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007532 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007533 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7534 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7535 );
7536
7537 Py_DECREF(sep_obj);
7538 Py_DECREF(str_obj);
7539
7540 return out;
7541}
7542
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007543PyDoc_STRVAR(partition__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007544 "S.partition(sep) -> (head, sep, tail)\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007545\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007546Search for the separator sep in S, and return the part before it,\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007547the separator itself, and the part after it. If the separator is not\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007548found, return S and two empty strings.");
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007549
7550static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00007551unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007552{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007553 return PyUnicode_Partition((PyObject *)self, separator);
7554}
7555
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007556PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti1fafaab2010-01-25 11:24:37 +00007557 "S.rpartition(sep) -> (head, sep, tail)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007558\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007559Search for the separator sep in S, starting at the end of S, and return\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007560the part before it, the separator itself, and the part after it. If the\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007561separator is not found, return two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007562
7563static PyObject*
7564unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7565{
7566 return PyUnicode_RPartition((PyObject *)self, separator);
7567}
7568
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007569PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007570 PyObject *sep,
7571 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007572{
7573 PyObject *result;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007574
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007575 s = PyUnicode_FromObject(s);
7576 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007577 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007578 if (sep != NULL) {
7579 sep = PyUnicode_FromObject(sep);
7580 if (sep == NULL) {
7581 Py_DECREF(s);
7582 return NULL;
7583 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007584 }
7585
7586 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7587
7588 Py_DECREF(s);
7589 Py_XDECREF(sep);
7590 return result;
7591}
7592
7593PyDoc_STRVAR(rsplit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007594 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007595\n\
7596Return a list of the words in S, using sep as the\n\
7597delimiter string, starting at the end of the string and\n\
7598working to the front. If maxsplit is given, at most maxsplit\n\
7599splits are done. If sep is not specified, any whitespace string\n\
7600is a separator.");
7601
7602static PyObject*
7603unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7604{
7605 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007606 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007607
Martin v. Löwis18e16552006-02-15 17:27:45 +00007608 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007609 return NULL;
7610
7611 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007612 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007613 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007614 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007615 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007616 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007617}
7618
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007619PyDoc_STRVAR(splitlines__doc__,
Raymond Hettingeraad5b022012-06-02 01:42:58 -04007620 "S.splitlines(keepends=False) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007621\n\
7622Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007623Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007624is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007625
7626static PyObject*
7627unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7628{
Guido van Rossum86662912000-04-11 15:38:46 +00007629 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007630
Guido van Rossum86662912000-04-11 15:38:46 +00007631 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007632 return NULL;
7633
Guido van Rossum86662912000-04-11 15:38:46 +00007634 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007635}
7636
7637static
7638PyObject *unicode_str(PyUnicodeObject *self)
7639{
Fred Drakee4315f52000-05-09 19:53:39 +00007640 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007641}
7642
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007643PyDoc_STRVAR(swapcase__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007644 "S.swapcase() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007645\n\
7646Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007647and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007648
7649static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007650unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007651{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007652 return fixup(self, fixswapcase);
7653}
7654
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007655PyDoc_STRVAR(translate__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007656 "S.translate(table) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007657\n\
7658Return a copy of the string S, where all characters have been mapped\n\
7659through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007660Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7661Unmapped characters are left untouched. Characters mapped to None\n\
7662are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007663
7664static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007665unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007666{
Tim Petersced69f82003-09-16 20:30:58 +00007667 return PyUnicode_TranslateCharmap(self->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007668 self->length,
7669 table,
7670 "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007671}
7672
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007673PyDoc_STRVAR(upper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007674 "S.upper() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007675\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007676Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007677
7678static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007679unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007680{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007681 return fixup(self, fixupper);
7682}
7683
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007684PyDoc_STRVAR(zfill__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007685 "S.zfill(width) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007686\n\
Georg Brandl98064072008-09-09 19:26:00 +00007687Pad a numeric string S with zeros on the left, to fill a field\n\
7688of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007689
7690static PyObject *
7691unicode_zfill(PyUnicodeObject *self, PyObject *args)
7692{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007693 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007694 PyUnicodeObject *u;
7695
Martin v. Löwis18e16552006-02-15 17:27:45 +00007696 Py_ssize_t width;
7697 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007698 return NULL;
7699
7700 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007701 if (PyUnicode_CheckExact(self)) {
7702 Py_INCREF(self);
7703 return (PyObject*) self;
7704 }
7705 else
7706 return PyUnicode_FromUnicode(
7707 PyUnicode_AS_UNICODE(self),
7708 PyUnicode_GET_SIZE(self)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007709 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007710 }
7711
7712 fill = width - self->length;
7713
7714 u = pad(self, fill, 0, '0');
7715
Walter Dörwald068325e2002-04-15 13:36:47 +00007716 if (u == NULL)
7717 return NULL;
7718
Guido van Rossumd57fd912000-03-10 22:53:23 +00007719 if (u->str[fill] == '+' || u->str[fill] == '-') {
7720 /* move sign to beginning of string */
7721 u->str[0] = u->str[fill];
7722 u->str[fill] = '0';
7723 }
7724
7725 return (PyObject*) u;
7726}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007727
#if 0
/* Compiled out.  Would return the value of numfree as a Python int. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    PyObject *count = PyInt_FromLong(numfree);

    return count;
}
#endif
7735
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007736PyDoc_STRVAR(startswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007737 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007738\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007739Return True if S starts with the specified prefix, False otherwise.\n\
7740With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007741With optional end, stop comparing S at that position.\n\
7742prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007743
7744static PyObject *
7745unicode_startswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007746 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007747{
Georg Brandl24250812006-06-09 18:45:48 +00007748 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007749 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007750 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007751 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007752 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007753
Jesus Cea44e81682011-04-20 16:39:15 +02007754 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007755 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007756 if (PyTuple_Check(subobj)) {
7757 Py_ssize_t i;
7758 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7759 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007760 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007761 if (substring == NULL)
7762 return NULL;
7763 result = tailmatch(self, substring, start, end, -1);
7764 Py_DECREF(substring);
7765 if (result) {
7766 Py_RETURN_TRUE;
7767 }
7768 }
7769 /* nothing matched */
7770 Py_RETURN_FALSE;
7771 }
7772 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottie3685f62011-04-26 05:12:51 +03007773 if (substring == NULL) {
7774 if (PyErr_ExceptionMatches(PyExc_TypeError))
7775 PyErr_Format(PyExc_TypeError, "startswith first arg must be str, "
7776 "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007777 return NULL;
Ezio Melottie3685f62011-04-26 05:12:51 +03007778 }
Georg Brandl24250812006-06-09 18:45:48 +00007779 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007780 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007781 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007782}
7783
7784
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007785PyDoc_STRVAR(endswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007786 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007787\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007788Return True if S ends with the specified suffix, False otherwise.\n\
7789With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007790With optional end, stop comparing S at that position.\n\
7791suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007792
7793static PyObject *
7794unicode_endswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007795 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007796{
Georg Brandl24250812006-06-09 18:45:48 +00007797 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007798 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007799 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007800 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007801 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007802
Jesus Cea44e81682011-04-20 16:39:15 +02007803 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007804 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007805 if (PyTuple_Check(subobj)) {
7806 Py_ssize_t i;
7807 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7808 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007809 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007810 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007811 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007812 result = tailmatch(self, substring, start, end, +1);
7813 Py_DECREF(substring);
7814 if (result) {
7815 Py_RETURN_TRUE;
7816 }
7817 }
7818 Py_RETURN_FALSE;
7819 }
7820 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottie3685f62011-04-26 05:12:51 +03007821 if (substring == NULL) {
7822 if (PyErr_ExceptionMatches(PyExc_TypeError))
7823 PyErr_Format(PyExc_TypeError, "endswith first arg must be str, "
7824 "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007825 return NULL;
Ezio Melottie3685f62011-04-26 05:12:51 +03007826 }
Georg Brandl24250812006-06-09 18:45:48 +00007827 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007828 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007829 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007830}
7831
7832
Eric Smitha9f7d622008-02-17 19:46:49 +00007833/* Implements do_string_format, which is unicode because of stringlib */
7834#include "stringlib/string_format.h"
7835
7836PyDoc_STRVAR(format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007837 "S.format(*args, **kwargs) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007838\n\
Eric Smith6c840852010-11-06 19:43:44 +00007839Return a formatted version of S, using substitutions from args and kwargs.\n\
7840The substitutions are identified by braces ('{' and '}').");
Eric Smitha9f7d622008-02-17 19:46:49 +00007841
Eric Smithdc13b792008-05-30 18:10:04 +00007842static PyObject *
7843unicode__format__(PyObject *self, PyObject *args)
7844{
7845 PyObject *format_spec;
7846 PyObject *result = NULL;
7847 PyObject *tmp = NULL;
7848
7849 /* If 2.x, convert format_spec to the same type as value */
7850 /* This is to allow things like u''.format('') */
7851 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7852 goto done;
7853 if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7854 PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007855 "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
Eric Smithdc13b792008-05-30 18:10:04 +00007856 goto done;
7857 }
7858 tmp = PyObject_Unicode(format_spec);
7859 if (tmp == NULL)
7860 goto done;
7861 format_spec = tmp;
7862
7863 result = _PyUnicode_FormatAdvanced(self,
7864 PyUnicode_AS_UNICODE(format_spec),
7865 PyUnicode_GET_SIZE(format_spec));
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007866 done:
Eric Smithdc13b792008-05-30 18:10:04 +00007867 Py_XDECREF(tmp);
7868 return result;
7869}
7870
Eric Smitha9f7d622008-02-17 19:46:49 +00007871PyDoc_STRVAR(p_format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007872 "S.__format__(format_spec) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007873\n\
Eric Smith6c840852010-11-06 19:43:44 +00007874Return a formatted version of S as described by format_spec.");
Eric Smitha9f7d622008-02-17 19:46:49 +00007875
Robert Schuppenies901c9972008-06-10 10:10:31 +00007876static PyObject *
7877unicode__sizeof__(PyUnicodeObject *v)
7878{
Robert Schuppenies9be2ec12008-07-10 15:24:04 +00007879 return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7880 sizeof(Py_UNICODE) * (v->length + 1));
Robert Schuppenies901c9972008-06-10 10:10:31 +00007881}
7882
7883PyDoc_STRVAR(sizeof__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007884 "S.__sizeof__() -> size of S in memory, in bytes\n\
Robert Schuppenies901c9972008-06-10 10:10:31 +00007885\n\
7886");
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007887
7888static PyObject *
7889unicode_getnewargs(PyUnicodeObject *v)
7890{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007891 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007892}
7893
7894
Guido van Rossumd57fd912000-03-10 22:53:23 +00007895static PyMethodDef unicode_methods[] = {
Benjamin Peterson332d7212009-09-18 21:14:55 +00007896 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007897 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7898 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007899 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007900 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7901 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7902 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7903 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7904 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7905 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7906 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00007907 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007908 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7909 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7910 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007911 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Benjamin Peterson332d7212009-09-18 21:14:55 +00007912 {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007913/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7914 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7915 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7916 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007917 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007918 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007919 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007920 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007921 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7922 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7923 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7924 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7925 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7926 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7927 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7928 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7929 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7930 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7931 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7932 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7933 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7934 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007935 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smitha9f7d622008-02-17 19:46:49 +00007936 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
7937 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
7938 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
7939 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Robert Schuppenies901c9972008-06-10 10:10:31 +00007940 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007941#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007942 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007943#endif
7944
7945#if 0
7946 /* This one is just used for debugging the implementation. */
Christian Heimes5b970ad2008-02-06 13:33:44 +00007947 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007948#endif
7949
Benjamin Peterson857ce152009-01-31 16:29:18 +00007950 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007951 {NULL, NULL}
7952};
7953
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007954static PyObject *
7955unicode_mod(PyObject *v, PyObject *w)
7956{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007957 if (!PyUnicode_Check(v)) {
7958 Py_INCREF(Py_NotImplemented);
7959 return Py_NotImplemented;
7960 }
7961 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007962}
7963
7964static PyNumberMethods unicode_as_number = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007965 0, /*nb_add*/
7966 0, /*nb_subtract*/
7967 0, /*nb_multiply*/
7968 0, /*nb_divide*/
7969 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007970};
7971
Guido van Rossumd57fd912000-03-10 22:53:23 +00007972static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007973 (lenfunc) unicode_length, /* sq_length */
7974 PyUnicode_Concat, /* sq_concat */
7975 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7976 (ssizeargfunc) unicode_getitem, /* sq_item */
7977 (ssizessizeargfunc) unicode_slice, /* sq_slice */
7978 0, /* sq_ass_item */
7979 0, /* sq_ass_slice */
7980 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007981};
7982
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007983static PyObject*
7984unicode_subscript(PyUnicodeObject* self, PyObject* item)
7985{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00007986 if (PyIndex_Check(item)) {
7987 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007988 if (i == -1 && PyErr_Occurred())
7989 return NULL;
7990 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007991 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007992 return unicode_getitem(self, i);
7993 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007994 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007995 Py_UNICODE* source_buf;
7996 Py_UNICODE* result_buf;
7997 PyObject* result;
7998
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007999 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008000 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008001 return NULL;
8002 }
8003
8004 if (slicelength <= 0) {
8005 return PyUnicode_FromUnicode(NULL, 0);
Thomas Wouters3ccec682007-08-28 15:28:19 +00008006 } else if (start == 0 && step == 1 && slicelength == self->length &&
8007 PyUnicode_CheckExact(self)) {
8008 Py_INCREF(self);
8009 return (PyObject *)self;
8010 } else if (step == 1) {
8011 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008012 } else {
8013 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Neal Norwitz419fd492008-03-17 20:22:43 +00008014 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8015 sizeof(Py_UNICODE));
Benjamin Peterson857ce152009-01-31 16:29:18 +00008016
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008017 if (result_buf == NULL)
8018 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008019
8020 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8021 result_buf[i] = source_buf[cur];
8022 }
Tim Petersced69f82003-09-16 20:30:58 +00008023
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008024 result = PyUnicode_FromUnicode(result_buf, slicelength);
Neal Norwitz419fd492008-03-17 20:22:43 +00008025 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008026 return result;
8027 }
8028 } else {
8029 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8030 return NULL;
8031 }
8032}
8033
8034static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00008035 (lenfunc)unicode_length, /* mp_length */
8036 (binaryfunc)unicode_subscript, /* mp_subscript */
8037 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008038};
8039
Martin v. Löwis18e16552006-02-15 17:27:45 +00008040static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008041unicode_buffer_getreadbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008042 Py_ssize_t index,
8043 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008044{
8045 if (index != 0) {
8046 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008047 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008048 return -1;
8049 }
8050 *ptr = (void *) self->str;
8051 return PyUnicode_GET_DATA_SIZE(self);
8052}
8053
Martin v. Löwis18e16552006-02-15 17:27:45 +00008054static Py_ssize_t
8055unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008056 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008057{
8058 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008059 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008060 return -1;
8061}
8062
8063static int
8064unicode_buffer_getsegcount(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008065 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008066{
8067 if (lenp)
8068 *lenp = PyUnicode_GET_DATA_SIZE(self);
8069 return 1;
8070}
8071
Martin v. Löwiseb079f12006-02-16 14:32:27 +00008072static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008073unicode_buffer_getcharbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008074 Py_ssize_t index,
8075 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008076{
8077 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00008078
Guido van Rossumd57fd912000-03-10 22:53:23 +00008079 if (index != 0) {
8080 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008081 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008082 return -1;
8083 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008084 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008085 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008086 return -1;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008087 *ptr = (void *) PyString_AS_STRING(str);
8088 return PyString_GET_SIZE(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008089}
8090
8091/* Helpers for PyUnicode_Format() */
8092
8093static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008094getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008095{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008096 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008097 if (argidx < arglen) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008098 (*p_argidx)++;
8099 if (arglen < 0)
8100 return args;
8101 else
8102 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008103 }
8104 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008105 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008106 return NULL;
8107}
8108
/* Bit flags carried in the `flags` argument of the format helpers.
   F_ALT selects the alternate form: formatfloat() maps it to
   Py_DTSF_ALT and formatint() uses it to add '#' or an explicit 0x/0X
   prefix.  The remaining bits presumably mirror the '-', '+', ' ' and
   '0' conversion flags parsed by PyUnicode_Format() -- confirm there. */
#define F_LJUST (1 << 0)
#define F_SIGN  (1 << 1)
#define F_BLANK (1 << 2)
#define F_ALT   (1 << 3)
#define F_ZERO  (1 << 4)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008114
Martin v. Löwis18e16552006-02-15 17:27:45 +00008115static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008116strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008117{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008118 register Py_ssize_t i;
8119 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008120 for (i = len - 1; i >= 0; i--)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008121 buffer[i] = (Py_UNICODE) charbuffer[i];
Guido van Rossumd57fd912000-03-10 22:53:23 +00008122
Guido van Rossumd57fd912000-03-10 22:53:23 +00008123 return len;
8124}
8125
Neal Norwitzfc76d632006-01-10 06:03:13 +00008126static int
Neal Norwitzfc76d632006-01-10 06:03:13 +00008127longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8128{
Tim Peters15231542006-02-16 01:08:01 +00008129 Py_ssize_t result;
8130
Neal Norwitzfc76d632006-01-10 06:03:13 +00008131 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008132 result = strtounicode(buffer, (char *)buffer);
8133 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008134}
8135
Guido van Rossum078151d2002-08-11 04:24:12 +00008136/* XXX To save some code duplication, formatfloat/long/int could have been
8137 shared with stringobject.c, converting from 8-bit to Unicode after the
8138 formatting is done. */
8139
Mark Dickinson18cfada2009-11-23 18:46:41 +00008140/* Returns a new reference to a PyUnicode object, or NULL on failure. */
8141
8142static PyObject *
8143formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008144{
Mark Dickinson18cfada2009-11-23 18:46:41 +00008145 char *p;
8146 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008147 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008148
Guido van Rossumd57fd912000-03-10 22:53:23 +00008149 x = PyFloat_AsDouble(v);
8150 if (x == -1.0 && PyErr_Occurred())
Mark Dickinson18cfada2009-11-23 18:46:41 +00008151 return NULL;
8152
Guido van Rossumd57fd912000-03-10 22:53:23 +00008153 if (prec < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008154 prec = 6;
Mark Dickinsond4814bf2009-03-29 16:24:29 +00008155
Mark Dickinson18cfada2009-11-23 18:46:41 +00008156 p = PyOS_double_to_string(x, type, prec,
8157 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
8158 if (p == NULL)
8159 return NULL;
8160 result = PyUnicode_FromStringAndSize(p, strlen(p));
8161 PyMem_Free(p);
8162 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008163}
8164
Tim Peters38fd5b62000-09-21 05:43:11 +00008165static PyObject*
8166formatlong(PyObject *val, int flags, int prec, int type)
8167{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008168 char *buf;
8169 int i, len;
8170 PyObject *str; /* temporary string object. */
8171 PyUnicodeObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008172
Benjamin Peterson857ce152009-01-31 16:29:18 +00008173 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8174 if (!str)
8175 return NULL;
8176 result = _PyUnicode_New(len);
8177 if (!result) {
8178 Py_DECREF(str);
8179 return NULL;
8180 }
8181 for (i = 0; i < len; i++)
8182 result->str[i] = buf[i];
8183 result->str[len] = 0;
8184 Py_DECREF(str);
8185 return (PyObject*)result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008186}
8187
Guido van Rossumd57fd912000-03-10 22:53:23 +00008188static int
8189formatint(Py_UNICODE *buf,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008190 size_t buflen,
8191 int flags,
8192 int prec,
8193 int type,
8194 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008195{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008196 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008197 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8198 * + 1 + 1
8199 * = 24
8200 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008201 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008202 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008203 long x;
8204
8205 x = PyInt_AsLong(v);
8206 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008207 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008208 if (x < 0 && type == 'u') {
8209 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008210 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008211 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8212 sign = "-";
8213 else
8214 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008215 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008216 prec = 1;
8217
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008218 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8219 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008220 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008221 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008222 PyErr_SetString(PyExc_OverflowError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008223 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008224 return -1;
8225 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008226
8227 if ((flags & F_ALT) &&
8228 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00008229 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008230 * of issues that cause pain:
8231 * - when 0 is being converted, the C standard leaves off
8232 * the '0x' or '0X', which is inconsistent with other
8233 * %#x/%#X conversions and inconsistent with Python's
8234 * hex() function
8235 * - there are platforms that violate the standard and
8236 * convert 0 with the '0x' or '0X'
8237 * (Metrowerks, Compaq Tru64)
8238 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008239 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008240 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008241 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008242 * We can achieve the desired consistency by inserting our
8243 * own '0x' or '0X' prefix, and substituting %x/%X in place
8244 * of %#x/%#X.
8245 *
8246 * Note that this is the same approach as used in
8247 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008248 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008249 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8250 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008251 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008252 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008253 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8254 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008255 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008256 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008257 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008258 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008259 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008260 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008261}
8262
8263static int
8264formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008265 size_t buflen,
8266 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008267{
Ezio Melotti32125152010-02-25 17:36:04 +00008268 PyObject *unistr;
8269 char *str;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008270 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008271 if (PyUnicode_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008272 if (PyUnicode_GET_SIZE(v) != 1)
8273 goto onError;
8274 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008275 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008276
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008277 else if (PyString_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008278 if (PyString_GET_SIZE(v) != 1)
8279 goto onError;
Ezio Melotti32125152010-02-25 17:36:04 +00008280 /* #7649: "u'%c' % char" should behave like "u'%s' % char" and fail
8281 with a UnicodeDecodeError if 'char' is not decodable with the
8282 default encoding (usually ASCII, but it might be something else) */
8283 str = PyString_AS_STRING(v);
8284 if ((unsigned char)str[0] > 0x7F) {
8285 /* the char is not ASCII; try to decode the string using the
8286 default encoding and return -1 to let the UnicodeDecodeError
8287 be raised if the string can't be decoded */
8288 unistr = PyUnicode_Decode(str, 1, NULL, "strict");
8289 if (unistr == NULL)
8290 return -1;
8291 buf[0] = PyUnicode_AS_UNICODE(unistr)[0];
8292 Py_DECREF(unistr);
8293 }
8294 else
8295 buf[0] = (Py_UNICODE)str[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008296 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008297
8298 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008299 /* Integer input truncated to a character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008300 long x;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008301 x = PyInt_AsLong(v);
8302 if (x == -1 && PyErr_Occurred())
8303 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008304#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008305 if (x < 0 || x > 0x10ffff) {
8306 PyErr_SetString(PyExc_OverflowError,
8307 "%c arg not in range(0x110000) "
8308 "(wide Python build)");
8309 return -1;
8310 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008311#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008312 if (x < 0 || x > 0xffff) {
8313 PyErr_SetString(PyExc_OverflowError,
8314 "%c arg not in range(0x10000) "
8315 "(narrow Python build)");
8316 return -1;
8317 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008318#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008319 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008320 }
8321 buf[1] = '\0';
8322 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008323
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008324 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008325 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008326 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008327 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008328}
8329
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008330/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8331
Mark Dickinson18cfada2009-11-23 18:46:41 +00008332 FORMATBUFLEN is the length of the buffer in which the ints &
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008333 chars are formatted. XXX This is a magic number. Each formatting
8334 routine does bounds checking to ensure no overflow, but a better
8335 solution may be to malloc a buffer of appropriate size for each
8336 format. For now, the current solution is sufficient.
8337*/
/* Parenthesized so the cast expression stays a single operand wherever
   the macro is expanded (e.g. after sizeof). */
#define FORMATBUFLEN ((size_t)120)
8339
Guido van Rossumd57fd912000-03-10 22:53:23 +00008340PyObject *PyUnicode_Format(PyObject *format,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008341 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008342{
8343 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008344 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008345 int args_owned = 0;
8346 PyUnicodeObject *result = NULL;
8347 PyObject *dict = NULL;
8348 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008349
Guido van Rossumd57fd912000-03-10 22:53:23 +00008350 if (format == NULL || args == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008351 PyErr_BadInternalCall();
8352 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008353 }
8354 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008355 if (uformat == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008356 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008357 fmt = PyUnicode_AS_UNICODE(uformat);
8358 fmtcnt = PyUnicode_GET_SIZE(uformat);
8359
8360 reslen = rescnt = fmtcnt + 100;
8361 result = _PyUnicode_New(reslen);
8362 if (result == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008363 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008364 res = PyUnicode_AS_UNICODE(result);
8365
8366 if (PyTuple_Check(args)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008367 arglen = PyTuple_Size(args);
8368 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008369 }
8370 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008371 arglen = -1;
8372 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008373 }
Benjamin Petersonda2c7eb2013-03-23 22:32:00 -05008374 if (Py_TYPE(args)->tp_as_mapping && Py_TYPE(args)->tp_as_mapping->mp_subscript &&
8375 !PyTuple_Check(args) && !PyObject_TypeCheck(args, &PyBaseString_Type))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008376 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008377
8378 while (--fmtcnt >= 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008379 if (*fmt != '%') {
8380 if (--rescnt < 0) {
8381 rescnt = fmtcnt + 100;
8382 reslen += rescnt;
8383 if (_PyUnicode_Resize(&result, reslen) < 0)
8384 goto onError;
8385 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8386 --rescnt;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008387 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008388 *res++ = *fmt++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008389 }
8390 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008391 /* Got a format specifier */
8392 int flags = 0;
8393 Py_ssize_t width = -1;
8394 int prec = -1;
8395 Py_UNICODE c = '\0';
8396 Py_UNICODE fill;
8397 int isnumok;
8398 PyObject *v = NULL;
8399 PyObject *temp = NULL;
8400 Py_UNICODE *pbuf;
8401 Py_UNICODE sign;
8402 Py_ssize_t len;
Mark Dickinson18cfada2009-11-23 18:46:41 +00008403 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008404
8405 fmt++;
8406 if (*fmt == '(') {
8407 Py_UNICODE *keystart;
8408 Py_ssize_t keylen;
8409 PyObject *key;
8410 int pcount = 1;
8411
8412 if (dict == NULL) {
8413 PyErr_SetString(PyExc_TypeError,
8414 "format requires a mapping");
8415 goto onError;
8416 }
8417 ++fmt;
8418 --fmtcnt;
8419 keystart = fmt;
8420 /* Skip over balanced parentheses */
8421 while (pcount > 0 && --fmtcnt >= 0) {
8422 if (*fmt == ')')
8423 --pcount;
8424 else if (*fmt == '(')
8425 ++pcount;
8426 fmt++;
8427 }
8428 keylen = fmt - keystart - 1;
8429 if (fmtcnt < 0 || pcount > 0) {
8430 PyErr_SetString(PyExc_ValueError,
8431 "incomplete format key");
8432 goto onError;
8433 }
8434#if 0
8435 /* keys are converted to strings using UTF-8 and
8436 then looked up since Python uses strings to hold
8437 variables names etc. in its namespaces and we
8438 wouldn't want to break common idioms. */
8439 key = PyUnicode_EncodeUTF8(keystart,
8440 keylen,
8441 NULL);
8442#else
8443 key = PyUnicode_FromUnicode(keystart, keylen);
8444#endif
8445 if (key == NULL)
8446 goto onError;
8447 if (args_owned) {
8448 Py_DECREF(args);
8449 args_owned = 0;
8450 }
8451 args = PyObject_GetItem(dict, key);
8452 Py_DECREF(key);
8453 if (args == NULL) {
8454 goto onError;
8455 }
8456 args_owned = 1;
8457 arglen = -1;
8458 argidx = -2;
8459 }
8460 while (--fmtcnt >= 0) {
8461 switch (c = *fmt++) {
8462 case '-': flags |= F_LJUST; continue;
8463 case '+': flags |= F_SIGN; continue;
8464 case ' ': flags |= F_BLANK; continue;
8465 case '#': flags |= F_ALT; continue;
8466 case '0': flags |= F_ZERO; continue;
8467 }
8468 break;
8469 }
8470 if (c == '*') {
8471 v = getnextarg(args, arglen, &argidx);
8472 if (v == NULL)
8473 goto onError;
8474 if (!PyInt_Check(v)) {
8475 PyErr_SetString(PyExc_TypeError,
8476 "* wants int");
8477 goto onError;
8478 }
Serhiy Storchaka74f49ab2013-01-19 12:55:39 +02008479 width = PyInt_AsSsize_t(v);
8480 if (width == -1 && PyErr_Occurred())
8481 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008482 if (width < 0) {
8483 flags |= F_LJUST;
8484 width = -width;
8485 }
8486 if (--fmtcnt >= 0)
8487 c = *fmt++;
8488 }
8489 else if (c >= '0' && c <= '9') {
8490 width = c - '0';
8491 while (--fmtcnt >= 0) {
8492 c = *fmt++;
8493 if (c < '0' || c > '9')
8494 break;
Mark Dickinson75d36002012-10-28 10:00:46 +00008495 if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008496 PyErr_SetString(PyExc_ValueError,
8497 "width too big");
8498 goto onError;
8499 }
8500 width = width*10 + (c - '0');
8501 }
8502 }
8503 if (c == '.') {
8504 prec = 0;
8505 if (--fmtcnt >= 0)
8506 c = *fmt++;
8507 if (c == '*') {
8508 v = getnextarg(args, arglen, &argidx);
8509 if (v == NULL)
8510 goto onError;
8511 if (!PyInt_Check(v)) {
8512 PyErr_SetString(PyExc_TypeError,
8513 "* wants int");
8514 goto onError;
8515 }
Serhiy Storchaka74f49ab2013-01-19 12:55:39 +02008516 prec = _PyInt_AsInt(v);
8517 if (prec == -1 && PyErr_Occurred())
8518 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008519 if (prec < 0)
8520 prec = 0;
8521 if (--fmtcnt >= 0)
8522 c = *fmt++;
8523 }
8524 else if (c >= '0' && c <= '9') {
8525 prec = c - '0';
8526 while (--fmtcnt >= 0) {
Stefan Krah0b9201f2010-07-19 18:06:46 +00008527 c = *fmt++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008528 if (c < '0' || c > '9')
8529 break;
Mark Dickinson75d36002012-10-28 10:00:46 +00008530 if (prec > (INT_MAX - ((int)c - '0')) / 10) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008531 PyErr_SetString(PyExc_ValueError,
8532 "prec too big");
8533 goto onError;
8534 }
8535 prec = prec*10 + (c - '0');
8536 }
8537 }
8538 } /* prec */
8539 if (fmtcnt >= 0) {
8540 if (c == 'h' || c == 'l' || c == 'L') {
8541 if (--fmtcnt >= 0)
8542 c = *fmt++;
8543 }
8544 }
8545 if (fmtcnt < 0) {
8546 PyErr_SetString(PyExc_ValueError,
8547 "incomplete format");
8548 goto onError;
8549 }
8550 if (c != '%') {
8551 v = getnextarg(args, arglen, &argidx);
8552 if (v == NULL)
8553 goto onError;
8554 }
8555 sign = 0;
8556 fill = ' ';
8557 switch (c) {
8558
8559 case '%':
8560 pbuf = formatbuf;
8561 /* presume that buffer length is at least 1 */
8562 pbuf[0] = '%';
8563 len = 1;
8564 break;
8565
8566 case 's':
8567 case 'r':
Victor Stinner95affc42010-03-22 12:24:37 +00008568 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008569 temp = v;
8570 Py_INCREF(temp);
8571 }
8572 else {
8573 PyObject *unicode;
8574 if (c == 's')
8575 temp = PyObject_Unicode(v);
8576 else
8577 temp = PyObject_Repr(v);
8578 if (temp == NULL)
8579 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008580 if (PyUnicode_Check(temp))
8581 /* nothing to do */;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008582 else if (PyString_Check(temp)) {
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008583 /* convert to string to Unicode */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008584 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8585 PyString_GET_SIZE(temp),
8586 NULL,
8587 "strict");
8588 Py_DECREF(temp);
8589 temp = unicode;
8590 if (temp == NULL)
8591 goto onError;
8592 }
8593 else {
8594 Py_DECREF(temp);
8595 PyErr_SetString(PyExc_TypeError,
8596 "%s argument has non-string str()");
8597 goto onError;
8598 }
8599 }
8600 pbuf = PyUnicode_AS_UNICODE(temp);
8601 len = PyUnicode_GET_SIZE(temp);
8602 if (prec >= 0 && len > prec)
8603 len = prec;
8604 break;
8605
8606 case 'i':
8607 case 'd':
8608 case 'u':
8609 case 'o':
8610 case 'x':
8611 case 'X':
8612 if (c == 'i')
8613 c = 'd';
8614 isnumok = 0;
8615 if (PyNumber_Check(v)) {
8616 PyObject *iobj=NULL;
8617
8618 if (PyInt_Check(v) || (PyLong_Check(v))) {
8619 iobj = v;
8620 Py_INCREF(iobj);
8621 }
8622 else {
8623 iobj = PyNumber_Int(v);
8624 if (iobj==NULL) iobj = PyNumber_Long(v);
8625 }
8626 if (iobj!=NULL) {
8627 if (PyInt_Check(iobj)) {
8628 isnumok = 1;
8629 pbuf = formatbuf;
8630 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8631 flags, prec, c, iobj);
8632 Py_DECREF(iobj);
8633 if (len < 0)
8634 goto onError;
8635 sign = 1;
8636 }
8637 else if (PyLong_Check(iobj)) {
8638 isnumok = 1;
8639 temp = formatlong(iobj, flags, prec, c);
8640 Py_DECREF(iobj);
8641 if (!temp)
8642 goto onError;
8643 pbuf = PyUnicode_AS_UNICODE(temp);
8644 len = PyUnicode_GET_SIZE(temp);
8645 sign = 1;
8646 }
8647 else {
8648 Py_DECREF(iobj);
8649 }
8650 }
8651 }
8652 if (!isnumok) {
8653 PyErr_Format(PyExc_TypeError,
8654 "%%%c format: a number is required, "
8655 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8656 goto onError;
8657 }
8658 if (flags & F_ZERO)
8659 fill = '0';
8660 break;
8661
8662 case 'e':
8663 case 'E':
8664 case 'f':
8665 case 'F':
8666 case 'g':
8667 case 'G':
Mark Dickinson18cfada2009-11-23 18:46:41 +00008668 temp = formatfloat(v, flags, prec, c);
8669 if (temp == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008670 goto onError;
Mark Dickinson18cfada2009-11-23 18:46:41 +00008671 pbuf = PyUnicode_AS_UNICODE(temp);
8672 len = PyUnicode_GET_SIZE(temp);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008673 sign = 1;
8674 if (flags & F_ZERO)
8675 fill = '0';
8676 break;
8677
8678 case 'c':
8679 pbuf = formatbuf;
8680 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8681 if (len < 0)
8682 goto onError;
8683 break;
8684
8685 default:
8686 PyErr_Format(PyExc_ValueError,
8687 "unsupported format character '%c' (0x%x) "
8688 "at index %zd",
8689 (31<=c && c<=126) ? (char)c : '?',
8690 (int)c,
8691 (Py_ssize_t)(fmt - 1 -
8692 PyUnicode_AS_UNICODE(uformat)));
Benjamin Peterson857ce152009-01-31 16:29:18 +00008693 goto onError;
8694 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008695 if (sign) {
8696 if (*pbuf == '-' || *pbuf == '+') {
8697 sign = *pbuf++;
8698 len--;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008699 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008700 else if (flags & F_SIGN)
8701 sign = '+';
8702 else if (flags & F_BLANK)
8703 sign = ' ';
8704 else
8705 sign = 0;
8706 }
8707 if (width < len)
8708 width = len;
8709 if (rescnt - (sign != 0) < width) {
8710 reslen -= rescnt;
8711 rescnt = width + fmtcnt + 100;
8712 reslen += rescnt;
8713 if (reslen < 0) {
8714 Py_XDECREF(temp);
8715 PyErr_NoMemory();
8716 goto onError;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008717 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008718 if (_PyUnicode_Resize(&result, reslen) < 0) {
8719 Py_XDECREF(temp);
8720 goto onError;
8721 }
8722 res = PyUnicode_AS_UNICODE(result)
8723 + reslen - rescnt;
8724 }
8725 if (sign) {
8726 if (fill != ' ')
8727 *res++ = sign;
8728 rescnt--;
8729 if (width > len)
8730 width--;
8731 }
8732 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8733 assert(pbuf[0] == '0');
8734 assert(pbuf[1] == c);
8735 if (fill != ' ') {
8736 *res++ = *pbuf++;
8737 *res++ = *pbuf++;
8738 }
8739 rescnt -= 2;
8740 width -= 2;
8741 if (width < 0)
8742 width = 0;
8743 len -= 2;
8744 }
8745 if (width > len && !(flags & F_LJUST)) {
8746 do {
8747 --rescnt;
8748 *res++ = fill;
8749 } while (--width > len);
8750 }
8751 if (fill == ' ') {
8752 if (sign)
8753 *res++ = sign;
8754 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8755 assert(pbuf[0] == '0');
8756 assert(pbuf[1] == c);
8757 *res++ = *pbuf++;
8758 *res++ = *pbuf++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008759 }
8760 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008761 Py_UNICODE_COPY(res, pbuf, len);
8762 res += len;
8763 rescnt -= len;
8764 while (--width >= len) {
8765 --rescnt;
8766 *res++ = ' ';
8767 }
8768 if (dict && (argidx < arglen) && c != '%') {
8769 PyErr_SetString(PyExc_TypeError,
8770 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008771 Py_XDECREF(temp);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008772 goto onError;
8773 }
8774 Py_XDECREF(temp);
8775 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008776 } /* until end */
8777 if (argidx < arglen && !dict) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008778 PyErr_SetString(PyExc_TypeError,
8779 "not all arguments converted during string formatting");
8780 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008781 }
8782
Thomas Woutersa96affe2006-03-12 00:29:36 +00008783 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008784 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008785 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008786 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008787 }
8788 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008789 return (PyObject *)result;
8790
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008791 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008792 Py_XDECREF(result);
8793 Py_DECREF(uformat);
8794 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008795 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008796 }
8797 return NULL;
8798}
8799
8800static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008801 (readbufferproc) unicode_buffer_getreadbuf,
8802 (writebufferproc) unicode_buffer_getwritebuf,
8803 (segcountproc) unicode_buffer_getsegcount,
8804 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008805};
8806
Jeremy Hylton938ace62002-07-17 16:30:39 +00008807static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008808unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8809
Tim Peters6d6c1a32001-08-02 04:15:00 +00008810static PyObject *
8811unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8812{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008813 PyObject *x = NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008814 static char *kwlist[] = {"string", "encoding", "errors", 0};
8815 char *encoding = NULL;
8816 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00008817
Benjamin Peterson857ce152009-01-31 16:29:18 +00008818 if (type != &PyUnicode_Type)
8819 return unicode_subtype_new(type, args, kwds);
8820 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008821 kwlist, &x, &encoding, &errors))
Benjamin Peterson857ce152009-01-31 16:29:18 +00008822 return NULL;
8823 if (x == NULL)
8824 return (PyObject *)_PyUnicode_New(0);
8825 if (encoding == NULL && errors == NULL)
8826 return PyObject_Unicode(x);
8827 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008828 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008829}
8830
Guido van Rossume023fe02001-08-30 03:12:59 +00008831static PyObject *
8832unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8833{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008834 PyUnicodeObject *tmp, *pnew;
8835 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008836
Benjamin Peterson857ce152009-01-31 16:29:18 +00008837 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8838 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8839 if (tmp == NULL)
8840 return NULL;
8841 assert(PyUnicode_Check(tmp));
8842 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8843 if (pnew == NULL) {
8844 Py_DECREF(tmp);
8845 return NULL;
8846 }
8847 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8848 if (pnew->str == NULL) {
8849 _Py_ForgetReference((PyObject *)pnew);
8850 PyObject_Del(pnew);
8851 Py_DECREF(tmp);
8852 return PyErr_NoMemory();
8853 }
8854 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8855 pnew->length = n;
8856 pnew->hash = tmp->hash;
8857 Py_DECREF(tmp);
8858 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008859}
8860
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008861PyDoc_STRVAR(unicode_doc,
Chris Jerdonekad4b0002012-10-07 20:37:54 -07008862 "unicode(object='') -> unicode object\n\
8863unicode(string[, encoding[, errors]]) -> unicode object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00008864\n\
8865Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008866encoding defaults to the current default string encoding.\n\
8867errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008868
Guido van Rossumd57fd912000-03-10 22:53:23 +00008869PyTypeObject PyUnicode_Type = {
Martin v. Löwis68192102007-07-21 06:55:02 +00008870 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00008871 "unicode", /* tp_name */
8872 sizeof(PyUnicodeObject), /* tp_size */
8873 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008874 /* Slots */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008875 (destructor)unicode_dealloc, /* tp_dealloc */
8876 0, /* tp_print */
8877 0, /* tp_getattr */
8878 0, /* tp_setattr */
8879 0, /* tp_compare */
8880 unicode_repr, /* tp_repr */
8881 &unicode_as_number, /* tp_as_number */
8882 &unicode_as_sequence, /* tp_as_sequence */
8883 &unicode_as_mapping, /* tp_as_mapping */
8884 (hashfunc) unicode_hash, /* tp_hash*/
8885 0, /* tp_call*/
8886 (reprfunc) unicode_str, /* tp_str */
8887 PyObject_GenericGetAttr, /* tp_getattro */
8888 0, /* tp_setattro */
8889 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008890 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008891 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008892 unicode_doc, /* tp_doc */
8893 0, /* tp_traverse */
8894 0, /* tp_clear */
8895 PyUnicode_RichCompare, /* tp_richcompare */
8896 0, /* tp_weaklistoffset */
8897 0, /* tp_iter */
8898 0, /* tp_iternext */
8899 unicode_methods, /* tp_methods */
8900 0, /* tp_members */
8901 0, /* tp_getset */
8902 &PyBaseString_Type, /* tp_base */
8903 0, /* tp_dict */
8904 0, /* tp_descr_get */
8905 0, /* tp_descr_set */
8906 0, /* tp_dictoffset */
8907 0, /* tp_init */
8908 0, /* tp_alloc */
8909 unicode_new, /* tp_new */
8910 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008911};
8912
8913/* Initialize the Unicode implementation */
8914
Thomas Wouters78890102000-07-22 19:25:51 +00008915void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008916{
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008917 /* XXX - move this array to unicodectype.c ? */
8918 Py_UNICODE linebreak[] = {
8919 0x000A, /* LINE FEED */
8920 0x000D, /* CARRIAGE RETURN */
8921 0x001C, /* FILE SEPARATOR */
8922 0x001D, /* GROUP SEPARATOR */
8923 0x001E, /* RECORD SEPARATOR */
8924 0x0085, /* NEXT LINE */
8925 0x2028, /* LINE SEPARATOR */
8926 0x2029, /* PARAGRAPH SEPARATOR */
8927 };
8928
Fred Drakee4315f52000-05-09 19:53:39 +00008929 /* Init the implementation */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02008930 if (!unicode_empty) {
8931 unicode_empty = _PyUnicode_New(0);
8932 if (!unicode_empty)
8933 return;
8934 }
Neal Norwitze1fdb322006-07-21 05:32:28 +00008935
Guido van Rossumcacfc072002-05-24 19:01:59 +00008936 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008937 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008938
8939 /* initialize the linebreak bloom filter */
8940 bloom_linebreak = make_bloom_mask(
8941 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8942 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00008943
8944 PyType_Ready(&EncodingMapType);
Benjamin Peterson6da3ed62012-10-30 23:21:10 -04008945
8946 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
8947 Py_FatalError("Can't initialize field name iterator type");
8948
8949 if (PyType_Ready(&PyFormatterIter_Type) < 0)
8950 Py_FatalError("Can't initialize formatter iter type");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008951}
8952
8953/* Finalize the Unicode implementation */
8954
Christian Heimes3b718a72008-02-14 12:47:33 +00008955int
8956PyUnicode_ClearFreeList(void)
8957{
8958 int freelist_size = numfree;
8959 PyUnicodeObject *u;
8960
8961 for (u = free_list; u != NULL;) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008962 PyUnicodeObject *v = u;
8963 u = *(PyUnicodeObject **)u;
8964 if (v->str)
8965 PyObject_DEL(v->str);
8966 Py_XDECREF(v->defenc);
8967 PyObject_Del(v);
8968 numfree--;
Christian Heimes3b718a72008-02-14 12:47:33 +00008969 }
8970 free_list = NULL;
8971 assert(numfree == 0);
8972 return freelist_size;
8973}
8974
Guido van Rossumd57fd912000-03-10 22:53:23 +00008975void
Thomas Wouters78890102000-07-22 19:25:51 +00008976_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008977{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008978 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008979
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02008980 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008981
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02008982 for (i = 0; i < 256; i++)
8983 Py_CLEAR(unicode_latin1[i]);
8984
Christian Heimes3b718a72008-02-14 12:47:33 +00008985 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00008986}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008987
Anthony Baxterac6bd462006-04-13 02:06:09 +00008988#ifdef __cplusplus
8989}
8990#endif