blob: 6c46263222bf178b4840fbdec9606b60ee973548 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson1c5d21d2009-01-31 22:33:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Guido van Rossumd57fd912000-03-10 22:53:23 +000052/* Limit for the Unicode object free list */
53
Christian Heimes5b970ad2008-02-06 13:33:44 +000054#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000055
56/* Limit for the Unicode object free list stay alive optimization.
57
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000060 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000061
Christian Heimes5b970ad2008-02-06 13:33:44 +000062 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000063 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000064 malloc()-overhead) bytes of unused garbage.
65
66 Setting the limit to 0 effectively turns the feature off.
67
Guido van Rossumfd4b9572000-04-10 13:51:10 +000068 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71*/
72
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
75/* Endianness switches; defaults to little endian */
76
77#ifdef WORDS_BIGENDIAN
78# define BYTEORDER_IS_BIG_ENDIAN
79#else
80# define BYTEORDER_IS_LITTLE_ENDIAN
81#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
Serhiy Storchakac59c85c2013-01-26 12:13:40 +020085NOTE: In the interpreter's initialization phase, some globals are currently
86 initialized dynamically as needed. In the process Unicode objects may
87 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000088
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Anthony Baxterac6bd462006-04-13 02:06:09 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Guido van Rossumd57fd912000-03-10 22:53:23 +000096/* Free list for Unicode objects */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +020097static PyUnicodeObject *free_list = NULL;
98static int numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000099
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000100/* The empty Unicode object is shared to improve performance. */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200101static PyUnicodeObject *unicode_empty = NULL;
102
/* Return from the enclosing function with a new reference to the shared
   empty Unicode object.  The object is created through _PyUnicode_New(0)
   the first time it is needed; if that fails, unicode_empty stays NULL
   and the enclosing function returns NULL. */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        if (unicode_empty == NULL)                      \
            unicode_empty = _PyUnicode_New(0);          \
        if (unicode_empty != NULL)                      \
            Py_INCREF(unicode_empty);                   \
        return (PyObject *)unicode_empty;               \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000114
115/* Single character Unicode strings in the Latin-1 range are being
116 shared as well. */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200117static PyUnicodeObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000118
Fred Drakee4315f52000-05-09 19:53:39 +0000119/* Default encoding to use and assume when NULL is passed as encoding
120 parameter; it is initialized by _PyUnicode_Init().
121
122 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000123 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000124
125*/
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200126static char unicode_default_encoding[100 + 1] = "ascii";
Fred Drakee4315f52000-05-09 19:53:39 +0000127
/* Fast detection of the most frequent whitespace characters.
   Indexed by code point 0x00-0x7F; an entry of 1 marks whitespace. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
158
/* Same for linebreaks.
   Indexed by code point 0x00-0x7F; an entry of 1 marks a line break. */
static unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
186
187
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000188Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000189PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000190{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000191#ifdef Py_UNICODE_WIDE
Benjamin Peterson857ce152009-01-31 16:29:18 +0000192 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000193#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000194 /* This is actually an illegal character, so it should
195 not be passed to unichr. */
196 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000197#endif
198}
199
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000200/* --- Bloom Filters ----------------------------------------------------- */
201
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low bits of
   each Unicode character (ch & (BLOOM_WIDTH - 1)) as the bit index. */
205
206/* the linebreak mask is set up by Unicode_Init below */
207
Antoine Pitrou10042922010-01-13 14:01:26 +0000208#if LONG_BIT >= 128
209#define BLOOM_WIDTH 128
210#elif LONG_BIT >= 64
211#define BLOOM_WIDTH 64
212#elif LONG_BIT >= 32
213#define BLOOM_WIDTH 32
214#else
215#error "LONG_BIT is smaller than 32"
216#endif
217
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000218#define BLOOM_MASK unsigned long
219
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200220static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000221
Antoine Pitrou10042922010-01-13 14:01:26 +0000222#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
223#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000224
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000225#define BLOOM_LINEBREAK(ch) \
226 ((ch) < 128U ? ascii_linebreak[(ch)] : \
227 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000228
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000229Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000230{
231 /* calculate simple bloom-style bitmask for a given unicode string */
232
Antoine Pitrou10042922010-01-13 14:01:26 +0000233 BLOOM_MASK mask;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000234 Py_ssize_t i;
235
236 mask = 0;
237 for (i = 0; i < len; i++)
Antoine Pitrou64672132010-01-13 07:55:48 +0000238 BLOOM_ADD(mask, ptr[i]);
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000239
240 return mask;
241}
242
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000243Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000244{
245 Py_ssize_t i;
246
247 for (i = 0; i < setlen; i++)
248 if (set[i] == chr)
249 return 1;
250
Fredrik Lundh77633512006-05-23 19:47:35 +0000251 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000252}
253
/* Nonzero if chr is one of the setlen characters in set.  The bloom mask
   is tested first so that unicode_member() only runs when the mask bit
   for chr is set.  The expansion is fully parenthesized so the macro can
   be combined with other operators (e.g. '!' or '||').
   Note: chr is evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
256
Guido van Rossumd57fd912000-03-10 22:53:23 +0000257/* --- Unicode Object ----------------------------------------------------- */
258
259static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000260int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000261 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262{
263 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000264
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000265 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 if (unicode->length == length)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000267 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000268
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000269 /* Resizing shared object (unicode_empty or single character
270 objects) in-place is not allowed. Use PyUnicode_Resize()
271 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000272
Benjamin Peterson857ce152009-01-31 16:29:18 +0000273 if (unicode == unicode_empty ||
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000274 (unicode->length == 1 &&
275 unicode->str[0] < 256U &&
276 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000278 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279 return -1;
280 }
281
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000282 /* We allocate one more byte to make sure the string is Ux0000 terminated.
283 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000284 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000285 it contains). */
286
Guido van Rossumd57fd912000-03-10 22:53:23 +0000287 oldstr = unicode->str;
Neal Norwitz419fd492008-03-17 20:22:43 +0000288 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000289 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000291 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000292 PyErr_NoMemory();
293 return -1;
294 }
295 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000296 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000298 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000300 if (unicode->defenc) {
Georg Brandld070cc52010-08-01 21:06:46 +0000301 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000302 }
303 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000304
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305 return 0;
306}
307
/* We allocate room for one more Py_UNICODE element to make sure the string
   is Ux0000 terminated; some code relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000310
311 XXX This allocator could further be enhanced by assuring that the
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000312 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000313
314*/
315
316static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000317PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000318{
319 register PyUnicodeObject *unicode;
320
Andrew Dalkee0df7622006-05-27 11:04:36 +0000321 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000322 if (length == 0 && unicode_empty != NULL) {
323 Py_INCREF(unicode_empty);
324 return unicode_empty;
325 }
326
Neal Norwitze7d8be82008-07-31 17:17:14 +0000327 /* Ensure we won't overflow the size. */
328 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
329 return (PyUnicodeObject *)PyErr_NoMemory();
330 }
331
Guido van Rossumd57fd912000-03-10 22:53:23 +0000332 /* Unicode freelist & memory allocation */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000333 if (free_list) {
334 unicode = free_list;
335 free_list = *(PyUnicodeObject **)unicode;
336 numfree--;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000337 if (unicode->str) {
338 /* Keep-Alive optimization: we only upsize the buffer,
339 never downsize it. */
340 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000341 unicode_resize(unicode, length) < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000342 PyObject_DEL(unicode->str);
343 unicode->str = NULL;
344 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000345 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000346 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000347 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
348 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000349 }
350 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000351 }
352 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000353 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000354 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000355 if (unicode == NULL)
356 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000357 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
358 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000359 }
360
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000361 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000362 PyErr_NoMemory();
363 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000364 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000365 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000366 * the caller fails before initializing str -- unicode_resize()
367 * reads str[0], and the Keep-Alive optimization can keep memory
368 * allocated for str alive across a call to unicode_dealloc(unicode).
369 * We don't want unicode_resize to read uninitialized memory in
370 * that case.
371 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000372 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000373 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000374 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000375 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000376 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000377 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000378
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000379 onError:
Amaury Forgeot d'Arc06847b12008-07-31 23:39:05 +0000380 /* XXX UNREF/NEWREF interface should be more symmetrical */
381 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000382 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000383 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000384 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000385}
386
387static
Guido van Rossum9475a232001-10-05 20:51:39 +0000388void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000390 if (PyUnicode_CheckExact(unicode) &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000391 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000392 /* Keep-Alive optimization */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000393 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
394 PyObject_DEL(unicode->str);
395 unicode->str = NULL;
396 unicode->length = 0;
397 }
398 if (unicode->defenc) {
Georg Brandld070cc52010-08-01 21:06:46 +0000399 Py_CLEAR(unicode->defenc);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000400 }
401 /* Add to free list */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000402 *(PyUnicodeObject **)unicode = free_list;
403 free_list = unicode;
404 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000405 }
406 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000407 PyObject_DEL(unicode->str);
408 Py_XDECREF(unicode->defenc);
409 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000410 }
411}
412
/* Resize the Unicode object *unicode to length characters.

   The object must be a Unicode object with a reference count of exactly
   1 and length must not be negative; otherwise a bad-internal-call
   error is raised.  Returns 0 on success, -1 on failure.

   unicode_empty and objects of length 1 may be shared, so for those a
   fresh object is created, the common prefix is copied over and
   *unicode is replaced.  All other objects are resized in place. */
static
int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
{
    register PyUnicodeObject *cur;
    PyUnicodeObject *fresh;
    Py_ssize_t ncopy;

    /* Argument checks */
    if (unicode == NULL) {
        PyErr_BadInternalCall();
        return -1;
    }
    cur = *unicode;
    if (cur == NULL || !PyUnicode_Check(cur) || Py_REFCNT(cur) != 1 || length < 0) {
        PyErr_BadInternalCall();
        return -1;
    }

    /* Unshared objects (and requests that keep the current length) are
       handled in place, so *unicode does not need to change. */
    if (cur->length == length ||
        (cur != unicode_empty && cur->length != 1))
        return unicode_resize(cur, length);

    /* Possibly shared object: hand back a fresh copy with the same
       Unicode content. */
    fresh = _PyUnicode_New(length);
    if (fresh == NULL)
        return -1;
    ncopy = (length < cur->length) ? length : cur->length;
    Py_UNICODE_COPY(fresh->str, cur->str, ncopy);
    Py_DECREF(*unicode);
    *unicode = fresh;
    return 0;
}
448
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000449int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
450{
451 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
452}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000453
/* Create a Unicode object from size Py_UNICODE characters at u.

   If u is NULL, a new object of the given size is returned whose
   contents are left for the caller to fill in.  Otherwise the data is
   copied, except that the empty string and single characters below 256
   are served from shared objects.

   Returns a new reference, or NULL on failure. */
PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
                                Py_ssize_t size)
{
    PyUnicodeObject *result;

    /* No data known at construction time: just allocate. */
    if (u == NULL) {
        result = _PyUnicode_New(size);
        return (PyObject *)result;
    }

    /* Optimization for empty strings */
    if (size == 0)
        _Py_RETURN_UNICODE_EMPTY();

    /* Single character Unicode objects in the Latin-1 range are
       shared when using this constructor */
    if (size == 1 && *u < 256) {
        PyUnicodeObject *cached = unicode_latin1[*u];
        if (cached == NULL) {
            cached = _PyUnicode_New(1);
            if (cached == NULL)
                return NULL;
            cached->str[0] = *u;
            /* The cache keeps the reference created above. */
            unicode_latin1[*u] = cached;
        }
        Py_INCREF(cached);
        return (PyObject *)cached;
    }

    result = _PyUnicode_New(size);
    if (result == NULL)
        return NULL;

    /* Copy the Unicode data into the new object */
    Py_UNICODE_COPY(result->str, u, size);
    return (PyObject *)result;
}
493
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000494PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
495{
496 PyUnicodeObject *unicode;
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000497
Benjamin Peterson857ce152009-01-31 16:29:18 +0000498 if (size < 0) {
499 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000500 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson857ce152009-01-31 16:29:18 +0000501 return NULL;
502 }
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000503
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000504 /* If the Unicode data is known at construction time, we can apply
505 some optimizations which share commonly used objects.
506 Also, this means the input must be UTF-8, so fall back to the
507 UTF-8 decoder at the end. */
508 if (u != NULL) {
509
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000510 /* Optimization for empty strings */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200511 if (size == 0)
512 _Py_RETURN_UNICODE_EMPTY();
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000513
514 /* Single characters are shared when using this constructor.
515 Restrict to ASCII, since the input must be UTF-8. */
516 if (size == 1 && Py_CHARMASK(*u) < 128) {
517 unicode = unicode_latin1[Py_CHARMASK(*u)];
518 if (!unicode) {
519 unicode = _PyUnicode_New(1);
520 if (!unicode)
521 return NULL;
522 unicode->str[0] = Py_CHARMASK(*u);
523 unicode_latin1[Py_CHARMASK(*u)] = unicode;
524 }
525 Py_INCREF(unicode);
526 return (PyObject *)unicode;
527 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000528
529 return PyUnicode_DecodeUTF8(u, size, NULL);
530 }
531
532 unicode = _PyUnicode_New(size);
533 if (!unicode)
534 return NULL;
535
536 return (PyObject *)unicode;
537}
538
/* Create a Unicode object from the NUL-terminated string u by measuring
   it with strlen() and delegating to PyUnicode_FromStringAndSize().
   Raises OverflowError if the string is longer than PY_SSIZE_T_MAX. */
PyObject *PyUnicode_FromString(const char *u)
{
    const size_t nbytes = strlen(u);

    if (nbytes <= PY_SSIZE_T_MAX)
        return PyUnicode_FromStringAndSize(u, nbytes);

    PyErr_SetString(PyExc_OverflowError, "input too long");
    return NULL;
}
549
Serhiy Storchakae822b032013-08-06 16:56:26 +0300550/* _Py_UNICODE_NEXT is a private macro used to retrieve the character pointed
551 * by 'ptr', possibly combining surrogate pairs on narrow builds.
552 * 'ptr' and 'end' must be Py_UNICODE*, with 'ptr' pointing at the character
553 * that should be returned and 'end' pointing to the end of the buffer.
554 * ('end' is used on narrow builds to detect a lone surrogate at the
555 * end of the buffer that should be returned unchanged.)
 * The ptr and end arguments should be side-effect free and ptr must be an lvalue.
557 * The type of the returned char is always Py_UCS4.
558 *
559 * Note: the macro advances ptr to next char, so it might have side-effects
560 * (especially if used with other macros).
561 */
562
563/* helper macros used by _Py_UNICODE_NEXT */
/* Nonzero if ch lies in the high (0xD800-0xDBFF) or low (0xDC00-0xDFFF)
   surrogate range.  ch is parenthesized so that expression arguments
   expand correctly; it is evaluated twice. */
#define _Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define _Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value. */
#define _Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)

/* Fetch the character at ptr as a Py_UCS4 and advance ptr past it.
   Wide builds read a single element.  Narrow builds combine a high
   surrogate at ptr (with ptr < end) and a following low surrogate into
   one code point and advance by two; anything else is returned
   unchanged and ptr advances by one. */
#ifdef Py_UNICODE_WIDE
#define _Py_UNICODE_NEXT(ptr, end) ((Py_UCS4)*(ptr)++)
#else
#define _Py_UNICODE_NEXT(ptr, end)                                      \
     (((_Py_UNICODE_IS_HIGH_SURROGATE(*(ptr)) && (ptr) < (end)) &&      \
        _Py_UNICODE_IS_LOW_SURROGATE((ptr)[1])) ?                       \
       ((ptr) += 2,_Py_UNICODE_JOIN_SURROGATES((ptr)[-2], (ptr)[-1])) : \
       (Py_UCS4)*(ptr)++)
#endif
580
Guido van Rossumd57fd912000-03-10 22:53:23 +0000581#ifdef HAVE_WCHAR_H
582
Mark Dickinson6b265f12009-03-18 16:07:26 +0000583#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
584# define CONVERT_WCHAR_TO_SURROGATES
585#endif
586
587#ifdef CONVERT_WCHAR_TO_SURROGATES
588
589/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
590 to convert from UTF32 to UTF16. */
591
592PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
593 Py_ssize_t size)
594{
595 PyUnicodeObject *unicode;
596 register Py_ssize_t i;
597 Py_ssize_t alloc;
598 const wchar_t *orig_w;
599
600 if (w == NULL) {
601 PyErr_BadInternalCall();
602 return NULL;
603 }
604
605 alloc = size;
606 orig_w = w;
607 for (i = size; i > 0; i--) {
608 if (*w > 0xFFFF)
609 alloc++;
610 w++;
611 }
612 w = orig_w;
613 unicode = _PyUnicode_New(alloc);
614 if (!unicode)
615 return NULL;
616
617 /* Copy the wchar_t data into the new object */
618 {
619 register Py_UNICODE *u;
620 u = PyUnicode_AS_UNICODE(unicode);
621 for (i = size; i > 0; i--) {
622 if (*w > 0xFFFF) {
623 wchar_t ordinal = *w++;
624 ordinal -= 0x10000;
625 *u++ = 0xD800 | (ordinal >> 10);
626 *u++ = 0xDC00 | (ordinal & 0x3FF);
627 }
628 else
629 *u++ = *w++;
630 }
631 }
632 return (PyObject *)unicode;
633}
634
635#else
636
Guido van Rossumd57fd912000-03-10 22:53:23 +0000637PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000638 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000639{
640 PyUnicodeObject *unicode;
641
642 if (w == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000643 PyErr_BadInternalCall();
644 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000645 }
646
647 unicode = _PyUnicode_New(size);
648 if (!unicode)
649 return NULL;
650
651 /* Copy the wchar_t data into the new object */
652#ifdef HAVE_USABLE_WCHAR_T
653 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000654#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000655 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000656 register Py_UNICODE *u;
657 register Py_ssize_t i;
658 u = PyUnicode_AS_UNICODE(unicode);
659 for (i = size; i > 0; i--)
660 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000661 }
662#endif
663
664 return (PyObject *)unicode;
665}
666
Mark Dickinson6b265f12009-03-18 16:07:26 +0000667#endif /* CONVERT_WCHAR_TO_SURROGATES */
668
669#undef CONVERT_WCHAR_TO_SURROGATES
670
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000671static void
672makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
673{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000674 *fmt++ = '%';
675 if (width) {
676 if (zeropad)
677 *fmt++ = '0';
678 fmt += sprintf(fmt, "%d", width);
679 }
680 if (precision)
681 fmt += sprintf(fmt, ".%d", precision);
682 if (longflag)
683 *fmt++ = 'l';
684 else if (size_tflag) {
685 char *f = PY_FORMAT_SIZE_T;
686 while (*f)
687 *fmt++ = *f++;
688 }
689 *fmt++ = c;
690 *fmt = '\0';
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000691}
692
/* Append the bytes of the NUL-terminated C string `string` to the output,
   one output element per byte (each byte is taken as unsigned char).
   Uses two variables from the enclosing scope: `copy`, a scratch pointer
   that ends up at the terminating NUL, and `s`, the output cursor, which
   is advanced past the appended elements. */
#define appendstring(string)                    \
    do {                                        \
        copy = (string);                        \
        while (*copy) {                         \
            *s++ = (unsigned char)*copy;        \
            copy++;                             \
        }                                       \
    } while (0)
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000699
700PyObject *
701PyUnicode_FromFormatV(const char *format, va_list vargs)
702{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000703 va_list count;
704 Py_ssize_t callcount = 0;
705 PyObject **callresults = NULL;
706 PyObject **callresult = NULL;
707 Py_ssize_t n = 0;
708 int width = 0;
709 int precision = 0;
710 int zeropad;
711 const char* f;
712 Py_UNICODE *s;
713 PyObject *string;
714 /* used by sprintf */
715 char buffer[21];
716 /* use abuffer instead of buffer, if we need more space
717 * (which can happen if there's a format specifier with width). */
718 char *abuffer = NULL;
719 char *realbuffer;
720 Py_ssize_t abuffersize = 0;
721 char fmt[60]; /* should be enough for %0width.precisionld */
722 const char *copy;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000723
724#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson857ce152009-01-31 16:29:18 +0000725 Py_MEMCPY(count, vargs, sizeof(va_list));
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000726#else
727#ifdef __va_copy
Benjamin Peterson857ce152009-01-31 16:29:18 +0000728 __va_copy(count, vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000729#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000730 count = vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000731#endif
732#endif
Walter Dörwalded960ac2009-05-03 22:36:33 +0000733 /* step 1: count the number of %S/%R/%s format specifications
734 * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
735 * objects once during step 3 and put the result in an array) */
Benjamin Peterson857ce152009-01-31 16:29:18 +0000736 for (f = format; *f; f++) {
Walter Dörwalded960ac2009-05-03 22:36:33 +0000737 if (*f == '%') {
Serhiy Storchaka0e0282e2015-01-27 22:17:56 +0200738 f++;
739 while (*f && *f != '%' && !isalpha((unsigned)*f))
740 f++;
Serhiy Storchaka227526d2015-01-31 01:15:29 +0200741 if (!*f)
742 break;
Serhiy Storchaka0e0282e2015-01-27 22:17:56 +0200743 if (*f == 's' || *f=='S' || *f=='R')
Walter Dörwalded960ac2009-05-03 22:36:33 +0000744 ++callcount;
745 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000746 }
747 /* step 2: allocate memory for the results of
Walter Dörwalded960ac2009-05-03 22:36:33 +0000748 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson857ce152009-01-31 16:29:18 +0000749 if (callcount) {
750 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
751 if (!callresults) {
752 PyErr_NoMemory();
753 return NULL;
754 }
755 callresult = callresults;
756 }
757 /* step 3: figure out how large a buffer we need */
758 for (f = format; *f; f++) {
759 if (*f == '%') {
Serhiy Storchaka0e0282e2015-01-27 22:17:56 +0200760 const char* p = f++;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000761 width = 0;
762 while (isdigit((unsigned)*f))
763 width = (width*10) + *f++ - '0';
Serhiy Storchaka0e0282e2015-01-27 22:17:56 +0200764 precision = 0;
765 if (*f == '.') {
766 f++;
767 while (isdigit((unsigned)*f))
768 precision = (precision*10) + *f++ - '0';
769 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000770
Benjamin Peterson857ce152009-01-31 16:29:18 +0000771 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
772 * they don't affect the amount of space we reserve.
773 */
774 if ((*f == 'l' || *f == 'z') &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000775 (f[1] == 'd' || f[1] == 'u'))
776 ++f;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000777
Benjamin Peterson857ce152009-01-31 16:29:18 +0000778 switch (*f) {
779 case 'c':
Serhiy Storchakaba908c72013-06-23 20:22:09 +0300780 {
781 int ordinal = va_arg(count, int);
782#ifdef Py_UNICODE_WIDE
783 if (ordinal < 0 || ordinal > 0x10ffff) {
784 PyErr_SetString(PyExc_OverflowError,
785 "%c arg not in range(0x110000) "
786 "(wide Python build)");
787 goto fail;
788 }
789#else
790 if (ordinal < 0 || ordinal > 0xffff) {
791 PyErr_SetString(PyExc_OverflowError,
792 "%c arg not in range(0x10000) "
793 "(narrow Python build)");
794 goto fail;
795 }
796#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +0000797 /* fall through... */
Serhiy Storchakaba908c72013-06-23 20:22:09 +0300798 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000799 case '%':
800 n++;
801 break;
802 case 'd': case 'u': case 'i': case 'x':
803 (void) va_arg(count, int);
Serhiy Storchaka0e0282e2015-01-27 22:17:56 +0200804 if (width < precision)
805 width = precision;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000806 /* 20 bytes is enough to hold a 64-bit
807 integer. Decimal takes the most space.
808 This isn't enough for octal.
809 If a width is specified we need more
810 (which we allocate later). */
811 if (width < 20)
812 width = 20;
813 n += width;
814 if (abuffersize < width)
815 abuffersize = width;
816 break;
817 case 's':
818 {
819 /* UTF-8 */
Georg Brandlba68a992009-05-05 09:19:43 +0000820 const char *s = va_arg(count, const char*);
Walter Dörwalded960ac2009-05-03 22:36:33 +0000821 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
822 if (!str)
823 goto fail;
824 n += PyUnicode_GET_SIZE(str);
825 /* Remember the str and switch to the next slot */
826 *callresult++ = str;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000827 break;
828 }
829 case 'U':
830 {
831 PyObject *obj = va_arg(count, PyObject *);
832 assert(obj && PyUnicode_Check(obj));
833 n += PyUnicode_GET_SIZE(obj);
834 break;
835 }
836 case 'V':
837 {
838 PyObject *obj = va_arg(count, PyObject *);
839 const char *str = va_arg(count, const char *);
840 assert(obj || str);
841 assert(!obj || PyUnicode_Check(obj));
842 if (obj)
843 n += PyUnicode_GET_SIZE(obj);
844 else
845 n += strlen(str);
846 break;
847 }
848 case 'S':
849 {
850 PyObject *obj = va_arg(count, PyObject *);
851 PyObject *str;
852 assert(obj);
853 str = PyObject_Str(obj);
854 if (!str)
855 goto fail;
Victor Stinner2af8d2f2014-07-30 00:39:05 +0200856 n += PyString_GET_SIZE(str);
Benjamin Peterson857ce152009-01-31 16:29:18 +0000857 /* Remember the str and switch to the next slot */
858 *callresult++ = str;
859 break;
860 }
861 case 'R':
862 {
863 PyObject *obj = va_arg(count, PyObject *);
864 PyObject *repr;
865 assert(obj);
866 repr = PyObject_Repr(obj);
867 if (!repr)
868 goto fail;
869 n += PyUnicode_GET_SIZE(repr);
870 /* Remember the repr and switch to the next slot */
871 *callresult++ = repr;
872 break;
873 }
874 case 'p':
875 (void) va_arg(count, int);
876 /* maximum 64-bit pointer representation:
877 * 0xffffffffffffffff
878 * so 19 characters is enough.
879 * XXX I count 18 -- what's the extra for?
880 */
881 n += 19;
882 break;
883 default:
884 /* if we stumble upon an unknown
885 formatting code, copy the rest of
886 the format string to the output
887 string. (we cannot just skip the
888 code, since there's no way to know
889 what's in the argument list) */
890 n += strlen(p);
891 goto expand;
892 }
893 } else
894 n++;
895 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000896 expand:
Benjamin Peterson857ce152009-01-31 16:29:18 +0000897 if (abuffersize > 20) {
Serhiy Storchaka5ec0bbf2015-01-30 23:35:03 +0200898 /* add 1 for sprintf's trailing null byte */
899 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson857ce152009-01-31 16:29:18 +0000900 if (!abuffer) {
901 PyErr_NoMemory();
902 goto fail;
903 }
904 realbuffer = abuffer;
905 }
906 else
907 realbuffer = buffer;
908 /* step 4: fill the buffer */
909 /* Since we've analyzed how much space we need for the worst case,
910 we don't have to resize the string.
911 There can be no errors beyond this point. */
912 string = PyUnicode_FromUnicode(NULL, n);
913 if (!string)
914 goto fail;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000915
Benjamin Peterson857ce152009-01-31 16:29:18 +0000916 s = PyUnicode_AS_UNICODE(string);
917 callresult = callresults;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000918
Benjamin Peterson857ce152009-01-31 16:29:18 +0000919 for (f = format; *f; f++) {
920 if (*f == '%') {
921 const char* p = f++;
922 int longflag = 0;
923 int size_tflag = 0;
924 zeropad = (*f == '0');
925 /* parse the width.precision part */
926 width = 0;
927 while (isdigit((unsigned)*f))
928 width = (width*10) + *f++ - '0';
929 precision = 0;
930 if (*f == '.') {
931 f++;
932 while (isdigit((unsigned)*f))
933 precision = (precision*10) + *f++ - '0';
934 }
935 /* handle the long flag, but only for %ld and %lu.
936 others can be added when necessary. */
937 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
938 longflag = 1;
939 ++f;
940 }
941 /* handle the size_t flag. */
942 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
943 size_tflag = 1;
944 ++f;
945 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000946
Benjamin Peterson857ce152009-01-31 16:29:18 +0000947 switch (*f) {
948 case 'c':
949 *s++ = va_arg(vargs, int);
950 break;
951 case 'd':
952 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
953 if (longflag)
954 sprintf(realbuffer, fmt, va_arg(vargs, long));
955 else if (size_tflag)
956 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
957 else
958 sprintf(realbuffer, fmt, va_arg(vargs, int));
959 appendstring(realbuffer);
960 break;
961 case 'u':
962 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
963 if (longflag)
964 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
965 else if (size_tflag)
966 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
967 else
968 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
969 appendstring(realbuffer);
970 break;
971 case 'i':
972 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
973 sprintf(realbuffer, fmt, va_arg(vargs, int));
974 appendstring(realbuffer);
975 break;
976 case 'x':
977 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
978 sprintf(realbuffer, fmt, va_arg(vargs, int));
979 appendstring(realbuffer);
980 break;
981 case 's':
982 {
Walter Dörwalded960ac2009-05-03 22:36:33 +0000983 /* unused, since we already have the result */
984 (void) va_arg(vargs, char *);
985 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
986 PyUnicode_GET_SIZE(*callresult));
987 s += PyUnicode_GET_SIZE(*callresult);
988 /* We're done with the unicode()/repr() => forget it */
989 Py_DECREF(*callresult);
990 /* switch to next unicode()/repr() result */
991 ++callresult;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000992 break;
993 }
994 case 'U':
995 {
996 PyObject *obj = va_arg(vargs, PyObject *);
997 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
998 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
999 s += size;
1000 break;
1001 }
1002 case 'V':
1003 {
1004 PyObject *obj = va_arg(vargs, PyObject *);
1005 const char *str = va_arg(vargs, const char *);
1006 if (obj) {
1007 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1008 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1009 s += size;
1010 } else {
1011 appendstring(str);
1012 }
1013 break;
1014 }
1015 case 'S':
1016 case 'R':
1017 {
Victor Stinner2af8d2f2014-07-30 00:39:05 +02001018 const char *str = PyString_AS_STRING(*callresult);
Benjamin Peterson857ce152009-01-31 16:29:18 +00001019 /* unused, since we already have the result */
1020 (void) va_arg(vargs, PyObject *);
Victor Stinner2af8d2f2014-07-30 00:39:05 +02001021 appendstring(str);
Benjamin Peterson857ce152009-01-31 16:29:18 +00001022 /* We're done with the unicode()/repr() => forget it */
1023 Py_DECREF(*callresult);
1024 /* switch to next unicode()/repr() result */
1025 ++callresult;
1026 break;
1027 }
1028 case 'p':
1029 sprintf(buffer, "%p", va_arg(vargs, void*));
1030 /* %p is ill-defined: ensure leading 0x. */
1031 if (buffer[1] == 'X')
1032 buffer[1] = 'x';
1033 else if (buffer[1] != 'x') {
1034 memmove(buffer+2, buffer, strlen(buffer)+1);
1035 buffer[0] = '0';
1036 buffer[1] = 'x';
1037 }
1038 appendstring(buffer);
1039 break;
1040 case '%':
1041 *s++ = '%';
1042 break;
1043 default:
1044 appendstring(p);
1045 goto end;
1046 }
1047 } else
1048 *s++ = *f;
1049 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001050
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001051 end:
Benjamin Peterson857ce152009-01-31 16:29:18 +00001052 if (callresults)
1053 PyObject_Free(callresults);
1054 if (abuffer)
1055 PyObject_Free(abuffer);
1056 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1057 return string;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001058 fail:
Benjamin Peterson857ce152009-01-31 16:29:18 +00001059 if (callresults) {
1060 PyObject **callresult2 = callresults;
1061 while (callresult2 < callresult) {
1062 Py_DECREF(*callresult2);
1063 ++callresult2;
1064 }
1065 PyObject_Free(callresults);
1066 }
1067 if (abuffer)
1068 PyObject_Free(abuffer);
1069 return NULL;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001070}
1071
1072#undef appendstring
1073
1074PyObject *
1075PyUnicode_FromFormat(const char *format, ...)
1076{
Benjamin Peterson857ce152009-01-31 16:29:18 +00001077 PyObject* ret;
1078 va_list vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001079
1080#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson857ce152009-01-31 16:29:18 +00001081 va_start(vargs, format);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001082#else
Benjamin Peterson857ce152009-01-31 16:29:18 +00001083 va_start(vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001084#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00001085 ret = PyUnicode_FromFormatV(format, vargs);
1086 va_end(vargs);
1087 return ret;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001088}
1089
Martin v. Löwis18e16552006-02-15 17:27:45 +00001090Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001091 wchar_t *w,
1092 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001093{
1094 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001095 PyErr_BadInternalCall();
1096 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001097 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001098
1099 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001100 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001101 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001102
Guido van Rossumd57fd912000-03-10 22:53:23 +00001103#ifdef HAVE_USABLE_WCHAR_T
1104 memcpy(w, unicode->str, size * sizeof(wchar_t));
1105#else
1106 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001107 register Py_UNICODE *u;
1108 register Py_ssize_t i;
1109 u = PyUnicode_AS_UNICODE(unicode);
1110 for (i = size; i > 0; i--)
1111 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001112 }
1113#endif
1114
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001115 if (size > PyUnicode_GET_SIZE(unicode))
1116 return PyUnicode_GET_SIZE(unicode);
1117 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001118 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001119}
1120
1121#endif
1122
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001123PyObject *PyUnicode_FromOrdinal(int ordinal)
1124{
Hye-Shik Chang40574832004-04-06 07:24:51 +00001125 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001126
1127#ifdef Py_UNICODE_WIDE
1128 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001129 PyErr_SetString(PyExc_ValueError,
1130 "unichr() arg not in range(0x110000) "
1131 "(wide Python build)");
1132 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001133 }
1134#else
1135 if (ordinal < 0 || ordinal > 0xffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001136 PyErr_SetString(PyExc_ValueError,
1137 "unichr() arg not in range(0x10000) "
1138 "(narrow Python build)");
1139 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001140 }
1141#endif
1142
Hye-Shik Chang40574832004-04-06 07:24:51 +00001143 s[0] = (Py_UNICODE)ordinal;
1144 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001145}
1146
Guido van Rossumd57fd912000-03-10 22:53:23 +00001147PyObject *PyUnicode_FromObject(register PyObject *obj)
1148{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001149 /* XXX Perhaps we should make this API an alias of
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001150 PyObject_Unicode() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001151 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001152 Py_INCREF(obj);
1153 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001154 }
1155 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001156 /* For a Unicode subtype that's not a Unicode object,
1157 return a true Unicode object with the same data. */
1158 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1159 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001160 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001161 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1162}
1163
1164PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001165 const char *encoding,
1166 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001167{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001168 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001169 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001170 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001171
Guido van Rossumd57fd912000-03-10 22:53:23 +00001172 if (obj == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001173 PyErr_BadInternalCall();
1174 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001175 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001176
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001177#if 0
1178 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001179 that no encodings is given and then redirect to
1180 PyObject_Unicode() which then applies the additional logic for
1181 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001182
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001183 NOTE: This API should really only be used for object which
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001184 represent *encoded* Unicode !
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001185
1186 */
Benjamin Peterson857ce152009-01-31 16:29:18 +00001187 if (PyUnicode_Check(obj)) {
1188 if (encoding) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001189 PyErr_SetString(PyExc_TypeError,
1190 "decoding Unicode is not supported");
1191 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001192 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001193 return PyObject_Unicode(obj);
1194 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001195#else
1196 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001197 PyErr_SetString(PyExc_TypeError,
1198 "decoding Unicode is not supported");
1199 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001200 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001201#endif
1202
1203 /* Coerce object */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001204 if (PyString_Check(obj)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001205 s = PyString_AS_STRING(obj);
1206 len = PyString_GET_SIZE(obj);
Christian Heimes1a6387e2008-03-26 12:49:49 +00001207 }
Christian Heimes3497f942008-05-26 12:29:14 +00001208 else if (PyByteArray_Check(obj)) {
Christian Heimes1a6387e2008-03-26 12:49:49 +00001209 /* Python 2.x specific */
1210 PyErr_Format(PyExc_TypeError,
1211 "decoding bytearray is not supported");
1212 return NULL;
1213 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001214 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001215 /* Overwrite the error message with something more useful in
1216 case of a TypeError. */
1217 if (PyErr_ExceptionMatches(PyExc_TypeError))
1218 PyErr_Format(PyExc_TypeError,
1219 "coercing to Unicode: need string or buffer, "
1220 "%.80s found",
1221 Py_TYPE(obj)->tp_name);
1222 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001223 }
Tim Petersced69f82003-09-16 20:30:58 +00001224
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001225 /* Convert to Unicode */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02001226 if (len == 0)
1227 _Py_RETURN_UNICODE_EMPTY();
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001228
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02001229 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001230 return v;
1231
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001232 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001233 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001234}
1235
1236PyObject *PyUnicode_Decode(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001237 Py_ssize_t size,
1238 const char *encoding,
1239 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001240{
1241 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001242
1243 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001244 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001245
1246 /* Shortcuts for common default encodings */
1247 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001248 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001249 else if (strcmp(encoding, "latin-1") == 0)
1250 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001251#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1252 else if (strcmp(encoding, "mbcs") == 0)
1253 return PyUnicode_DecodeMBCS(s, size, errors);
1254#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001255 else if (strcmp(encoding, "ascii") == 0)
1256 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001257
1258 /* Decode via the codec registry */
1259 buffer = PyBuffer_FromMemory((void *)s, size);
1260 if (buffer == NULL)
1261 goto onError;
Serhiy Storchakac7797dc2015-05-31 20:21:00 +03001262 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001263 if (unicode == NULL)
1264 goto onError;
1265 if (!PyUnicode_Check(unicode)) {
1266 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001267 "decoder did not return an unicode object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001268 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001269 Py_DECREF(unicode);
1270 goto onError;
1271 }
1272 Py_DECREF(buffer);
1273 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001274
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001275 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001276 Py_XDECREF(buffer);
1277 return NULL;
1278}
1279
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001280PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1281 const char *encoding,
1282 const char *errors)
1283{
1284 PyObject *v;
1285
1286 if (!PyUnicode_Check(unicode)) {
1287 PyErr_BadArgument();
1288 goto onError;
1289 }
1290
1291 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001292 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001293
1294 /* Decode via the codec registry */
Serhiy Storchakac7797dc2015-05-31 20:21:00 +03001295 v = _PyCodec_DecodeText(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001296 if (v == NULL)
1297 goto onError;
1298 return v;
1299
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001300 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001301 return NULL;
1302}
1303
Guido van Rossumd57fd912000-03-10 22:53:23 +00001304PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001305 Py_ssize_t size,
1306 const char *encoding,
1307 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001308{
1309 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001310
Guido van Rossumd57fd912000-03-10 22:53:23 +00001311 unicode = PyUnicode_FromUnicode(s, size);
1312 if (unicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001313 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001314 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1315 Py_DECREF(unicode);
1316 return v;
1317}
1318
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001319PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1320 const char *encoding,
1321 const char *errors)
1322{
1323 PyObject *v;
1324
1325 if (!PyUnicode_Check(unicode)) {
1326 PyErr_BadArgument();
1327 goto onError;
1328 }
1329
1330 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001331 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001332
1333 /* Encode via the codec registry */
Serhiy Storchakac7797dc2015-05-31 20:21:00 +03001334 v = _PyCodec_EncodeText(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001335 if (v == NULL)
1336 goto onError;
1337 return v;
1338
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001339 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001340 return NULL;
1341}
1342
Guido van Rossumd57fd912000-03-10 22:53:23 +00001343PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1344 const char *encoding,
1345 const char *errors)
1346{
1347 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001348
Guido van Rossumd57fd912000-03-10 22:53:23 +00001349 if (!PyUnicode_Check(unicode)) {
1350 PyErr_BadArgument();
1351 goto onError;
1352 }
Fred Drakee4315f52000-05-09 19:53:39 +00001353
Tim Petersced69f82003-09-16 20:30:58 +00001354 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001355 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001356
1357 /* Shortcuts for common default encodings */
1358 if (errors == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001359 if (strcmp(encoding, "utf-8") == 0)
1360 return PyUnicode_AsUTF8String(unicode);
1361 else if (strcmp(encoding, "latin-1") == 0)
1362 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001363#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001364 else if (strcmp(encoding, "mbcs") == 0)
1365 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001366#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001367 else if (strcmp(encoding, "ascii") == 0)
1368 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001369 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001370
1371 /* Encode via the codec registry */
Serhiy Storchakac7797dc2015-05-31 20:21:00 +03001372 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001373 if (v == NULL)
1374 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001375 if (!PyString_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001376 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001377 "encoder did not return a string object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001378 Py_TYPE(v)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001379 Py_DECREF(v);
1380 goto onError;
1381 }
1382 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001383
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001384 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001385 return NULL;
1386}
1387
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001388PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001389 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001390{
1391 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1392
1393 if (v)
1394 return v;
1395 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1396 if (v && errors == NULL)
1397 ((PyUnicodeObject *)unicode)->defenc = v;
1398 return v;
1399}
1400
Guido van Rossumd57fd912000-03-10 22:53:23 +00001401Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1402{
1403 if (!PyUnicode_Check(unicode)) {
1404 PyErr_BadArgument();
1405 goto onError;
1406 }
1407 return PyUnicode_AS_UNICODE(unicode);
1408
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001409 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001410 return NULL;
1411}
1412
Martin v. Löwis18e16552006-02-15 17:27:45 +00001413Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001414{
1415 if (!PyUnicode_Check(unicode)) {
1416 PyErr_BadArgument();
1417 goto onError;
1418 }
1419 return PyUnicode_GET_SIZE(unicode);
1420
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001421 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001422 return -1;
1423}
1424
Thomas Wouters78890102000-07-22 19:25:51 +00001425const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001426{
1427 return unicode_default_encoding;
1428}
1429
1430int PyUnicode_SetDefaultEncoding(const char *encoding)
1431{
1432 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001433
Fred Drakee4315f52000-05-09 19:53:39 +00001434 /* Make sure the encoding is valid. As side effect, this also
1435 loads the encoding into the codec registry cache. */
1436 v = _PyCodec_Lookup(encoding);
1437 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001438 goto onError;
Fred Drakee4315f52000-05-09 19:53:39 +00001439 Py_DECREF(v);
1440 strncpy(unicode_default_encoding,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001441 encoding,
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02001442 sizeof(unicode_default_encoding) - 1);
Fred Drakee4315f52000-05-09 19:53:39 +00001443 return 0;
1444
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001445 onError:
Fred Drakee4315f52000-05-09 19:53:39 +00001446 return -1;
1447}
1448
/* error handling callback helper:
   build the exception object, call the error handler callback and check
   the tuple it returns; if no exception occurred, copy the replacement to
   the output and adjust various state variables.
   return 0 on success, -1 on error
*/
1455
1456static
1457int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001458 const char *encoding, const char *reason,
1459 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1460 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1461 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001462{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001463 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001464
1465 PyObject *restuple = NULL;
1466 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001467 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1468 Py_ssize_t requiredsize;
1469 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001470 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001471 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001472 int res = -1;
1473
1474 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001475 *errorHandler = PyCodec_LookupError(errors);
1476 if (*errorHandler == NULL)
1477 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001478 }
1479
1480 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001481 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001482 encoding, input, insize, *startinpos, *endinpos, reason);
1483 if (*exceptionObject == NULL)
1484 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001485 }
1486 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001487 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1488 goto onError;
1489 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1490 goto onError;
1491 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1492 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001493 }
1494
1495 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1496 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001497 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001498 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00001499 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001500 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001501 }
1502 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001503 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001504 if (newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001505 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001506 if (newpos<0 || newpos>insize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001507 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1508 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001509 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001510
1511 /* need more space? (at least enough for what we
1512 have+the replacement+the rest of the string (starting
1513 at the new input position), so we won't have to check space
1514 when there are no errors in the rest of the string) */
1515 repptr = PyUnicode_AS_UNICODE(repunicode);
1516 repsize = PyUnicode_GET_SIZE(repunicode);
Benjamin Petersoned4c1302014-09-29 18:18:57 -04001517 requiredsize = *outpos;
1518 if (requiredsize > PY_SSIZE_T_MAX - repsize)
1519 goto overflow;
1520 requiredsize += repsize;
1521 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
1522 goto overflow;
1523 requiredsize += insize - newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001524 if (requiredsize > outsize) {
Benjamin Petersoned4c1302014-09-29 18:18:57 -04001525 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001526 requiredsize = 2*outsize;
1527 if (_PyUnicode_Resize(output, requiredsize) < 0)
1528 goto onError;
1529 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001530 }
1531 *endinpos = newpos;
1532 *inptr = input + newpos;
1533 Py_UNICODE_COPY(*outptr, repptr, repsize);
1534 *outptr += repsize;
1535 *outpos += repsize;
1536 /* we made it! */
1537 res = 0;
1538
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001539 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001540 Py_XDECREF(restuple);
1541 return res;
Benjamin Petersoned4c1302014-09-29 18:18:57 -04001542
1543 overflow:
1544 PyErr_SetString(PyExc_OverflowError,
1545 "decoded result is too long for a Python string");
1546 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001547}
1548
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001549/* --- UTF-7 Codec -------------------------------------------------------- */
1550
Antoine Pitrou653dece2009-05-04 18:32:32 +00001551/* See RFC2152 for details. We encode conservatively and decode liberally. */
1552
/* Three simple macros defining base-64, plus the direct-decoding test.
 * All of them may evaluate their argument more than once, so pass only
 * side-effect-free expressions. */

/* Is c a base-64 character?
 *
 * The test is spelled out with ASCII ranges instead of isalnum():
 * isalnum() depends on the current locale (so a byte >= 0x80 could be
 * accepted and then mapped to 63 by FROM_BASE64), and its behaviour is
 * undefined for values that are neither EOF nor representable as
 * unsigned char. */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* Given that c is a base-64 character, what is its base-64 value?
 * ('A'-'Z' -> 0-25, 'a'-'z' -> 26-51, '0'-'9' -> 52-61, '+' -> 62,
 * anything else -> 63.) */

#define FROM_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c) \
    ((c) <= 127 && (c) != '+')
1580
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        : character value; only 1..127 can ever be direct (the range
 *            test also guards the table lookup).
 * directO  : non-zero if Set O characters are written as themselves.
 * directWS : non-zero if whitespace is written as itself.
 *
 * The flag arguments are parenthesized in the expansion so that
 * compound expressions (e.g. `a || b`) keep their meaning next to the
 * `&&`.  c is evaluated more than once. */

#define ENCODE_DIRECT(c, directO, directWS) \
    ((c) < 128 && (c) > 0 && \
     ((utf7_category[(c)] == 0) || \
      ((directWS) && (utf7_category[(c)] == 2)) || \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001626
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001627PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001628 Py_ssize_t size,
1629 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001630{
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001631 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1632}
1633
Antoine Pitrou653dece2009-05-04 18:32:32 +00001634/* The decoder. The only state we preserve is our read position,
1635 * i.e. how many characters we have consumed. So if we end in the
1636 * middle of a shift sequence we have to back off the read position
1637 * and the output to the beginning of the sequence, otherwise we lose
1638 * all the shift state (seen bits, number of bits seen, high
1639 * surrogate). */
1640
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001641PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001642 Py_ssize_t size,
1643 const char *errors,
1644 Py_ssize_t *consumed)
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001645{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001646 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001647 Py_ssize_t startinpos;
1648 Py_ssize_t endinpos;
1649 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001650 const char *e;
1651 PyUnicodeObject *unicode;
1652 Py_UNICODE *p;
1653 const char *errmsg = "";
1654 int inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001655 Py_UNICODE *shiftOutStart;
1656 unsigned int base64bits = 0;
1657 unsigned long base64buffer = 0;
1658 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001659 PyObject *errorHandler = NULL;
1660 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001661
1662 unicode = _PyUnicode_New(size);
1663 if (!unicode)
1664 return NULL;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001665 if (size == 0) {
1666 if (consumed)
1667 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001668 return (PyObject *)unicode;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001669 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001670
1671 p = unicode->str;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001672 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001673 e = s + size;
1674
1675 while (s < e) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001676 Py_UNICODE ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001677
Antoine Pitrou653dece2009-05-04 18:32:32 +00001678 if (inShift) { /* in a base-64 section */
1679 if (IS_BASE64(ch)) { /* consume a base-64 character */
1680 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1681 base64bits += 6;
1682 s++;
1683 if (base64bits >= 16) {
1684 /* we have enough bits for a UTF-16 value */
1685 Py_UNICODE outCh = (Py_UNICODE)
1686 (base64buffer >> (base64bits-16));
1687 base64bits -= 16;
1688 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchakaf1056722013-10-19 20:37:49 +03001689 assert(outCh <= 0xffff);
Antoine Pitrou653dece2009-05-04 18:32:32 +00001690 if (surrogate) {
1691 /* expecting a second surrogate */
1692 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1693#ifdef Py_UNICODE_WIDE
1694 *p++ = (((surrogate & 0x3FF)<<10)
1695 | (outCh & 0x3FF)) + 0x10000;
1696#else
1697 *p++ = surrogate;
1698 *p++ = outCh;
1699#endif
1700 surrogate = 0;
Antoine Pitrou30402542011-11-15 01:49:40 +01001701 continue;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001702 }
1703 else {
Antoine Pitrou30402542011-11-15 01:49:40 +01001704 *p++ = surrogate;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001705 surrogate = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001706 }
1707 }
Antoine Pitrou30402542011-11-15 01:49:40 +01001708 if (outCh >= 0xD800 && outCh <= 0xDBFF) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001709 /* first surrogate */
1710 surrogate = outCh;
1711 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001712 else {
1713 *p++ = outCh;
1714 }
1715 }
1716 }
1717 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001718 inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001719 if (base64bits > 0) { /* left-over bits */
1720 if (base64bits >= 6) {
1721 /* We've seen at least one base-64 character */
Serhiy Storchakae12f6322015-10-02 13:14:53 +03001722 s++;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001723 errmsg = "partial character in shift sequence";
1724 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001725 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001726 else {
1727 /* Some bits remain; they should be zero */
1728 if (base64buffer != 0) {
Serhiy Storchakae12f6322015-10-02 13:14:53 +03001729 s++;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001730 errmsg = "non-zero padding bits in shift sequence";
1731 goto utf7Error;
1732 }
1733 }
1734 }
Serhiy Storchakae12f6322015-10-02 13:14:53 +03001735 if (surrogate && DECODE_DIRECT(ch))
1736 *p++ = surrogate;
1737 surrogate = 0;
1738 if (ch == '-') {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001739 /* '-' is absorbed; other terminating
1740 characters are preserved */
Serhiy Storchakae12f6322015-10-02 13:14:53 +03001741 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001742 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001743 }
1744 }
1745 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001746 startinpos = s-starts;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001747 s++; /* consume '+' */
1748 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001749 s++;
1750 *p++ = '+';
Antoine Pitrou653dece2009-05-04 18:32:32 +00001751 }
1752 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001753 inShift = 1;
Serhiy Storchakae12f6322015-10-02 13:14:53 +03001754 surrogate = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001755 shiftOutStart = p;
1756 base64bits = 0;
Serhiy Storchakaf1056722013-10-19 20:37:49 +03001757 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001758 }
1759 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001760 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001761 *p++ = ch;
1762 s++;
1763 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001764 else {
1765 startinpos = s-starts;
1766 s++;
1767 errmsg = "unexpected special character";
1768 goto utf7Error;
1769 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001770 continue;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001771utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001772 outpos = p-PyUnicode_AS_UNICODE(unicode);
1773 endinpos = s-starts;
1774 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001775 errors, &errorHandler,
1776 "utf7", errmsg,
1777 starts, size, &startinpos, &endinpos, &exc, &s,
1778 &unicode, &outpos, &p))
1779 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001780 }
1781
Antoine Pitrou653dece2009-05-04 18:32:32 +00001782 /* end of string */
1783
1784 if (inShift && !consumed) { /* in shift sequence, no more to follow */
1785 /* if we're in an inconsistent state, that's an error */
Serhiy Storchakae12f6322015-10-02 13:14:53 +03001786 inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001787 if (surrogate ||
1788 (base64bits >= 6) ||
1789 (base64bits > 0 && base64buffer != 0)) {
1790 outpos = p-PyUnicode_AS_UNICODE(unicode);
1791 endinpos = size;
1792 if (unicode_decode_call_errorhandler(
1793 errors, &errorHandler,
1794 "utf7", "unterminated shift sequence",
1795 starts, size, &startinpos, &endinpos, &exc, &s,
1796 &unicode, &outpos, &p))
1797 goto onError;
1798 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001799 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001800
1801 /* return state */
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001802 if (consumed) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001803 if (inShift) {
1804 p = shiftOutStart; /* back off output */
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001805 *consumed = startinpos;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001806 }
1807 else {
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001808 *consumed = s-starts;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001809 }
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001810 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001811
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001812 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001813 goto onError;
1814
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001815 Py_XDECREF(errorHandler);
1816 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001817 return (PyObject *)unicode;
1818
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001819 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001820 Py_XDECREF(errorHandler);
1821 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001822 Py_DECREF(unicode);
1823 return NULL;
1824}
1825
1826
1827PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001828 Py_ssize_t size,
Antoine Pitrou653dece2009-05-04 18:32:32 +00001829 int base64SetO,
1830 int base64WhiteSpace,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001831 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001832{
1833 PyObject *v;
1834 /* It might be possible to tighten this worst case */
Alexandre Vassalottifd009162009-07-07 02:17:30 +00001835 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001836 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001837 Py_ssize_t i = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001838 unsigned int base64bits = 0;
1839 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001840 char * out;
1841 char * start;
1842
Alexandre Vassalottifd009162009-07-07 02:17:30 +00001843 if (allocated / 8 != size)
Neal Norwitze7d8be82008-07-31 17:17:14 +00001844 return PyErr_NoMemory();
1845
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001846 if (size == 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00001847 return PyString_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001848
Antoine Pitrou653dece2009-05-04 18:32:32 +00001849 v = PyString_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001850 if (v == NULL)
1851 return NULL;
1852
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001853 start = out = PyString_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001854 for (;i < size; ++i) {
1855 Py_UNICODE ch = s[i];
1856
Antoine Pitrou653dece2009-05-04 18:32:32 +00001857 if (inShift) {
1858 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1859 /* shifting out */
1860 if (base64bits) { /* output remaining bits */
1861 *out++ = TO_BASE64(base64buffer << (6-base64bits));
1862 base64buffer = 0;
1863 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001864 }
1865 inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001866 /* Characters not in the BASE64 set implicitly unshift the sequence
1867 so no '-' is required, except if the character is itself a '-' */
1868 if (IS_BASE64(ch) || ch == '-') {
1869 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001870 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001871 *out++ = (char) ch;
1872 }
1873 else {
1874 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00001875 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001876 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001877 else { /* not in a shift sequence */
1878 if (ch == '+') {
1879 *out++ = '+';
1880 *out++ = '-';
1881 }
1882 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1883 *out++ = (char) ch;
1884 }
1885 else {
1886 *out++ = '+';
1887 inShift = 1;
1888 goto encode_char;
1889 }
1890 }
1891 continue;
1892encode_char:
1893#ifdef Py_UNICODE_WIDE
1894 if (ch >= 0x10000) {
1895 /* code first surrogate */
1896 base64bits += 16;
1897 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
1898 while (base64bits >= 6) {
1899 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1900 base64bits -= 6;
1901 }
1902 /* prepare second surrogate */
1903 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
1904 }
1905#endif
1906 base64bits += 16;
1907 base64buffer = (base64buffer << 16) | ch;
1908 while (base64bits >= 6) {
1909 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1910 base64bits -= 6;
1911 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001912 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001913 if (base64bits)
1914 *out++= TO_BASE64(base64buffer << (6-base64bits) );
1915 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001916 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001917
Benjamin Petersonbea424a2010-04-03 00:57:33 +00001918 if (_PyString_Resize(&v, out - start))
1919 return NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001920 return v;
1921}
1922
/* The UTF-7 helper macros are not needed past this point. */
#undef ENCODE_DIRECT
#undef DECODE_DIRECT
#undef TO_BASE64
#undef FROM_BASE64
#undef IS_BASE64
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001928
Guido van Rossumd57fd912000-03-10 22:53:23 +00001929/* --- UTF-8 Codec -------------------------------------------------------- */
1930
/* Length in bytes of the UTF-8 sequence introduced by each possible
   lead byte; 0 marks a byte that cannot start a sequence.  See RFC 3629
   for details. */
static
char utf8_code_length[256] = {
    /* 00-7F: ASCII, one byte each */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 70-7F */
    /* 80-BF: continuation bytes, never a valid start */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* B0-BF */
    /* C0-C1 are illegal; C2-DF start two-byte sequences */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  /* C0-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  /* D0-DF */
    /* E0-EF start three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  /* E0-EF */
    /* F0-F4 start four-byte sequences; F5-FF are illegal */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   /* F0-FF */
};
1952
Guido van Rossumd57fd912000-03-10 22:53:23 +00001953PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001954 Py_ssize_t size,
1955 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001956{
Walter Dörwald69652032004-09-07 20:24:22 +00001957 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1958}
1959
1960PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001961 Py_ssize_t size,
1962 const char *errors,
1963 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001964{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001965 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001966 int n;
Ezio Melottie57e50c2010-06-05 17:51:07 +00001967 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001968 Py_ssize_t startinpos;
1969 Py_ssize_t endinpos;
1970 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001971 const char *e;
1972 PyUnicodeObject *unicode;
1973 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001974 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001975 PyObject *errorHandler = NULL;
1976 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001977
1978 /* Note: size will always be longer than the resulting Unicode
1979 character count */
1980 unicode = _PyUnicode_New(size);
1981 if (!unicode)
1982 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001983 if (size == 0) {
1984 if (consumed)
1985 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001986 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001987 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001988
1989 /* Unpack UTF-8 encoded data */
1990 p = unicode->str;
1991 e = s + size;
1992
1993 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001994 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001995
1996 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001997 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001998 s++;
1999 continue;
2000 }
2001
2002 n = utf8_code_length[ch];
2003
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002004 if (s + n > e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002005 if (consumed)
2006 break;
2007 else {
2008 errmsg = "unexpected end of data";
2009 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00002010 endinpos = startinpos+1;
2011 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2012 endinpos++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002013 goto utf8Error;
2014 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00002015 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002016
2017 switch (n) {
2018
2019 case 0:
Ezio Melottie57e50c2010-06-05 17:51:07 +00002020 errmsg = "invalid start byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002021 startinpos = s-starts;
2022 endinpos = startinpos+1;
2023 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002024
2025 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002026 errmsg = "internal error";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002027 startinpos = s-starts;
2028 endinpos = startinpos+1;
2029 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002030
2031 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002032 if ((s[1] & 0xc0) != 0x80) {
Ezio Melottie57e50c2010-06-05 17:51:07 +00002033 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002034 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00002035 endinpos = startinpos + 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002036 goto utf8Error;
2037 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002038 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melottie57e50c2010-06-05 17:51:07 +00002039 assert ((ch > 0x007F) && (ch <= 0x07FF));
2040 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002041 break;
2042
2043 case 3:
Ezio Melottie57e50c2010-06-05 17:51:07 +00002044 /* XXX: surrogates shouldn't be valid UTF-8!
2045 see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2046 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
2047 Uncomment the 2 lines below to make them invalid,
Serhiy Storchakae8c9e142015-01-18 11:42:50 +02002048 code points: d800-dfff; UTF-8: \xed\xa0\x80-\xed\xbf\xbf. */
Tim Petersced69f82003-09-16 20:30:58 +00002049 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melottie57e50c2010-06-05 17:51:07 +00002050 (s[2] & 0xc0) != 0x80 ||
2051 ((unsigned char)s[0] == 0xE0 &&
2052 (unsigned char)s[1] < 0xA0)/* ||
2053 ((unsigned char)s[0] == 0xED &&
2054 (unsigned char)s[1] > 0x9F)*/) {
2055 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002056 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00002057 endinpos = startinpos + 1;
2058
2059 /* if s[1] first two bits are 1 and 0, then the invalid
2060 continuation byte is s[2], so increment endinpos by 1,
2061 if not, s[1] is invalid and endinpos doesn't need to
2062 be incremented. */
2063 if ((s[1] & 0xC0) == 0x80)
2064 endinpos++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002065 goto utf8Error;
2066 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002067 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melottie57e50c2010-06-05 17:51:07 +00002068 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2069 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002070 break;
2071
2072 case 4:
2073 if ((s[1] & 0xc0) != 0x80 ||
2074 (s[2] & 0xc0) != 0x80 ||
Ezio Melottie57e50c2010-06-05 17:51:07 +00002075 (s[3] & 0xc0) != 0x80 ||
2076 ((unsigned char)s[0] == 0xF0 &&
2077 (unsigned char)s[1] < 0x90) ||
2078 ((unsigned char)s[0] == 0xF4 &&
2079 (unsigned char)s[1] > 0x8F)) {
2080 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002081 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00002082 endinpos = startinpos + 1;
2083 if ((s[1] & 0xC0) == 0x80) {
2084 endinpos++;
2085 if ((s[2] & 0xC0) == 0x80)
2086 endinpos++;
2087 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002088 goto utf8Error;
2089 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002090 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melottie57e50c2010-06-05 17:51:07 +00002091 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2092 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2093
Fredrik Lundh8f455852001-06-27 18:59:43 +00002094#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002095 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002096#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002097 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002098
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002099 /* translate from 10000..10FFFF to 0..FFFF */
2100 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002101
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002102 /* high surrogate = top 10 bits added to D800 */
2103 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002104
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002105 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002106 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002107#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002108 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002109 }
2110 s += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002111 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002112
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002113 utf8Error:
2114 outpos = p-PyUnicode_AS_UNICODE(unicode);
2115 if (unicode_decode_call_errorhandler(
2116 errors, &errorHandler,
2117 "utf8", errmsg,
2118 starts, size, &startinpos, &endinpos, &exc, &s,
2119 &unicode, &outpos, &p))
2120 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002121 }
Walter Dörwald69652032004-09-07 20:24:22 +00002122 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002123 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002124
2125 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002126 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002127 goto onError;
2128
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002129 Py_XDECREF(errorHandler);
2130 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002131 return (PyObject *)unicode;
2132
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002133 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002134 Py_XDECREF(errorHandler);
2135 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002136 Py_DECREF(unicode);
2137 return NULL;
2138}
2139
Tim Peters602f7402002-04-27 18:03:26 +00002140/* Allocation strategy: if the string is short, convert into a stack buffer
2141 and allocate exactly as much space needed at the end. Else allocate the
2142 maximum possible needed (4 result bytes per Unicode character), and return
2143 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002144*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002145PyObject *
2146PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002147 Py_ssize_t size,
2148 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002149{
Tim Peters602f7402002-04-27 18:03:26 +00002150#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002151
Martin v. Löwis18e16552006-02-15 17:27:45 +00002152 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00002153 PyObject *v; /* result string object */
2154 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002155 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002156 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002157 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002158
Tim Peters602f7402002-04-27 18:03:26 +00002159 assert(s != NULL);
2160 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002161
Tim Peters602f7402002-04-27 18:03:26 +00002162 if (size <= MAX_SHORT_UNICHARS) {
2163 /* Write into the stack buffer; nallocated can't overflow.
2164 * At the end, we'll allocate exactly as much heap space as it
2165 * turns out we need.
2166 */
2167 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2168 v = NULL; /* will allocate after we're done */
2169 p = stackbuf;
2170 }
2171 else {
2172 /* Overallocate on the heap, and give the excess back at the end. */
2173 nallocated = size * 4;
2174 if (nallocated / 4 != size) /* overflow! */
2175 return PyErr_NoMemory();
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002176 v = PyString_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00002177 if (v == NULL)
2178 return NULL;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002179 p = PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002180 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002181
Tim Peters602f7402002-04-27 18:03:26 +00002182 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002183 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002184
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002185 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002186 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002187 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002188
Guido van Rossumd57fd912000-03-10 22:53:23 +00002189 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002190 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002191 *p++ = (char)(0xc0 | (ch >> 6));
2192 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002193 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002194 else {
Tim Peters602f7402002-04-27 18:03:26 +00002195 /* Encode UCS2 Unicode ordinals */
2196 if (ch < 0x10000) {
2197 /* Special case: check for high surrogate */
2198 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2199 Py_UCS4 ch2 = s[i];
2200 /* Check for low surrogate and combine the two to
2201 form a UCS4 value */
2202 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002203 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002204 i++;
2205 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002206 }
Tim Peters602f7402002-04-27 18:03:26 +00002207 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002208 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002209 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002210 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2211 *p++ = (char)(0x80 | (ch & 0x3f));
2212 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00002213 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002214 encodeUCS4:
Tim Peters602f7402002-04-27 18:03:26 +00002215 /* Encode UCS4 Unicode ordinals */
2216 *p++ = (char)(0xf0 | (ch >> 18));
2217 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2218 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2219 *p++ = (char)(0x80 | (ch & 0x3f));
2220 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002221 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002222
Tim Peters602f7402002-04-27 18:03:26 +00002223 if (v == NULL) {
2224 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002225 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002226 assert(nneeded <= nallocated);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002227 v = PyString_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002228 }
2229 else {
Benjamin Peterson857ce152009-01-31 16:29:18 +00002230 /* Cut back to size actually needed. */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002231 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002232 assert(nneeded <= nallocated);
Benjamin Petersonbea424a2010-04-03 00:57:33 +00002233 if (_PyString_Resize(&v, nneeded))
2234 return NULL;
Tim Peters602f7402002-04-27 18:03:26 +00002235 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002236 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002237
Tim Peters602f7402002-04-27 18:03:26 +00002238#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002239}
2240
Guido van Rossumd57fd912000-03-10 22:53:23 +00002241PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2242{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002243 if (!PyUnicode_Check(unicode)) {
2244 PyErr_BadArgument();
2245 return NULL;
2246 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002247 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002248 PyUnicode_GET_SIZE(unicode),
2249 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002250}
2251
Walter Dörwald6e390802007-08-17 16:41:28 +00002252/* --- UTF-32 Codec ------------------------------------------------------- */
2253
2254PyObject *
2255PyUnicode_DecodeUTF32(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002256 Py_ssize_t size,
2257 const char *errors,
2258 int *byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002259{
2260 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2261}
2262
2263PyObject *
2264PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002265 Py_ssize_t size,
2266 const char *errors,
2267 int *byteorder,
2268 Py_ssize_t *consumed)
Walter Dörwald6e390802007-08-17 16:41:28 +00002269{
2270 const char *starts = s;
2271 Py_ssize_t startinpos;
2272 Py_ssize_t endinpos;
2273 Py_ssize_t outpos;
2274 PyUnicodeObject *unicode;
2275 Py_UNICODE *p;
2276#ifndef Py_UNICODE_WIDE
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002277 int pairs = 0;
Benjamin Peterson8e5effa2010-06-12 17:47:06 +00002278 const unsigned char *qq;
Walter Dörwald6e390802007-08-17 16:41:28 +00002279#else
2280 const int pairs = 0;
2281#endif
Benjamin Peterson8e5effa2010-06-12 17:47:06 +00002282 const unsigned char *q, *e;
Walter Dörwald6e390802007-08-17 16:41:28 +00002283 int bo = 0; /* assume native ordering by default */
2284 const char *errmsg = "";
Walter Dörwald20b40d32007-08-17 16:52:50 +00002285 /* Offsets from q for retrieving bytes in the right order. */
2286#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2287 int iorder[] = {0, 1, 2, 3};
2288#else
2289 int iorder[] = {3, 2, 1, 0};
2290#endif
Walter Dörwald9ab80a92007-08-17 16:58:43 +00002291 PyObject *errorHandler = NULL;
2292 PyObject *exc = NULL;
Chris Jerdonekad4b0002012-10-07 20:37:54 -07002293
Walter Dörwald6e390802007-08-17 16:41:28 +00002294 q = (unsigned char *)s;
2295 e = q + size;
2296
2297 if (byteorder)
2298 bo = *byteorder;
2299
2300 /* Check for BOM marks (U+FEFF) in the input and adjust current
2301 byte order setting accordingly. In native mode, the leading BOM
2302 mark is skipped, in all other modes, it is copied to the output
2303 stream as-is (giving a ZWNBSP character). */
2304 if (bo == 0) {
2305 if (size >= 4) {
2306 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002307 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002308#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002309 if (bom == 0x0000FEFF) {
2310 q += 4;
2311 bo = -1;
2312 }
2313 else if (bom == 0xFFFE0000) {
2314 q += 4;
2315 bo = 1;
2316 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002317#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002318 if (bom == 0x0000FEFF) {
2319 q += 4;
2320 bo = 1;
2321 }
2322 else if (bom == 0xFFFE0000) {
2323 q += 4;
2324 bo = -1;
2325 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002326#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002327 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002328 }
2329
2330 if (bo == -1) {
2331 /* force LE */
2332 iorder[0] = 0;
2333 iorder[1] = 1;
2334 iorder[2] = 2;
2335 iorder[3] = 3;
2336 }
2337 else if (bo == 1) {
2338 /* force BE */
2339 iorder[0] = 3;
2340 iorder[1] = 2;
2341 iorder[2] = 1;
2342 iorder[3] = 0;
2343 }
2344
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002345 /* On narrow builds we split characters outside the BMP into two
Serhiy Storchakae8c9e142015-01-18 11:42:50 +02002346 code points => count how much extra space we need. */
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002347#ifndef Py_UNICODE_WIDE
Serhiy Storchakac9631a12013-01-08 22:43:18 +02002348 for (qq = q; e - qq >= 4; qq += 4)
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002349 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2350 pairs++;
2351#endif
2352
2353 /* This might be one to much, because of a BOM */
2354 unicode = _PyUnicode_New((size+3)/4+pairs);
2355 if (!unicode)
2356 return NULL;
2357 if (size == 0)
2358 return (PyObject *)unicode;
2359
2360 /* Unpack UTF-32 encoded data */
2361 p = unicode->str;
2362
Walter Dörwald6e390802007-08-17 16:41:28 +00002363 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002364 Py_UCS4 ch;
2365 /* remaining bytes at the end? (size should be divisible by 4) */
2366 if (e-q<4) {
2367 if (consumed)
2368 break;
2369 errmsg = "truncated data";
2370 startinpos = ((const char *)q)-starts;
2371 endinpos = ((const char *)e)-starts;
2372 goto utf32Error;
2373 /* The remaining input chars are ignored if the callback
2374 chooses to skip the input */
2375 }
2376 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2377 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002378
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002379 if (ch >= 0x110000)
2380 {
Serhiy Storchakae8c9e142015-01-18 11:42:50 +02002381 errmsg = "code point not in range(0x110000)";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002382 startinpos = ((const char *)q)-starts;
2383 endinpos = startinpos+4;
2384 goto utf32Error;
2385 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002386#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002387 if (ch >= 0x10000)
2388 {
2389 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2390 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2391 }
2392 else
Walter Dörwald6e390802007-08-17 16:41:28 +00002393#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002394 *p++ = ch;
2395 q += 4;
2396 continue;
2397 utf32Error:
2398 outpos = p-PyUnicode_AS_UNICODE(unicode);
2399 if (unicode_decode_call_errorhandler(
2400 errors, &errorHandler,
2401 "utf32", errmsg,
Georg Brandle9741f32009-09-17 11:28:09 +00002402 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002403 &unicode, &outpos, &p))
2404 goto onError;
Walter Dörwald6e390802007-08-17 16:41:28 +00002405 }
2406
2407 if (byteorder)
2408 *byteorder = bo;
2409
2410 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002411 *consumed = (const char *)q-starts;
Walter Dörwald6e390802007-08-17 16:41:28 +00002412
2413 /* Adjust length */
2414 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2415 goto onError;
2416
2417 Py_XDECREF(errorHandler);
2418 Py_XDECREF(exc);
2419 return (PyObject *)unicode;
2420
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002421 onError:
Walter Dörwald6e390802007-08-17 16:41:28 +00002422 Py_DECREF(unicode);
2423 Py_XDECREF(errorHandler);
2424 Py_XDECREF(exc);
2425 return NULL;
2426}
2427
2428PyObject *
2429PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002430 Py_ssize_t size,
2431 const char *errors,
2432 int byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002433{
2434 PyObject *v;
2435 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002436 Py_ssize_t nsize, bytesize;
Walter Dörwald6e390802007-08-17 16:41:28 +00002437#ifndef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002438 Py_ssize_t i, pairs;
Walter Dörwald6e390802007-08-17 16:41:28 +00002439#else
2440 const int pairs = 0;
2441#endif
2442 /* Offsets from p for storing byte pairs in the right order. */
2443#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2444 int iorder[] = {0, 1, 2, 3};
2445#else
2446 int iorder[] = {3, 2, 1, 0};
2447#endif
2448
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002449#define STORECHAR(CH) \
2450 do { \
2451 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2452 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2453 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2454 p[iorder[0]] = (CH) & 0xff; \
2455 p += 4; \
Walter Dörwald6e390802007-08-17 16:41:28 +00002456 } while(0)
2457
Serhiy Storchakae8c9e142015-01-18 11:42:50 +02002458 /* In narrow builds we can output surrogate pairs as one code point,
Walter Dörwald6e390802007-08-17 16:41:28 +00002459 so we need less space. */
2460#ifndef Py_UNICODE_WIDE
2461 for (i = pairs = 0; i < size-1; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002462 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2463 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2464 pairs++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002465#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002466 nsize = (size - pairs + (byteorder == 0));
2467 bytesize = nsize * 4;
2468 if (bytesize / 4 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002469 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002470 v = PyString_FromStringAndSize(NULL, bytesize);
Walter Dörwald6e390802007-08-17 16:41:28 +00002471 if (v == NULL)
2472 return NULL;
2473
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002474 p = (unsigned char *)PyString_AS_STRING(v);
Walter Dörwald6e390802007-08-17 16:41:28 +00002475 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002476 STORECHAR(0xFEFF);
Walter Dörwald6e390802007-08-17 16:41:28 +00002477 if (size == 0)
2478 return v;
2479
2480 if (byteorder == -1) {
2481 /* force LE */
2482 iorder[0] = 0;
2483 iorder[1] = 1;
2484 iorder[2] = 2;
2485 iorder[3] = 3;
2486 }
2487 else if (byteorder == 1) {
2488 /* force BE */
2489 iorder[0] = 3;
2490 iorder[1] = 2;
2491 iorder[2] = 1;
2492 iorder[3] = 0;
2493 }
2494
2495 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002496 Py_UCS4 ch = *s++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002497#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002498 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2499 Py_UCS4 ch2 = *s;
2500 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2501 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2502 s++;
2503 size--;
2504 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00002505 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002506#endif
2507 STORECHAR(ch);
2508 }
2509 return v;
2510#undef STORECHAR
2511}
2512
2513PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2514{
2515 if (!PyUnicode_Check(unicode)) {
2516 PyErr_BadArgument();
2517 return NULL;
2518 }
2519 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002520 PyUnicode_GET_SIZE(unicode),
2521 NULL,
2522 0);
Walter Dörwald6e390802007-08-17 16:41:28 +00002523}
2524
Guido van Rossumd57fd912000-03-10 22:53:23 +00002525/* --- UTF-16 Codec ------------------------------------------------------- */
2526
Tim Peters772747b2001-08-09 22:21:55 +00002527PyObject *
2528PyUnicode_DecodeUTF16(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002529 Py_ssize_t size,
2530 const char *errors,
2531 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002532{
Walter Dörwald69652032004-09-07 20:24:22 +00002533 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2534}
2535
2536PyObject *
2537PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002538 Py_ssize_t size,
2539 const char *errors,
2540 int *byteorder,
2541 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002542{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002543 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002544 Py_ssize_t startinpos;
2545 Py_ssize_t endinpos;
2546 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002547 PyUnicodeObject *unicode;
2548 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002549 const unsigned char *q, *e;
2550 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002551 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002552 /* Offsets from q for retrieving byte pairs in the right order. */
2553#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2554 int ihi = 1, ilo = 0;
2555#else
2556 int ihi = 0, ilo = 1;
2557#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002558 PyObject *errorHandler = NULL;
2559 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002560
2561 /* Note: size will always be longer than the resulting Unicode
2562 character count */
2563 unicode = _PyUnicode_New(size);
2564 if (!unicode)
2565 return NULL;
2566 if (size == 0)
2567 return (PyObject *)unicode;
2568
2569 /* Unpack UTF-16 encoded data */
2570 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002571 q = (unsigned char *)s;
2572 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002573
2574 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002575 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002576
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002577 /* Check for BOM marks (U+FEFF) in the input and adjust current
2578 byte order setting accordingly. In native mode, the leading BOM
2579 mark is skipped, in all other modes, it is copied to the output
2580 stream as-is (giving a ZWNBSP character). */
2581 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002582 if (size >= 2) {
2583 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002584#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002585 if (bom == 0xFEFF) {
2586 q += 2;
2587 bo = -1;
2588 }
2589 else if (bom == 0xFFFE) {
2590 q += 2;
2591 bo = 1;
2592 }
Tim Petersced69f82003-09-16 20:30:58 +00002593#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002594 if (bom == 0xFEFF) {
2595 q += 2;
2596 bo = 1;
2597 }
2598 else if (bom == 0xFFFE) {
2599 q += 2;
2600 bo = -1;
2601 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002602#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002603 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002604 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002605
Tim Peters772747b2001-08-09 22:21:55 +00002606 if (bo == -1) {
2607 /* force LE */
2608 ihi = 1;
2609 ilo = 0;
2610 }
2611 else if (bo == 1) {
2612 /* force BE */
2613 ihi = 0;
2614 ilo = 1;
2615 }
2616
2617 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002618 Py_UNICODE ch;
2619 /* remaining bytes at the end? (size should be even) */
2620 if (e-q<2) {
2621 if (consumed)
2622 break;
2623 errmsg = "truncated data";
2624 startinpos = ((const char *)q)-starts;
2625 endinpos = ((const char *)e)-starts;
2626 goto utf16Error;
2627 /* The remaining input chars are ignored if the callback
2628 chooses to skip the input */
2629 }
2630 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002631
Benjamin Peterson857ce152009-01-31 16:29:18 +00002632 q += 2;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002633
2634 if (ch < 0xD800 || ch > 0xDFFF) {
2635 *p++ = ch;
2636 continue;
2637 }
2638
2639 /* UTF-16 code pair: */
Antoine Pitrou715a63b2012-07-21 00:52:06 +02002640 if (e - q < 2) {
Serhiy Storchakac4b82c02013-01-08 23:12:00 +02002641 q -= 2;
2642 if (consumed)
2643 break;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002644 errmsg = "unexpected end of data";
Serhiy Storchakac4b82c02013-01-08 23:12:00 +02002645 startinpos = ((const char *)q)-starts;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002646 endinpos = ((const char *)e)-starts;
2647 goto utf16Error;
2648 }
2649 if (0xD800 <= ch && ch <= 0xDBFF) {
2650 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2651 q += 2;
2652 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002653#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002654 *p++ = ch;
2655 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002656#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002657 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002658#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002659 continue;
2660 }
2661 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002662 errmsg = "illegal UTF-16 surrogate";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002663 startinpos = (((const char *)q)-4)-starts;
2664 endinpos = startinpos+2;
2665 goto utf16Error;
2666 }
2667
Benjamin Peterson857ce152009-01-31 16:29:18 +00002668 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002669 errmsg = "illegal encoding";
2670 startinpos = (((const char *)q)-2)-starts;
2671 endinpos = startinpos+2;
2672 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002673
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002674 utf16Error:
2675 outpos = p-PyUnicode_AS_UNICODE(unicode);
2676 if (unicode_decode_call_errorhandler(
2677 errors, &errorHandler,
2678 "utf16", errmsg,
2679 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2680 &unicode, &outpos, &p))
2681 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002682 }
2683
2684 if (byteorder)
2685 *byteorder = bo;
2686
Walter Dörwald69652032004-09-07 20:24:22 +00002687 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002688 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00002689
Guido van Rossumd57fd912000-03-10 22:53:23 +00002690 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002691 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002692 goto onError;
2693
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002694 Py_XDECREF(errorHandler);
2695 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002696 return (PyObject *)unicode;
2697
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002698 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002699 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002700 Py_XDECREF(errorHandler);
2701 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002702 return NULL;
2703}
2704
Tim Peters772747b2001-08-09 22:21:55 +00002705PyObject *
2706PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002707 Py_ssize_t size,
2708 const char *errors,
2709 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002710{
2711 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002712 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002713 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002714#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002715 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002716#else
2717 const int pairs = 0;
2718#endif
Tim Peters772747b2001-08-09 22:21:55 +00002719 /* Offsets from p for storing byte pairs in the right order. */
2720#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2721 int ihi = 1, ilo = 0;
2722#else
2723 int ihi = 0, ilo = 1;
2724#endif
2725
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002726#define STORECHAR(CH) \
2727 do { \
2728 p[ihi] = ((CH) >> 8) & 0xff; \
2729 p[ilo] = (CH) & 0xff; \
2730 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00002731 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002732
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002733#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002734 for (i = pairs = 0; i < size; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002735 if (s[i] >= 0x10000)
2736 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002737#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002738 /* 2 * (size + pairs + (byteorder == 0)) */
2739 if (size > PY_SSIZE_T_MAX ||
2740 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002741 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002742 nsize = size + pairs + (byteorder == 0);
2743 bytesize = nsize * 2;
2744 if (bytesize / 2 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002745 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002746 v = PyString_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002747 if (v == NULL)
2748 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002749
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002750 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002751 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002752 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002753 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002754 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002755
2756 if (byteorder == -1) {
2757 /* force LE */
2758 ihi = 1;
2759 ilo = 0;
2760 }
2761 else if (byteorder == 1) {
2762 /* force BE */
2763 ihi = 0;
2764 ilo = 1;
2765 }
2766
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002767 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002768 Py_UNICODE ch = *s++;
2769 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002770#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002771 if (ch >= 0x10000) {
2772 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2773 ch = 0xD800 | ((ch-0x10000) >> 10);
2774 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002775#endif
Tim Peters772747b2001-08-09 22:21:55 +00002776 STORECHAR(ch);
2777 if (ch2)
2778 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002779 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002780 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002781#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002782}
2783
2784PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2785{
2786 if (!PyUnicode_Check(unicode)) {
2787 PyErr_BadArgument();
2788 return NULL;
2789 }
2790 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002791 PyUnicode_GET_SIZE(unicode),
2792 NULL,
2793 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002794}
2795
2796/* --- Unicode Escape Codec ----------------------------------------------- */
2797
Fredrik Lundh06d12682001-01-24 07:59:11 +00002798static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002799
Guido van Rossumd57fd912000-03-10 22:53:23 +00002800PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002801 Py_ssize_t size,
2802 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002803{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002804 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002805 Py_ssize_t startinpos;
2806 Py_ssize_t endinpos;
2807 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002808 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002809 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002810 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002811 char* message;
2812 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002813 PyObject *errorHandler = NULL;
2814 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002815
Guido van Rossumd57fd912000-03-10 22:53:23 +00002816 /* Escaped strings will always be longer than the resulting
2817 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002818 length after conversion to the true value.
2819 (but if the error callback returns a long replacement string
2820 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002821 v = _PyUnicode_New(size);
2822 if (v == NULL)
2823 goto onError;
2824 if (size == 0)
2825 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002826
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002827 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002828 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002829
Guido van Rossumd57fd912000-03-10 22:53:23 +00002830 while (s < end) {
2831 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002832 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002833 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002834
2835 /* Non-escape characters are interpreted as Unicode ordinals */
2836 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002837 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002838 continue;
2839 }
2840
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002841 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002842 /* \ - Escapes */
2843 s++;
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002844 c = *s++;
2845 if (s > end)
2846 c = '\0'; /* Invalid after \ */
2847 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002848
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002849 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002850 case '\n': break;
2851 case '\\': *p++ = '\\'; break;
2852 case '\'': *p++ = '\''; break;
2853 case '\"': *p++ = '\"'; break;
2854 case 'b': *p++ = '\b'; break;
2855 case 'f': *p++ = '\014'; break; /* FF */
2856 case 't': *p++ = '\t'; break;
2857 case 'n': *p++ = '\n'; break;
2858 case 'r': *p++ = '\r'; break;
2859 case 'v': *p++ = '\013'; break; /* VT */
2860 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2861
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002862 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002863 case '0': case '1': case '2': case '3':
2864 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002865 x = s[-1] - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002866 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002867 x = (x<<3) + *s++ - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002868 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002869 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002870 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002871 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002872 break;
2873
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002874 /* hex escapes */
2875 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002876 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002877 digits = 2;
2878 message = "truncated \\xXX escape";
2879 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002880
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002881 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002882 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002883 digits = 4;
2884 message = "truncated \\uXXXX escape";
2885 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002886
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002887 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002888 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002889 digits = 8;
2890 message = "truncated \\UXXXXXXXX escape";
2891 hexescape:
2892 chr = 0;
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002893 if (end - s < digits) {
2894 /* count only hex digits */
2895 for (; s < end; ++s) {
2896 c = (unsigned char)*s;
2897 if (!Py_ISXDIGIT(c))
2898 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002899 }
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002900 goto error;
2901 }
2902 for (; digits--; ++s) {
2903 c = (unsigned char)*s;
2904 if (!Py_ISXDIGIT(c))
2905 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002906 chr = (chr<<4) & ~0xF;
2907 if (c >= '0' && c <= '9')
2908 chr += c - '0';
2909 else if (c >= 'a' && c <= 'f')
2910 chr += 10 + c - 'a';
2911 else
2912 chr += 10 + c - 'A';
2913 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002914 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002915 /* _decoding_error will have already written into the
2916 target buffer. */
2917 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002918 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002919 /* when we get here, chr is a 32-bit unicode character */
2920 if (chr <= 0xffff)
2921 /* UCS-2 character */
2922 *p++ = (Py_UNICODE) chr;
2923 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002924 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002925 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002926#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002927 *p++ = chr;
2928#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002929 chr -= 0x10000L;
2930 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002931 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002932#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002933 } else {
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002934 message = "illegal Unicode character";
2935 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002936 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002937 break;
2938
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002939 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00002940 case 'N':
2941 message = "malformed \\N character escape";
2942 if (ucnhash_CAPI == NULL) {
2943 /* load the unicode data module */
Larry Hastings402b73f2010-03-25 00:54:54 +00002944 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002945 if (ucnhash_CAPI == NULL)
2946 goto ucnhashError;
2947 }
2948 if (*s == '{') {
2949 const char *start = s+1;
2950 /* look for the closing brace */
2951 while (*s != '}' && s < end)
2952 s++;
2953 if (s > start && s < end && *s == '}') {
2954 /* found a name. look it up in the unicode database */
2955 message = "unknown Unicode character name";
2956 s++;
Serhiy Storchaka1d3acd42013-01-21 11:48:24 +02002957 if (s - start - 1 <= INT_MAX &&
2958 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002959 goto store;
2960 }
2961 }
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002962 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002963
2964 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002965 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002966 message = "\\ at end of string";
2967 s--;
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002968 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00002969 }
2970 else {
2971 *p++ = '\\';
2972 *p++ = (unsigned char)s[-1];
2973 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002974 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002975 }
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002976 continue;
2977
2978 error:
2979 endinpos = s-starts;
2980 outpos = p-PyUnicode_AS_UNICODE(v);
2981 if (unicode_decode_call_errorhandler(
2982 errors, &errorHandler,
2983 "unicodeescape", message,
2984 starts, size, &startinpos, &endinpos, &exc, &s,
2985 &v, &outpos, &p))
2986 goto onError;
2987 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002988 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002989 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002990 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002991 Py_XDECREF(errorHandler);
2992 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002993 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002994
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002995 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002996 PyErr_SetString(
2997 PyExc_UnicodeError,
2998 "\\N escapes not supported (can't load unicodedata module)"
2999 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003000 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003001 Py_XDECREF(errorHandler);
3002 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003003 return NULL;
3004
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003005 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003006 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003007 Py_XDECREF(errorHandler);
3008 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003009 return NULL;
3010}
3011
3012/* Return a Unicode-Escape string version of the Unicode object.
3013
3014 If quotes is true, the string is enclosed in u"" or u'' quotes as
3015 appropriate.
3016
3017*/
3018
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00003019Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003020 Py_ssize_t size,
3021 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00003022{
3023 /* like wcschr, but doesn't stop at NULL characters */
3024
3025 while (size-- > 0) {
3026 if (*s == ch)
3027 return s;
3028 s++;
3029 }
3030
3031 return NULL;
3032}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003033
Guido van Rossumd57fd912000-03-10 22:53:23 +00003034static
3035PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003036 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003037 int quotes)
3038{
3039 PyObject *repr;
3040 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003041
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003042 static const char *hexdigit = "0123456789abcdef";
Neal Norwitze7d8be82008-07-31 17:17:14 +00003043#ifdef Py_UNICODE_WIDE
3044 const Py_ssize_t expandsize = 10;
3045#else
3046 const Py_ssize_t expandsize = 6;
3047#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003048
Neal Norwitz17753ec2006-08-21 22:21:19 +00003049 /* XXX(nnorwitz): rather than over-allocating, it would be
3050 better to choose a different scheme. Perhaps scan the
3051 first N-chars of the string and allocate based on that size.
3052 */
3053 /* Initial allocation is based on the longest-possible unichr
3054 escape.
3055
3056 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3057 unichr, so in this case it's the longest unichr escape. In
3058 narrow (UTF-16) builds this is five chars per source unichr
3059 since there are two unichrs in the surrogate pair, so in narrow
3060 (UTF-16) builds it's not the longest unichr escape.
3061
3062 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3063 so in the narrow (UTF-16) build case it's the longest unichr
3064 escape.
3065 */
3066
Neal Norwitze7d8be82008-07-31 17:17:14 +00003067 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003068 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00003069
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003070 repr = PyString_FromStringAndSize(NULL,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003071 2
3072 + expandsize*size
3073 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003074 if (repr == NULL)
3075 return NULL;
3076
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003077 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003078
3079 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003080 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00003081 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00003082 !findchar(s, size, '"')) ? '"' : '\'';
3083 }
3084 while (size-- > 0) {
3085 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003086
Hye-Shik Chang835b2432005-12-17 04:38:31 +00003087 /* Escape quotes and backslashes */
3088 if ((quotes &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003089 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003090 *p++ = '\\';
3091 *p++ = (char) ch;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003092 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003093 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003094
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003095#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003096 /* Map 21-bit characters to '\U00xxxxxx' */
3097 else if (ch >= 0x10000) {
3098 *p++ = '\\';
3099 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003100 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
3101 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
3102 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
3103 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
3104 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
3105 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
3106 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003107 *p++ = hexdigit[ch & 0x0000000F];
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003108 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003109 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003110#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003111 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3112 else if (ch >= 0xD800 && ch < 0xDC00) {
3113 Py_UNICODE ch2;
3114 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003115
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003116 ch2 = *s++;
3117 size--;
Georg Brandle27d0442010-08-01 20:54:30 +00003118 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003119 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3120 *p++ = '\\';
3121 *p++ = 'U';
3122 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
3123 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
3124 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
3125 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
3126 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
3127 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
3128 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
3129 *p++ = hexdigit[ucs & 0x0000000F];
3130 continue;
3131 }
3132 /* Fall through: isolated surrogates are copied as-is */
3133 s--;
3134 size++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003135 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003136#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003137
Guido van Rossumd57fd912000-03-10 22:53:23 +00003138 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003139 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003140 *p++ = '\\';
3141 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003142 *p++ = hexdigit[(ch >> 12) & 0x000F];
3143 *p++ = hexdigit[(ch >> 8) & 0x000F];
3144 *p++ = hexdigit[(ch >> 4) & 0x000F];
3145 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003146 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003147
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003148 /* Map special whitespace to '\t', \n', '\r' */
3149 else if (ch == '\t') {
3150 *p++ = '\\';
3151 *p++ = 't';
3152 }
3153 else if (ch == '\n') {
3154 *p++ = '\\';
3155 *p++ = 'n';
3156 }
3157 else if (ch == '\r') {
3158 *p++ = '\\';
3159 *p++ = 'r';
3160 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003161
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003162 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003163 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003164 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003165 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003166 *p++ = hexdigit[(ch >> 4) & 0x000F];
3167 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003168 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003169
Guido van Rossumd57fd912000-03-10 22:53:23 +00003170 /* Copy everything else as-is */
3171 else
3172 *p++ = (char) ch;
3173 }
3174 if (quotes)
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003175 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003176
3177 *p = '\0';
Benjamin Petersonbea424a2010-04-03 00:57:33 +00003178 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
3179 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003180 return repr;
3181}
3182
3183PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003184 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003185{
3186 return unicodeescape_string(s, size, 0);
3187}
3188
3189PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3190{
3191 if (!PyUnicode_Check(unicode)) {
3192 PyErr_BadArgument();
3193 return NULL;
3194 }
3195 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003196 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003197}
3198
3199/* --- Raw Unicode Escape Codec ------------------------------------------- */
3200
3201PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003202 Py_ssize_t size,
3203 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003204{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003205 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003206 Py_ssize_t startinpos;
3207 Py_ssize_t endinpos;
3208 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003209 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003210 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003211 const char *end;
3212 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003213 PyObject *errorHandler = NULL;
3214 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003215
Guido van Rossumd57fd912000-03-10 22:53:23 +00003216 /* Escaped strings will always be longer than the resulting
3217 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003218 length after conversion to the true value. (But decoding error
3219 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003220 v = _PyUnicode_New(size);
3221 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003222 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003223 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003224 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003225 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003226 end = s + size;
3227 while (s < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003228 unsigned char c;
3229 Py_UCS4 x;
3230 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003231 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003232
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003233 /* Non-escape characters are interpreted as Unicode ordinals */
3234 if (*s != '\\') {
3235 *p++ = (unsigned char)*s++;
3236 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003237 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003238 startinpos = s-starts;
3239
3240 /* \u-escapes are only interpreted iff the number of leading
3241 backslashes if odd */
3242 bs = s;
3243 for (;s < end;) {
3244 if (*s != '\\')
3245 break;
3246 *p++ = (unsigned char)*s++;
3247 }
3248 if (((s - bs) & 1) == 0 ||
3249 s >= end ||
3250 (*s != 'u' && *s != 'U')) {
3251 continue;
3252 }
3253 p--;
3254 count = *s=='u' ? 4 : 8;
3255 s++;
3256
3257 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3258 outpos = p-PyUnicode_AS_UNICODE(v);
3259 for (x = 0, i = 0; i < count; ++i, ++s) {
3260 c = (unsigned char)*s;
3261 if (!isxdigit(c)) {
3262 endinpos = s-starts;
3263 if (unicode_decode_call_errorhandler(
3264 errors, &errorHandler,
3265 "rawunicodeescape", "truncated \\uXXXX",
3266 starts, size, &startinpos, &endinpos, &exc, &s,
3267 &v, &outpos, &p))
3268 goto onError;
3269 goto nextByte;
3270 }
3271 x = (x<<4) & ~0xF;
3272 if (c >= '0' && c <= '9')
3273 x += c - '0';
3274 else if (c >= 'a' && c <= 'f')
3275 x += 10 + c - 'a';
3276 else
3277 x += 10 + c - 'A';
3278 }
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003279 if (x <= 0xffff)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003280 /* UCS-2 character */
3281 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003282 else if (x <= 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003283 /* UCS-4 character. Either store directly, or as
3284 surrogate pair. */
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003285#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003286 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003287#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003288 x -= 0x10000L;
3289 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3290 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003291#endif
3292 } else {
3293 endinpos = s-starts;
3294 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003295 if (unicode_decode_call_errorhandler(
3296 errors, &errorHandler,
3297 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003298 starts, size, &startinpos, &endinpos, &exc, &s,
3299 &v, &outpos, &p))
3300 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003301 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003302 nextByte:
3303 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003304 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00003305 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003306 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003307 Py_XDECREF(errorHandler);
3308 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003309 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003310
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003311 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003312 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003313 Py_XDECREF(errorHandler);
3314 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003315 return NULL;
3316}
3317
3318PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003319 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003320{
3321 PyObject *repr;
3322 char *p;
3323 char *q;
3324
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003325 static const char *hexdigit = "0123456789abcdef";
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003326#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00003327 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003328#else
Neal Norwitze7d8be82008-07-31 17:17:14 +00003329 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003330#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00003331
Neal Norwitze7d8be82008-07-31 17:17:14 +00003332 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003333 return PyErr_NoMemory();
Benjamin Peterson857ce152009-01-31 16:29:18 +00003334
Neal Norwitze7d8be82008-07-31 17:17:14 +00003335 repr = PyString_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003336 if (repr == NULL)
3337 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003338 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003339 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003340
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003341 p = q = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003342 while (size-- > 0) {
3343 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003344#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003345 /* Map 32-bit characters to '\Uxxxxxxxx' */
3346 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003347 *p++ = '\\';
3348 *p++ = 'U';
3349 *p++ = hexdigit[(ch >> 28) & 0xf];
3350 *p++ = hexdigit[(ch >> 24) & 0xf];
3351 *p++ = hexdigit[(ch >> 20) & 0xf];
3352 *p++ = hexdigit[(ch >> 16) & 0xf];
3353 *p++ = hexdigit[(ch >> 12) & 0xf];
3354 *p++ = hexdigit[(ch >> 8) & 0xf];
3355 *p++ = hexdigit[(ch >> 4) & 0xf];
3356 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003357 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003358 else
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003359#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003360 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3361 if (ch >= 0xD800 && ch < 0xDC00) {
3362 Py_UNICODE ch2;
3363 Py_UCS4 ucs;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003364
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003365 ch2 = *s++;
3366 size--;
Georg Brandle27d0442010-08-01 20:54:30 +00003367 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003368 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3369 *p++ = '\\';
3370 *p++ = 'U';
3371 *p++ = hexdigit[(ucs >> 28) & 0xf];
3372 *p++ = hexdigit[(ucs >> 24) & 0xf];
3373 *p++ = hexdigit[(ucs >> 20) & 0xf];
3374 *p++ = hexdigit[(ucs >> 16) & 0xf];
3375 *p++ = hexdigit[(ucs >> 12) & 0xf];
3376 *p++ = hexdigit[(ucs >> 8) & 0xf];
3377 *p++ = hexdigit[(ucs >> 4) & 0xf];
3378 *p++ = hexdigit[ucs & 0xf];
3379 continue;
3380 }
3381 /* Fall through: isolated surrogates are copied as-is */
3382 s--;
3383 size++;
3384 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003385#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003386 /* Map 16-bit characters to '\uxxxx' */
3387 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003388 *p++ = '\\';
3389 *p++ = 'u';
3390 *p++ = hexdigit[(ch >> 12) & 0xf];
3391 *p++ = hexdigit[(ch >> 8) & 0xf];
3392 *p++ = hexdigit[(ch >> 4) & 0xf];
3393 *p++ = hexdigit[ch & 15];
3394 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003395 /* Copy everything else as-is */
3396 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003397 *p++ = (char) ch;
3398 }
3399 *p = '\0';
Benjamin Petersonbea424a2010-04-03 00:57:33 +00003400 if (_PyString_Resize(&repr, p - q))
3401 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003402 return repr;
3403}
3404
3405PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3406{
3407 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003408 PyErr_BadArgument();
3409 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003410 }
3411 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003412 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003413}
3414
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003415/* --- Unicode Internal Codec ------------------------------------------- */
3416
3417PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003418 Py_ssize_t size,
3419 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003420{
3421 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003422 Py_ssize_t startinpos;
3423 Py_ssize_t endinpos;
3424 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003425 PyUnicodeObject *v;
3426 Py_UNICODE *p;
3427 const char *end;
3428 const char *reason;
3429 PyObject *errorHandler = NULL;
3430 PyObject *exc = NULL;
3431
Neal Norwitzd43069c2006-01-08 01:12:10 +00003432#ifdef Py_UNICODE_WIDE
3433 Py_UNICODE unimax = PyUnicode_GetMax();
3434#endif
3435
Armin Rigo7ccbca92006-10-04 12:17:45 +00003436 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003437 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3438 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003439 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003440 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003441 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003442 p = PyUnicode_AS_UNICODE(v);
3443 end = s + size;
3444
3445 while (s < end) {
Serhiy Storchakad5327d92013-02-07 16:23:11 +02003446 if (end-s < Py_UNICODE_SIZE) {
3447 endinpos = end-starts;
3448 reason = "truncated input";
3449 goto error;
3450 }
Neal Norwitz1004a532006-05-15 07:17:23 +00003451 memcpy(p, s, sizeof(Py_UNICODE));
Serhiy Storchakad5327d92013-02-07 16:23:11 +02003452#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003453 /* We have to sanity check the raw data, otherwise doom looms for
3454 some malformed UCS-4 data. */
Serhiy Storchakad5327d92013-02-07 16:23:11 +02003455 if (*p > unimax || *p < 0) {
3456 endinpos = s - starts + Py_UNICODE_SIZE;
3457 reason = "illegal code point (> 0x10FFFF)";
3458 goto error;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003459 }
Serhiy Storchakad5327d92013-02-07 16:23:11 +02003460#endif
3461 p++;
3462 s += Py_UNICODE_SIZE;
3463 continue;
3464
3465 error:
3466 startinpos = s - starts;
3467 outpos = p - PyUnicode_AS_UNICODE(v);
3468 if (unicode_decode_call_errorhandler(
3469 errors, &errorHandler,
3470 "unicode_internal", reason,
3471 starts, size, &startinpos, &endinpos, &exc, &s,
3472 &v, &outpos, &p)) {
3473 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003474 }
3475 }
3476
Martin v. Löwis412fb672006-04-13 06:34:32 +00003477 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003478 goto onError;
3479 Py_XDECREF(errorHandler);
3480 Py_XDECREF(exc);
3481 return (PyObject *)v;
3482
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003483 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003484 Py_XDECREF(v);
3485 Py_XDECREF(errorHandler);
3486 Py_XDECREF(exc);
3487 return NULL;
3488}
3489
Guido van Rossumd57fd912000-03-10 22:53:23 +00003490/* --- Latin-1 Codec ------------------------------------------------------ */
3491
3492PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003493 Py_ssize_t size,
3494 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003495{
3496 PyUnicodeObject *v;
3497 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003498
Guido van Rossumd57fd912000-03-10 22:53:23 +00003499 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003500 if (size == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003501 Py_UNICODE r = *(unsigned char*)s;
3502 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003503 }
3504
Guido van Rossumd57fd912000-03-10 22:53:23 +00003505 v = _PyUnicode_New(size);
3506 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003507 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003508 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003509 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003510 p = PyUnicode_AS_UNICODE(v);
3511 while (size-- > 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003512 *p++ = (unsigned char)*s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003513 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003514
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003515 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003516 Py_XDECREF(v);
3517 return NULL;
3518}
3519
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003520/* create or adjust a UnicodeEncodeError */
3521static void make_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003522 const char *encoding,
3523 const Py_UNICODE *unicode, Py_ssize_t size,
3524 Py_ssize_t startpos, Py_ssize_t endpos,
3525 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003526{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003527 if (*exceptionObject == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003528 *exceptionObject = PyUnicodeEncodeError_Create(
3529 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003530 }
3531 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003532 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3533 goto onError;
3534 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3535 goto onError;
3536 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3537 goto onError;
3538 return;
3539 onError:
Serhiy Storchaka98a97222014-02-09 13:14:04 +02003540 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003541 }
3542}
3543
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003544/* raises a UnicodeEncodeError */
3545static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003546 const char *encoding,
3547 const Py_UNICODE *unicode, Py_ssize_t size,
3548 Py_ssize_t startpos, Py_ssize_t endpos,
3549 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003550{
3551 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003552 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003553 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003554 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003555}
3556
3557/* error handling callback helper:
3558 build arguments, call the callback and check the arguments,
3559 put the result into newpos and return the replacement string, which
3560 has to be freed by the caller */
3561static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003562 PyObject **errorHandler,
3563 const char *encoding, const char *reason,
3564 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3565 Py_ssize_t startpos, Py_ssize_t endpos,
3566 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003567{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003568 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003569
3570 PyObject *restuple;
3571 PyObject *resunicode;
3572
3573 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003574 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003575 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003576 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003577 }
3578
3579 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003580 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003581 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003582 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003583
3584 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003585 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003586 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003587 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003588 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00003589 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003590 Py_DECREF(restuple);
3591 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003592 }
3593 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003594 &resunicode, newpos)) {
3595 Py_DECREF(restuple);
3596 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003597 }
3598 if (*newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003599 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003600 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003601 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3602 Py_DECREF(restuple);
3603 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003604 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003605 Py_INCREF(resunicode);
3606 Py_DECREF(restuple);
3607 return resunicode;
3608}
3609
3610static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003611 Py_ssize_t size,
3612 const char *errors,
3613 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003614{
3615 /* output object */
3616 PyObject *res;
3617 /* pointers to the beginning and end+1 of input */
3618 const Py_UNICODE *startp = p;
3619 const Py_UNICODE *endp = p + size;
3620 /* pointer to the beginning of the unencodable characters */
3621 /* const Py_UNICODE *badp = NULL; */
3622 /* pointer into the output */
3623 char *str;
3624 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003625 Py_ssize_t respos = 0;
3626 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00003627 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3628 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003629 PyObject *errorHandler = NULL;
3630 PyObject *exc = NULL;
3631 /* the following variable is used for caching string comparisons
3632 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3633 int known_errorHandler = -1;
3634
3635 /* allocate enough for a simple encoding without
3636 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003637 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003638 if (res == NULL)
3639 goto onError;
3640 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003641 return res;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003642 str = PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003643 ressize = size;
3644
3645 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003646 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003647
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003648 /* can we encode this? */
3649 if (c<limit) {
3650 /* no overflow check, because we know that the space is enough */
3651 *str++ = (char)c;
3652 ++p;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003653 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003654 else {
3655 Py_ssize_t unicodepos = p-startp;
3656 Py_ssize_t requiredsize;
3657 PyObject *repunicode;
3658 Py_ssize_t repsize;
3659 Py_ssize_t newpos;
3660 Py_ssize_t respos;
3661 Py_UNICODE *uni2;
3662 /* startpos for collecting unencodable chars */
3663 const Py_UNICODE *collstart = p;
3664 const Py_UNICODE *collend = p;
3665 /* find all unecodable characters */
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003666 while ((collend < endp) && ((*collend) >= limit))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003667 ++collend;
3668 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3669 if (known_errorHandler==-1) {
3670 if ((errors==NULL) || (!strcmp(errors, "strict")))
3671 known_errorHandler = 1;
3672 else if (!strcmp(errors, "replace"))
3673 known_errorHandler = 2;
3674 else if (!strcmp(errors, "ignore"))
3675 known_errorHandler = 3;
3676 else if (!strcmp(errors, "xmlcharrefreplace"))
3677 known_errorHandler = 4;
3678 else
3679 known_errorHandler = 0;
3680 }
3681 switch (known_errorHandler) {
3682 case 1: /* strict */
3683 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3684 goto onError;
3685 case 2: /* replace */
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003686 while (collstart++ < collend)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003687 *str++ = '?'; /* fall through */
3688 case 3: /* ignore */
3689 p = collend;
3690 break;
3691 case 4: /* xmlcharrefreplace */
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003692 respos = str - PyString_AS_STRING(res);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003693 /* determine replacement size (temporarily (mis)uses p) */
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003694 requiredsize = respos;
3695 for (p = collstart; p < collend;) {
Serhiy Storchakae822b032013-08-06 16:56:26 +03003696 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003697 Py_ssize_t incr;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003698 if (ch < 10)
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003699 incr = 2+1+1;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003700 else if (ch < 100)
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003701 incr = 2+2+1;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003702 else if (ch < 1000)
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003703 incr = 2+3+1;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003704 else if (ch < 10000)
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003705 incr = 2+4+1;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003706 else if (ch < 100000)
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003707 incr = 2+5+1;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003708 else if (ch < 1000000)
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003709 incr = 2+6+1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003710 else
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003711 incr = 2+7+1;
3712 if (requiredsize > PY_SSIZE_T_MAX - incr)
3713 goto overflow;
3714 requiredsize += incr;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003715 }
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003716 if (requiredsize > PY_SSIZE_T_MAX - (endp - collend))
3717 goto overflow;
3718 requiredsize += endp - collend;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003719 if (requiredsize > ressize) {
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003720 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003721 requiredsize = 2*ressize;
3722 if (_PyString_Resize(&res, requiredsize))
3723 goto onError;
3724 str = PyString_AS_STRING(res) + respos;
3725 ressize = requiredsize;
3726 }
3727 /* generate replacement (temporarily (mis)uses p) */
Serhiy Storchakae822b032013-08-06 16:56:26 +03003728 for (p = collstart; p < collend;) {
3729 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
3730 str += sprintf(str, "&#%d;", (int)ch);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003731 }
3732 p = collend;
3733 break;
3734 default:
3735 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3736 encoding, reason, startp, size, &exc,
3737 collstart-startp, collend-startp, &newpos);
3738 if (repunicode == NULL)
3739 goto onError;
3740 /* need more space? (at least enough for what we have+the
3741 replacement+the rest of the string, so we won't have to
3742 check space for encodable characters) */
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003743 respos = str - PyString_AS_STRING(res);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003744 repsize = PyUnicode_GET_SIZE(repunicode);
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003745 if (respos > PY_SSIZE_T_MAX - repsize)
3746 goto overflow;
3747 requiredsize = respos + repsize;
3748 if (requiredsize > PY_SSIZE_T_MAX - (endp - collend))
3749 goto overflow;
3750 requiredsize += endp - collend;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003751 if (requiredsize > ressize) {
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003752 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003753 requiredsize = 2*ressize;
3754 if (_PyString_Resize(&res, requiredsize)) {
3755 Py_DECREF(repunicode);
3756 goto onError;
3757 }
3758 str = PyString_AS_STRING(res) + respos;
3759 ressize = requiredsize;
3760 }
3761 /* check if there is anything unencodable in the replacement
3762 and copy it to the output */
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003763 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2, ++str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003764 c = *uni2;
3765 if (c >= limit) {
3766 raise_encode_exception(&exc, encoding, startp, size,
3767 unicodepos, unicodepos+1, reason);
3768 Py_DECREF(repunicode);
3769 goto onError;
3770 }
3771 *str = (char)c;
3772 }
3773 p = startp + newpos;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003774 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00003775 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00003776 }
3777 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003778 /* Resize if we allocated to much */
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003779 respos = str - PyString_AS_STRING(res);
3780 if (respos < ressize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003781 /* If this falls res will be NULL */
3782 _PyString_Resize(&res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003783 Py_XDECREF(errorHandler);
3784 Py_XDECREF(exc);
3785 return res;
3786
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003787 overflow:
3788 PyErr_SetString(PyExc_OverflowError,
3789 "encoded result is too long for a Python string");
3790
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003791 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003792 Py_XDECREF(res);
3793 Py_XDECREF(errorHandler);
3794 Py_XDECREF(exc);
3795 return NULL;
3796}
3797
Guido van Rossumd57fd912000-03-10 22:53:23 +00003798PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003799 Py_ssize_t size,
3800 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003801{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003802 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003803}
3804
3805PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3806{
3807 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003808 PyErr_BadArgument();
3809 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003810 }
3811 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003812 PyUnicode_GET_SIZE(unicode),
3813 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003814}
3815
3816/* --- 7-bit ASCII Codec -------------------------------------------------- */
3817
Guido van Rossumd57fd912000-03-10 22:53:23 +00003818PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003819 Py_ssize_t size,
3820 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003821{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003822 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003823 PyUnicodeObject *v;
3824 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003825 Py_ssize_t startinpos;
3826 Py_ssize_t endinpos;
3827 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003828 const char *e;
3829 PyObject *errorHandler = NULL;
3830 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003831
Guido van Rossumd57fd912000-03-10 22:53:23 +00003832 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003833 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003834 Py_UNICODE r = *(unsigned char*)s;
3835 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003836 }
Tim Petersced69f82003-09-16 20:30:58 +00003837
Guido van Rossumd57fd912000-03-10 22:53:23 +00003838 v = _PyUnicode_New(size);
3839 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003840 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003841 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003842 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003843 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003844 e = s + size;
3845 while (s < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003846 register unsigned char c = (unsigned char)*s;
3847 if (c < 128) {
3848 *p++ = c;
3849 ++s;
3850 }
3851 else {
3852 startinpos = s-starts;
3853 endinpos = startinpos + 1;
3854 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3855 if (unicode_decode_call_errorhandler(
3856 errors, &errorHandler,
3857 "ascii", "ordinal not in range(128)",
3858 starts, size, &startinpos, &endinpos, &exc, &s,
3859 &v, &outpos, &p))
3860 goto onError;
3861 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003862 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003863 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003864 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3865 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003866 Py_XDECREF(errorHandler);
3867 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003868 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003869
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003870 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003871 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003872 Py_XDECREF(errorHandler);
3873 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003874 return NULL;
3875}
3876
Guido van Rossumd57fd912000-03-10 22:53:23 +00003877PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003878 Py_ssize_t size,
3879 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003880{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003881 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003882}
3883
3884PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3885{
3886 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003887 PyErr_BadArgument();
3888 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003889 }
3890 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003891 PyUnicode_GET_SIZE(unicode),
3892 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003893}
3894
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003895#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003896
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003897/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003898
Hirokazu Yamamoto52a34922009-03-21 10:32:52 +00003899#if SIZEOF_INT < SIZEOF_SIZE_T
Martin v. Löwisd8251432006-06-14 05:21:04 +00003900#define NEED_RETRY
3901#endif
3902
3903/* XXX This code is limited to "true" double-byte encodings, as
3904 a) it assumes an incomplete character consists of a single byte, and
3905 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003906 encodings, see IsDBCSLeadByteEx documentation. */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003907
/* Return nonzero when the byte at s[offset] should be treated as a DBCS
   lead byte.  The byte must pass IsDBCSLeadByte(); it is then accepted if
   CharPrev() reports no earlier character (prev == curr), if the previous
   character does not start with a lead byte, or if the previous character
   is two bytes long.  Returns 0 otherwise. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
}
3918
3919/*
3920 * Decode MBCS string into unicode object. If 'final' is set, converts
3921 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3922 */
3923static int decode_mbcs(PyUnicodeObject **v,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003924 const char *s, /* MBCS string */
3925 int size, /* sizeof MBCS string */
3926 int final)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003927{
3928 Py_UNICODE *p;
3929 Py_ssize_t n = 0;
3930 int usize = 0;
3931
3932 assert(size >= 0);
3933
3934 /* Skip trailing lead-byte unless 'final' is set */
3935 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003936 --size;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003937
3938 /* First get the size of the result */
3939 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003940 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3941 if (usize == 0) {
3942 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3943 return -1;
3944 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003945 }
3946
3947 if (*v == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003948 /* Create unicode object */
3949 *v = _PyUnicode_New(usize);
3950 if (*v == NULL)
3951 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003952 }
3953 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003954 /* Extend unicode object */
3955 n = PyUnicode_GET_SIZE(*v);
3956 if (_PyUnicode_Resize(v, n + usize) < 0)
3957 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003958 }
3959
3960 /* Do the conversion */
3961 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003962 p = PyUnicode_AS_UNICODE(*v) + n;
3963 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3964 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3965 return -1;
3966 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003967 }
3968
3969 return size;
3970}
3971
3972PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003973 Py_ssize_t size,
3974 const char *errors,
3975 Py_ssize_t *consumed)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003976{
3977 PyUnicodeObject *v = NULL;
3978 int done;
3979
3980 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003981 *consumed = 0;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003982
3983#ifdef NEED_RETRY
3984 retry:
3985 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003986 done = decode_mbcs(&v, s, INT_MAX, 0);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003987 else
3988#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003989 done = decode_mbcs(&v, s, (int)size, !consumed);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003990
3991 if (done < 0) {
3992 Py_XDECREF(v);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003993 return NULL;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003994 }
3995
3996 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003997 *consumed += done;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003998
3999#ifdef NEED_RETRY
4000 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004001 s += done;
4002 size -= done;
4003 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004004 }
4005#endif
4006
4007 return (PyObject *)v;
4008}
4009
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004010PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004011 Py_ssize_t size,
4012 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004013{
Martin v. Löwisd8251432006-06-14 05:21:04 +00004014 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4015}
4016
4017/*
4018 * Convert unicode into string object (MBCS).
4019 * Returns 0 if succeed, -1 otherwise.
4020 */
4021static int encode_mbcs(PyObject **repr,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004022 const Py_UNICODE *p, /* unicode */
4023 int size) /* size of unicode */
Martin v. Löwisd8251432006-06-14 05:21:04 +00004024{
4025 int mbcssize = 0;
4026 Py_ssize_t n = 0;
4027
4028 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004029
4030 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00004031 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004032 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4033 if (mbcssize == 0) {
4034 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4035 return -1;
4036 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004037 }
4038
Martin v. Löwisd8251432006-06-14 05:21:04 +00004039 if (*repr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004040 /* Create string object */
4041 *repr = PyString_FromStringAndSize(NULL, mbcssize);
4042 if (*repr == NULL)
4043 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004044 }
4045 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004046 /* Extend string object */
4047 n = PyString_Size(*repr);
4048 if (_PyString_Resize(repr, n + mbcssize) < 0)
4049 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004050 }
4051
4052 /* Do the conversion */
4053 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004054 char *s = PyString_AS_STRING(*repr) + n;
4055 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4056 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4057 return -1;
4058 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00004059 }
4060
4061 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004062}
4063
4064PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004065 Py_ssize_t size,
4066 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004067{
Martin v. Löwisd8251432006-06-14 05:21:04 +00004068 PyObject *repr = NULL;
4069 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004070
Martin v. Löwisd8251432006-06-14 05:21:04 +00004071#ifdef NEED_RETRY
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004072 retry:
Martin v. Löwisd8251432006-06-14 05:21:04 +00004073 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004074 ret = encode_mbcs(&repr, p, INT_MAX);
Martin v. Löwisd8251432006-06-14 05:21:04 +00004075 else
4076#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004077 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004078
Martin v. Löwisd8251432006-06-14 05:21:04 +00004079 if (ret < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004080 Py_XDECREF(repr);
4081 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004082 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00004083
4084#ifdef NEED_RETRY
4085 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004086 p += INT_MAX;
4087 size -= INT_MAX;
4088 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004089 }
4090#endif
4091
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004092 return repr;
4093}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004094
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004095PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4096{
4097 if (!PyUnicode_Check(unicode)) {
4098 PyErr_BadArgument();
4099 return NULL;
4100 }
4101 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004102 PyUnicode_GET_SIZE(unicode),
4103 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004104}
4105
Martin v. Löwisd8251432006-06-14 05:21:04 +00004106#undef NEED_RETRY
4107
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004108#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004109
Guido van Rossumd57fd912000-03-10 22:53:23 +00004110/* --- Character Mapping Codec -------------------------------------------- */
4111
Guido van Rossumd57fd912000-03-10 22:53:23 +00004112PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004113 Py_ssize_t size,
4114 PyObject *mapping,
4115 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004116{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004117 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004118 Py_ssize_t startinpos;
4119 Py_ssize_t endinpos;
4120 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004121 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004122 PyUnicodeObject *v;
4123 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004124 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004125 PyObject *errorHandler = NULL;
4126 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004127 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004128 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004129
Guido van Rossumd57fd912000-03-10 22:53:23 +00004130 /* Default to Latin-1 */
4131 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004132 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004133
4134 v = _PyUnicode_New(size);
4135 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004136 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004137 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004138 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004139 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004140 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004141 if (PyUnicode_CheckExact(mapping)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004142 mapstring = PyUnicode_AS_UNICODE(mapping);
4143 maplen = PyUnicode_GET_SIZE(mapping);
4144 while (s < e) {
4145 unsigned char ch = *s;
4146 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004147
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004148 if (ch < maplen)
4149 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004150
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004151 if (x == 0xfffe) {
4152 /* undefined mapping */
4153 outpos = p-PyUnicode_AS_UNICODE(v);
4154 startinpos = s-starts;
4155 endinpos = startinpos+1;
4156 if (unicode_decode_call_errorhandler(
4157 errors, &errorHandler,
4158 "charmap", "character maps to <undefined>",
4159 starts, size, &startinpos, &endinpos, &exc, &s,
4160 &v, &outpos, &p)) {
4161 goto onError;
4162 }
4163 continue;
4164 }
4165 *p++ = x;
4166 ++s;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004167 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004168 }
4169 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004170 while (s < e) {
4171 unsigned char ch = *s;
4172 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004173
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004174 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4175 w = PyInt_FromLong((long)ch);
4176 if (w == NULL)
4177 goto onError;
4178 x = PyObject_GetItem(mapping, w);
4179 Py_DECREF(w);
4180 if (x == NULL) {
4181 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4182 /* No mapping found means: mapping is undefined. */
4183 PyErr_Clear();
Serhiy Storchaka95997452013-01-15 14:42:59 +02004184 goto Undefined;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004185 } else
4186 goto onError;
4187 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004188
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004189 /* Apply mapping */
Serhiy Storchaka95997452013-01-15 14:42:59 +02004190 if (x == Py_None)
4191 goto Undefined;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004192 if (PyInt_Check(x)) {
4193 long value = PyInt_AS_LONG(x);
Serhiy Storchaka95997452013-01-15 14:42:59 +02004194 if (value == 0xFFFE)
4195 goto Undefined;
Antoine Pitroue3ae3212012-11-17 21:14:58 +01004196 if (value < 0 || value > 0x10FFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004197 PyErr_SetString(PyExc_TypeError,
Antoine Pitroue3ae3212012-11-17 21:14:58 +01004198 "character mapping must be in range(0x110000)");
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004199 Py_DECREF(x);
4200 goto onError;
4201 }
Antoine Pitroue3ae3212012-11-17 21:14:58 +01004202
4203#ifndef Py_UNICODE_WIDE
4204 if (value > 0xFFFF) {
4205 /* see the code for 1-n mapping below */
4206 if (extrachars < 2) {
4207 /* resize first */
4208 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4209 Py_ssize_t needed = 10 - extrachars;
4210 extrachars += needed;
4211 /* XXX overflow detection missing */
4212 if (_PyUnicode_Resize(&v,
4213 PyUnicode_GET_SIZE(v) + needed) < 0) {
4214 Py_DECREF(x);
4215 goto onError;
4216 }
4217 p = PyUnicode_AS_UNICODE(v) + oldpos;
4218 }
4219 value -= 0x10000;
4220 *p++ = 0xD800 | (value >> 10);
4221 *p++ = 0xDC00 | (value & 0x3FF);
4222 extrachars -= 2;
4223 }
4224 else
4225#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004226 *p++ = (Py_UNICODE)value;
4227 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004228 else if (PyUnicode_Check(x)) {
4229 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004230
Serhiy Storchaka95997452013-01-15 14:42:59 +02004231 if (targetsize == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004232 /* 1-1 mapping */
Serhiy Storchaka95997452013-01-15 14:42:59 +02004233 Py_UNICODE value = *PyUnicode_AS_UNICODE(x);
4234 if (value == 0xFFFE)
4235 goto Undefined;
4236 *p++ = value;
4237 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004238 else if (targetsize > 1) {
4239 /* 1-n mapping */
4240 if (targetsize > extrachars) {
4241 /* resize first */
4242 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4243 Py_ssize_t needed = (targetsize - extrachars) + \
4244 (targetsize << 2);
4245 extrachars += needed;
4246 /* XXX overflow detection missing */
4247 if (_PyUnicode_Resize(&v,
4248 PyUnicode_GET_SIZE(v) + needed) < 0) {
4249 Py_DECREF(x);
4250 goto onError;
4251 }
4252 p = PyUnicode_AS_UNICODE(v) + oldpos;
4253 }
4254 Py_UNICODE_COPY(p,
4255 PyUnicode_AS_UNICODE(x),
4256 targetsize);
4257 p += targetsize;
4258 extrachars -= targetsize;
4259 }
4260 /* 1-0 mapping: skip the character */
4261 }
4262 else {
4263 /* wrong return value */
4264 PyErr_SetString(PyExc_TypeError,
4265 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004266 Py_DECREF(x);
4267 goto onError;
4268 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004269 Py_DECREF(x);
4270 ++s;
Serhiy Storchaka95997452013-01-15 14:42:59 +02004271 continue;
4272Undefined:
4273 /* undefined mapping */
4274 Py_XDECREF(x);
4275 outpos = p-PyUnicode_AS_UNICODE(v);
4276 startinpos = s-starts;
4277 endinpos = startinpos+1;
4278 if (unicode_decode_call_errorhandler(
4279 errors, &errorHandler,
4280 "charmap", "character maps to <undefined>",
4281 starts, size, &startinpos, &endinpos, &exc, &s,
4282 &v, &outpos, &p)) {
4283 goto onError;
4284 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004285 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004286 }
4287 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004288 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4289 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004290 Py_XDECREF(errorHandler);
4291 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004292 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004293
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004294 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004295 Py_XDECREF(errorHandler);
4296 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004297 Py_XDECREF(v);
4298 return NULL;
4299}
4300
Martin v. Löwis3f767792006-06-04 19:36:28 +00004301/* Charmap encoding: the lookup table */
4302
4303struct encoding_map{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004304 PyObject_HEAD
4305 unsigned char level1[32];
4306 int count2, count3;
4307 unsigned char level23[1];
Martin v. Löwis3f767792006-06-04 19:36:28 +00004308};
4309
4310static PyObject*
4311encoding_map_size(PyObject *obj, PyObject* args)
4312{
4313 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004314 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Martin v. Löwis3f767792006-06-04 19:36:28 +00004315 128*map->count3);
4316}
4317
4318static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004319 {"size", encoding_map_size, METH_NOARGS,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004320 PyDoc_STR("Return the size (in bytes) of this object") },
4321 { 0 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004322};
4323
4324static void
4325encoding_map_dealloc(PyObject* o)
4326{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004327 PyObject_FREE(o);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004328}
4329
4330static PyTypeObject EncodingMapType = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004331 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004332 "EncodingMap", /*tp_name*/
4333 sizeof(struct encoding_map), /*tp_basicsize*/
4334 0, /*tp_itemsize*/
4335 /* methods */
4336 encoding_map_dealloc, /*tp_dealloc*/
4337 0, /*tp_print*/
4338 0, /*tp_getattr*/
4339 0, /*tp_setattr*/
4340 0, /*tp_compare*/
4341 0, /*tp_repr*/
4342 0, /*tp_as_number*/
4343 0, /*tp_as_sequence*/
4344 0, /*tp_as_mapping*/
4345 0, /*tp_hash*/
4346 0, /*tp_call*/
4347 0, /*tp_str*/
4348 0, /*tp_getattro*/
4349 0, /*tp_setattro*/
4350 0, /*tp_as_buffer*/
4351 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4352 0, /*tp_doc*/
4353 0, /*tp_traverse*/
4354 0, /*tp_clear*/
4355 0, /*tp_richcompare*/
4356 0, /*tp_weaklistoffset*/
4357 0, /*tp_iter*/
4358 0, /*tp_iternext*/
4359 encoding_map_methods, /*tp_methods*/
4360 0, /*tp_members*/
4361 0, /*tp_getset*/
4362 0, /*tp_base*/
4363 0, /*tp_dict*/
4364 0, /*tp_descr_get*/
4365 0, /*tp_descr_set*/
4366 0, /*tp_dictoffset*/
4367 0, /*tp_init*/
4368 0, /*tp_alloc*/
4369 0, /*tp_new*/
4370 0, /*tp_free*/
4371 0, /*tp_is_gc*/
Martin v. Löwis3f767792006-06-04 19:36:28 +00004372};
4373
4374PyObject*
4375PyUnicode_BuildEncodingMap(PyObject* string)
4376{
4377 Py_UNICODE *decode;
4378 PyObject *result;
4379 struct encoding_map *mresult;
4380 int i;
4381 int need_dict = 0;
4382 unsigned char level1[32];
4383 unsigned char level2[512];
4384 unsigned char *mlevel1, *mlevel2, *mlevel3;
4385 int count2 = 0, count3 = 0;
4386
4387 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4388 PyErr_BadArgument();
4389 return NULL;
4390 }
4391 decode = PyUnicode_AS_UNICODE(string);
4392 memset(level1, 0xFF, sizeof level1);
4393 memset(level2, 0xFF, sizeof level2);
4394
4395 /* If there isn't a one-to-one mapping of NULL to \0,
4396 or if there are non-BMP characters, we need to use
4397 a mapping dictionary. */
4398 if (decode[0] != 0)
4399 need_dict = 1;
4400 for (i = 1; i < 256; i++) {
4401 int l1, l2;
4402 if (decode[i] == 0
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004403#ifdef Py_UNICODE_WIDE
Martin v. Löwis3f767792006-06-04 19:36:28 +00004404 || decode[i] > 0xFFFF
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004405#endif
4406 ) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004407 need_dict = 1;
4408 break;
4409 }
4410 if (decode[i] == 0xFFFE)
4411 /* unmapped character */
4412 continue;
4413 l1 = decode[i] >> 11;
4414 l2 = decode[i] >> 7;
4415 if (level1[l1] == 0xFF)
4416 level1[l1] = count2++;
4417 if (level2[l2] == 0xFF)
Benjamin Peterson857ce152009-01-31 16:29:18 +00004418 level2[l2] = count3++;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004419 }
4420
4421 if (count2 >= 0xFF || count3 >= 0xFF)
4422 need_dict = 1;
4423
4424 if (need_dict) {
4425 PyObject *result = PyDict_New();
4426 PyObject *key, *value;
4427 if (!result)
4428 return NULL;
4429 for (i = 0; i < 256; i++) {
Brett Cannona7f13ee2010-05-04 01:16:51 +00004430 value = NULL;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004431 key = PyInt_FromLong(decode[i]);
4432 value = PyInt_FromLong(i);
4433 if (!key || !value)
4434 goto failed1;
4435 if (PyDict_SetItem(result, key, value) == -1)
4436 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00004437 Py_DECREF(key);
4438 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004439 }
4440 return result;
4441 failed1:
4442 Py_XDECREF(key);
4443 Py_XDECREF(value);
4444 Py_DECREF(result);
4445 return NULL;
4446 }
4447
4448 /* Create a three-level trie */
4449 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4450 16*count2 + 128*count3 - 1);
4451 if (!result)
4452 return PyErr_NoMemory();
4453 PyObject_Init(result, &EncodingMapType);
4454 mresult = (struct encoding_map*)result;
4455 mresult->count2 = count2;
4456 mresult->count3 = count3;
4457 mlevel1 = mresult->level1;
4458 mlevel2 = mresult->level23;
4459 mlevel3 = mresult->level23 + 16*count2;
4460 memcpy(mlevel1, level1, 32);
4461 memset(mlevel2, 0xFF, 16*count2);
4462 memset(mlevel3, 0, 128*count3);
4463 count3 = 0;
4464 for (i = 1; i < 256; i++) {
4465 int o1, o2, o3, i2, i3;
4466 if (decode[i] == 0xFFFE)
4467 /* unmapped character */
4468 continue;
4469 o1 = decode[i]>>11;
4470 o2 = (decode[i]>>7) & 0xF;
4471 i2 = 16*mlevel1[o1] + o2;
4472 if (mlevel2[i2] == 0xFF)
4473 mlevel2[i2] = count3++;
4474 o3 = decode[i] & 0x7F;
4475 i3 = 128*mlevel2[i2] + o3;
4476 mlevel3[i3] = i;
4477 }
4478 return result;
4479}
4480
4481static int
4482encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4483{
4484 struct encoding_map *map = (struct encoding_map*)mapping;
4485 int l1 = c>>11;
4486 int l2 = (c>>7) & 0xF;
4487 int l3 = c & 0x7F;
4488 int i;
4489
4490#ifdef Py_UNICODE_WIDE
4491 if (c > 0xFFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004492 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004493 }
4494#endif
4495 if (c == 0)
4496 return 0;
4497 /* level 1*/
4498 i = map->level1[l1];
4499 if (i == 0xFF) {
4500 return -1;
4501 }
4502 /* level 2*/
4503 i = map->level23[16*i+l2];
4504 if (i == 0xFF) {
4505 return -1;
4506 }
4507 /* level 3 */
4508 i = map->level23[16*map->count2 + 128*i + l3];
4509 if (i == 0) {
4510 return -1;
4511 }
4512 return i;
4513}
4514
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004515/* Lookup the character ch in the mapping. If the character
4516 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004517 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004518static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004519{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004520 PyObject *w = PyInt_FromLong((long)c);
4521 PyObject *x;
4522
4523 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004524 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004525 x = PyObject_GetItem(mapping, w);
4526 Py_DECREF(w);
4527 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004528 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4529 /* No mapping found means: mapping is undefined. */
4530 PyErr_Clear();
4531 x = Py_None;
4532 Py_INCREF(x);
4533 return x;
4534 } else
4535 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004536 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004537 else if (x == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004538 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004539 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004540 long value = PyInt_AS_LONG(x);
4541 if (value < 0 || value > 255) {
4542 PyErr_SetString(PyExc_TypeError,
4543 "character mapping must be in range(256)");
4544 Py_DECREF(x);
4545 return NULL;
4546 }
4547 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004548 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004549 else if (PyString_Check(x))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004550 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004551 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004552 /* wrong return value */
4553 PyErr_SetString(PyExc_TypeError,
4554 "character mapping must return integer, None or str");
4555 Py_DECREF(x);
4556 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004557 }
4558}
4559
Martin v. Löwis3f767792006-06-04 19:36:28 +00004560static int
4561charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4562{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004563 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4564 /* exponentially overallocate to minimize reallocations */
4565 if (requiredsize < 2*outsize)
4566 requiredsize = 2*outsize;
4567 if (_PyString_Resize(outobj, requiredsize)) {
4568 return 0;
4569 }
4570 return 1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004571}
4572
/* Outcome of trying to encode a single character. */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was written to the output */
    enc_FAILED,         /* the mapping has no entry for the character */
    enc_EXCEPTION       /* lookup or resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004576/* lookup the character, put the result in the output string and adjust
4577 various state variables. Reallocate the output string if not enough
4578 space is available. Return a new reference to the object that
4579 was put in the output buffer, or Py_None, if the mapping was undefined
4580 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004581 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004582static
Martin v. Löwis3f767792006-06-04 19:36:28 +00004583charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004584 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004585{
Martin v. Löwis3f767792006-06-04 19:36:28 +00004586 PyObject *rep;
4587 char *outstart;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004588 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004589
Christian Heimese93237d2007-12-19 02:37:44 +00004590 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004591 int res = encoding_map_lookup(c, mapping);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004592 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004593 if (res == -1)
4594 return enc_FAILED;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004595 if (outsize<requiredsize)
4596 if (!charmapencode_resize(outobj, outpos, requiredsize))
4597 return enc_EXCEPTION;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004598 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004599 outstart[(*outpos)++] = (char)res;
4600 return enc_SUCCESS;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004601 }
4602
4603 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004604 if (rep==NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004605 return enc_EXCEPTION;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004606 else if (rep==Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004607 Py_DECREF(rep);
4608 return enc_FAILED;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004609 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004610 if (PyInt_Check(rep)) {
4611 Py_ssize_t requiredsize = *outpos+1;
4612 if (outsize<requiredsize)
4613 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4614 Py_DECREF(rep);
4615 return enc_EXCEPTION;
4616 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004617 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004618 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004619 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004620 else {
4621 const char *repchars = PyString_AS_STRING(rep);
4622 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4623 Py_ssize_t requiredsize = *outpos+repsize;
4624 if (outsize<requiredsize)
4625 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4626 Py_DECREF(rep);
4627 return enc_EXCEPTION;
4628 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004629 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004630 memcpy(outstart + *outpos, repchars, repsize);
4631 *outpos += repsize;
4632 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004633 }
Georg Brandl9f167602006-06-04 21:46:16 +00004634 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004635 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004636}
4637
4638/* handle an error in PyUnicode_EncodeCharmap
4639 Return 0 on success, -1 on error */
4640static
4641int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004642 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004643 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004644 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004645 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004646{
4647 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004648 Py_ssize_t repsize;
4649 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004650 Py_UNICODE *uni2;
4651 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004652 Py_ssize_t collstartpos = *inpos;
4653 Py_ssize_t collendpos = *inpos+1;
4654 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004655 char *encoding = "charmap";
4656 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00004657 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004658
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004659 /* find all unencodable characters */
4660 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004661 PyObject *rep;
Christian Heimese93237d2007-12-19 02:37:44 +00004662 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004663 int res = encoding_map_lookup(p[collendpos], mapping);
4664 if (res != -1)
4665 break;
4666 ++collendpos;
4667 continue;
4668 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004669
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004670 rep = charmapencode_lookup(p[collendpos], mapping);
4671 if (rep==NULL)
4672 return -1;
4673 else if (rep!=Py_None) {
4674 Py_DECREF(rep);
4675 break;
4676 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004677 Py_DECREF(rep);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004678 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004679 }
4680 /* cache callback name lookup
4681 * (if not done yet, i.e. it's the first error) */
4682 if (*known_errorHandler==-1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004683 if ((errors==NULL) || (!strcmp(errors, "strict")))
4684 *known_errorHandler = 1;
4685 else if (!strcmp(errors, "replace"))
4686 *known_errorHandler = 2;
4687 else if (!strcmp(errors, "ignore"))
4688 *known_errorHandler = 3;
4689 else if (!strcmp(errors, "xmlcharrefreplace"))
4690 *known_errorHandler = 4;
4691 else
4692 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004693 }
4694 switch (*known_errorHandler) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004695 case 1: /* strict */
4696 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4697 return -1;
4698 case 2: /* replace */
4699 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004700 x = charmapencode_output('?', mapping, res, respos);
4701 if (x==enc_EXCEPTION) {
4702 return -1;
4703 }
4704 else if (x==enc_FAILED) {
4705 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4706 return -1;
4707 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004708 }
4709 /* fall through */
4710 case 3: /* ignore */
4711 *inpos = collendpos;
4712 break;
4713 case 4: /* xmlcharrefreplace */
Serhiy Storchakae822b032013-08-06 16:56:26 +03004714 /* generate replacement */
4715 for (collpos = collstartpos; collpos < collendpos;) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004716 char buffer[2+29+1+1];
4717 char *cp;
Serhiy Storchakae822b032013-08-06 16:56:26 +03004718 Py_UCS4 ch = p[collpos++];
4719#ifndef Py_UNICODE_WIDE
4720 if ((0xD800 <= ch && ch <= 0xDBFF) &&
4721 (collpos < collendpos) &&
4722 (0xDC00 <= p[collpos] && p[collpos] <= 0xDFFF)) {
4723 ch = ((((ch & 0x03FF) << 10) |
4724 ((Py_UCS4)p[collpos++] & 0x03FF)) + 0x10000);
4725 }
4726#endif
4727 sprintf(buffer, "&#%d;", (int)ch);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004728 for (cp = buffer; *cp; ++cp) {
4729 x = charmapencode_output(*cp, mapping, res, respos);
4730 if (x==enc_EXCEPTION)
4731 return -1;
4732 else if (x==enc_FAILED) {
4733 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4734 return -1;
4735 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004736 }
4737 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004738 *inpos = collendpos;
4739 break;
4740 default:
4741 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004742 encoding, reason, p, size, exceptionObject,
4743 collstartpos, collendpos, &newpos);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004744 if (repunicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004745 return -1;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004746 /* generate replacement */
4747 repsize = PyUnicode_GET_SIZE(repunicode);
4748 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004749 x = charmapencode_output(*uni2, mapping, res, respos);
4750 if (x==enc_EXCEPTION) {
4751 return -1;
4752 }
4753 else if (x==enc_FAILED) {
4754 Py_DECREF(repunicode);
4755 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4756 return -1;
4757 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004758 }
4759 *inpos = newpos;
4760 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004761 }
4762 return 0;
4763}
4764
Guido van Rossumd57fd912000-03-10 22:53:23 +00004765PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004766 Py_ssize_t size,
4767 PyObject *mapping,
4768 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004769{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004770 /* output object */
4771 PyObject *res = NULL;
4772 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004773 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004774 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004775 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004776 PyObject *errorHandler = NULL;
4777 PyObject *exc = NULL;
4778 /* the following variable is used for caching string comparisons
4779 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4780 * 3=ignore, 4=xmlcharrefreplace */
4781 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004782
4783 /* Default to Latin-1 */
4784 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004785 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004786
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004787 /* allocate enough for a simple encoding without
4788 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004789 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004790 if (res == NULL)
4791 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004792 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004793 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004794
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004795 while (inpos<size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004796 /* try to encode it */
4797 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4798 if (x==enc_EXCEPTION) /* error */
4799 goto onError;
4800 if (x==enc_FAILED) { /* unencodable character */
4801 if (charmap_encoding_error(p, size, &inpos, mapping,
4802 &exc,
4803 &known_errorHandler, &errorHandler, errors,
4804 &res, &respos)) {
4805 goto onError;
4806 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004807 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004808 else
4809 /* done with this character => adjust input position */
4810 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004811 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004812
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004813 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004814 if (respos<PyString_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004815 if (_PyString_Resize(&res, respos))
4816 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004817 }
4818 Py_XDECREF(exc);
4819 Py_XDECREF(errorHandler);
4820 return res;
4821
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004822 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004823 Py_XDECREF(res);
4824 Py_XDECREF(exc);
4825 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004826 return NULL;
4827}
4828
4829PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004830 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004831{
4832 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004833 PyErr_BadArgument();
4834 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004835 }
4836 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004837 PyUnicode_GET_SIZE(unicode),
4838 mapping,
4839 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004840}
4841
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004842/* create or adjust a UnicodeTranslateError */
4843static void make_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004844 const Py_UNICODE *unicode, Py_ssize_t size,
4845 Py_ssize_t startpos, Py_ssize_t endpos,
4846 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004847{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004848 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004849 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004850 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004851 }
4852 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004853 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4854 goto onError;
4855 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4856 goto onError;
4857 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4858 goto onError;
4859 return;
4860 onError:
Serhiy Storchaka98a97222014-02-09 13:14:04 +02004861 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004862 }
4863}
4864
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004865/* raises a UnicodeTranslateError */
4866static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004867 const Py_UNICODE *unicode, Py_ssize_t size,
4868 Py_ssize_t startpos, Py_ssize_t endpos,
4869 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004870{
4871 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004872 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004873 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004874 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004875}
4876
4877/* error handling callback helper:
4878 build arguments, call the callback and check the arguments,
4879 put the result into newpos and return the replacement string, which
4880 has to be freed by the caller */
4881static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004882 PyObject **errorHandler,
4883 const char *reason,
4884 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4885 Py_ssize_t startpos, Py_ssize_t endpos,
4886 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004887{
Martin v. Löwis412fb672006-04-13 06:34:32 +00004888 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004889
Martin v. Löwis412fb672006-04-13 06:34:32 +00004890 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004891 PyObject *restuple;
4892 PyObject *resunicode;
4893
4894 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004895 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004896 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004897 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004898 }
4899
4900 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004901 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004902 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004903 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004904
4905 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004906 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004907 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004908 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004909 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00004910 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004911 Py_DECREF(restuple);
4912 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004913 }
4914 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004915 &resunicode, &i_newpos)) {
4916 Py_DECREF(restuple);
4917 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004918 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004919 if (i_newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004920 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004921 else
4922 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004923 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004924 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4925 Py_DECREF(restuple);
4926 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004927 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004928 Py_INCREF(resunicode);
4929 Py_DECREF(restuple);
4930 return resunicode;
4931}
4932
4933/* Lookup the character ch in the mapping and put the result in result,
4934 which must be decrefed by the caller.
4935 Return 0 on success, -1 on error */
4936static
4937int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4938{
4939 PyObject *w = PyInt_FromLong((long)c);
4940 PyObject *x;
4941
4942 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004943 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004944 x = PyObject_GetItem(mapping, w);
4945 Py_DECREF(w);
4946 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004947 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4948 /* No mapping found means: use 1:1 mapping. */
4949 PyErr_Clear();
4950 *result = NULL;
4951 return 0;
4952 } else
4953 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004954 }
4955 else if (x == Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004956 *result = x;
4957 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004958 }
4959 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004960 long value = PyInt_AS_LONG(x);
4961 long max = PyUnicode_GetMax();
4962 if (value < 0 || value > max) {
4963 PyErr_Format(PyExc_TypeError,
4964 "character mapping must be in range(0x%lx)", max+1);
4965 Py_DECREF(x);
4966 return -1;
4967 }
4968 *result = x;
4969 return 0;
4970 }
4971 else if (PyUnicode_Check(x)) {
4972 *result = x;
4973 return 0;
4974 }
4975 else {
4976 /* wrong return value */
4977 PyErr_SetString(PyExc_TypeError,
4978 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004979 Py_DECREF(x);
4980 return -1;
4981 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004982}
4983/* ensure that *outobj is at least requiredsize characters long,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004984 if not reallocate and adjust various state variables.
4985 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004986static
Walter Dörwald4894c302003-10-24 14:25:28 +00004987int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004988 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004989{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004990 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004991 if (requiredsize > oldsize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004992 /* remember old output position */
4993 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4994 /* exponentially overallocate to minimize reallocations */
4995 if (requiredsize < 2 * oldsize)
4996 requiredsize = 2 * oldsize;
4997 if (PyUnicode_Resize(outobj, requiredsize) < 0)
4998 return -1;
4999 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005000 }
5001 return 0;
5002}
5003/* lookup the character, put the result in the output string and adjust
5004 various state variables. Return a new reference to the object that
5005 was put in the output buffer in *result, or Py_None, if the mapping was
5006 undefined (in which case no character was written).
5007 The called must decref result.
5008 Return 0 on success, -1 on error. */
5009static
Walter Dörwald4894c302003-10-24 14:25:28 +00005010int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005011 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5012 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005013{
Walter Dörwald4894c302003-10-24 14:25:28 +00005014 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005015 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005016 if (*res==NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005017 /* not found => default to 1:1 mapping */
5018 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005019 }
5020 else if (*res==Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005021 ;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005022 else if (PyInt_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005023 /* no overflow check, because we know that the space is enough */
5024 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005025 }
5026 else if (PyUnicode_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005027 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5028 if (repsize==1) {
5029 /* no overflow check, because we know that the space is enough */
5030 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5031 }
5032 else if (repsize!=0) {
5033 /* more than one character */
5034 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5035 (insize - (curinp-startinp)) +
5036 repsize - 1;
5037 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5038 return -1;
5039 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5040 *outp += repsize;
5041 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005042 }
5043 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005044 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005045 return 0;
5046}
5047
5048PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005049 Py_ssize_t size,
5050 PyObject *mapping,
5051 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005052{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005053 /* output object */
5054 PyObject *res = NULL;
5055 /* pointers to the beginning and end+1 of input */
5056 const Py_UNICODE *startp = p;
5057 const Py_UNICODE *endp = p + size;
5058 /* pointer into the output */
5059 Py_UNICODE *str;
5060 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005061 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005062 char *reason = "character maps to <undefined>";
5063 PyObject *errorHandler = NULL;
5064 PyObject *exc = NULL;
5065 /* the following variable is used for caching string comparisons
5066 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5067 * 3=ignore, 4=xmlcharrefreplace */
5068 int known_errorHandler = -1;
5069
Guido van Rossumd57fd912000-03-10 22:53:23 +00005070 if (mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005071 PyErr_BadArgument();
5072 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005073 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005074
5075 /* allocate enough for a simple 1:1 translation without
5076 replacements, if we need more, we'll resize */
5077 res = PyUnicode_FromUnicode(NULL, size);
5078 if (res == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005079 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005080 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005081 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005082 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005083
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005084 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005085 /* try to encode it */
5086 PyObject *x = NULL;
5087 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5088 Py_XDECREF(x);
5089 goto onError;
5090 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005091 Py_XDECREF(x);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005092 if (x!=Py_None) /* it worked => adjust input pointer */
5093 ++p;
5094 else { /* untranslatable character */
5095 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5096 Py_ssize_t repsize;
5097 Py_ssize_t newpos;
5098 Py_UNICODE *uni2;
5099 /* startpos for collecting untranslatable chars */
5100 const Py_UNICODE *collstart = p;
5101 const Py_UNICODE *collend = p+1;
5102 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005103
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005104 /* find all untranslatable characters */
5105 while (collend < endp) {
5106 if (charmaptranslate_lookup(*collend, mapping, &x))
5107 goto onError;
5108 Py_XDECREF(x);
5109 if (x!=Py_None)
5110 break;
5111 ++collend;
5112 }
5113 /* cache callback name lookup
5114 * (if not done yet, i.e. it's the first error) */
5115 if (known_errorHandler==-1) {
5116 if ((errors==NULL) || (!strcmp(errors, "strict")))
5117 known_errorHandler = 1;
5118 else if (!strcmp(errors, "replace"))
5119 known_errorHandler = 2;
5120 else if (!strcmp(errors, "ignore"))
5121 known_errorHandler = 3;
5122 else if (!strcmp(errors, "xmlcharrefreplace"))
5123 known_errorHandler = 4;
5124 else
5125 known_errorHandler = 0;
5126 }
5127 switch (known_errorHandler) {
5128 case 1: /* strict */
5129 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson857ce152009-01-31 16:29:18 +00005130 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005131 case 2: /* replace */
5132 /* No need to check for space, this is a 1:1 replacement */
5133 for (coll = collstart; coll<collend; ++coll)
5134 *str++ = '?';
5135 /* fall through */
5136 case 3: /* ignore */
5137 p = collend;
5138 break;
5139 case 4: /* xmlcharrefreplace */
5140 /* generate replacement (temporarily (mis)uses p) */
Serhiy Storchakae822b032013-08-06 16:56:26 +03005141 for (p = collstart; p < collend;) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005142 char buffer[2+29+1+1];
5143 char *cp;
Serhiy Storchakae822b032013-08-06 16:56:26 +03005144 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
5145 sprintf(buffer, "&#%d;", (int)ch);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005146 if (charmaptranslate_makespace(&res, &str,
5147 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5148 goto onError;
5149 for (cp = buffer; *cp; ++cp)
5150 *str++ = *cp;
5151 }
5152 p = collend;
5153 break;
5154 default:
5155 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5156 reason, startp, size, &exc,
5157 collstart-startp, collend-startp, &newpos);
5158 if (repunicode == NULL)
5159 goto onError;
5160 /* generate replacement */
5161 repsize = PyUnicode_GET_SIZE(repunicode);
5162 if (charmaptranslate_makespace(&res, &str,
5163 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5164 Py_DECREF(repunicode);
5165 goto onError;
5166 }
5167 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5168 *str++ = *uni2;
5169 p = startp + newpos;
5170 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00005171 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005172 }
5173 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005174 /* Resize if we allocated to much */
5175 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005176 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005177 if (PyUnicode_Resize(&res, respos) < 0)
5178 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005179 }
5180 Py_XDECREF(exc);
5181 Py_XDECREF(errorHandler);
5182 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005183
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005184 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005185 Py_XDECREF(res);
5186 Py_XDECREF(exc);
5187 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005188 return NULL;
5189}
5190
5191PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005192 PyObject *mapping,
5193 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005194{
5195 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005196
Guido van Rossumd57fd912000-03-10 22:53:23 +00005197 str = PyUnicode_FromObject(str);
5198 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005199 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005200 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005201 PyUnicode_GET_SIZE(str),
5202 mapping,
5203 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005204 Py_DECREF(str);
5205 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005206
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005207 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005208 Py_XDECREF(str);
5209 return NULL;
5210}
Tim Petersced69f82003-09-16 20:30:58 +00005211
Guido van Rossum9e896b32000-04-05 20:11:21 +00005212/* --- Decimal Encoder ---------------------------------------------------- */
5213
5214int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005215 Py_ssize_t length,
5216 char *output,
5217 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005218{
5219 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005220 PyObject *errorHandler = NULL;
5221 PyObject *exc = NULL;
5222 const char *encoding = "decimal";
5223 const char *reason = "invalid decimal Unicode string";
5224 /* the following variable is used for caching string comparisons
5225 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5226 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005227
5228 if (output == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005229 PyErr_BadArgument();
5230 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005231 }
5232
5233 p = s;
5234 end = s + length;
5235 while (p < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005236 register Py_UNICODE ch = *p;
5237 int decimal;
5238 PyObject *repunicode;
5239 Py_ssize_t repsize;
5240 Py_ssize_t newpos;
5241 Py_UNICODE *uni2;
5242 Py_UNICODE *collstart;
5243 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005244
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005245 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005246 *output++ = ' ';
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005247 ++p;
5248 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005249 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005250 decimal = Py_UNICODE_TODECIMAL(ch);
5251 if (decimal >= 0) {
5252 *output++ = '0' + decimal;
5253 ++p;
5254 continue;
5255 }
5256 if (0 < ch && ch < 256) {
5257 *output++ = (char)ch;
5258 ++p;
5259 continue;
5260 }
5261 /* All other characters are considered unencodable */
5262 collstart = p;
Victor Stinner975134e2011-11-22 01:54:19 +01005263 for (collend = p+1; collend < end; collend++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005264 if ((0 < *collend && *collend < 256) ||
Victor Stinner975134e2011-11-22 01:54:19 +01005265 Py_UNICODE_ISSPACE(*collend) ||
5266 0 <= Py_UNICODE_TODECIMAL(*collend))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005267 break;
5268 }
5269 /* cache callback name lookup
5270 * (if not done yet, i.e. it's the first error) */
5271 if (known_errorHandler==-1) {
5272 if ((errors==NULL) || (!strcmp(errors, "strict")))
5273 known_errorHandler = 1;
5274 else if (!strcmp(errors, "replace"))
5275 known_errorHandler = 2;
5276 else if (!strcmp(errors, "ignore"))
5277 known_errorHandler = 3;
5278 else if (!strcmp(errors, "xmlcharrefreplace"))
5279 known_errorHandler = 4;
5280 else
5281 known_errorHandler = 0;
5282 }
5283 switch (known_errorHandler) {
5284 case 1: /* strict */
5285 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5286 goto onError;
5287 case 2: /* replace */
5288 for (p = collstart; p < collend; ++p)
5289 *output++ = '?';
5290 /* fall through */
5291 case 3: /* ignore */
5292 p = collend;
5293 break;
5294 case 4: /* xmlcharrefreplace */
5295 /* generate replacement (temporarily (mis)uses p) */
Serhiy Storchakae822b032013-08-06 16:56:26 +03005296 for (p = collstart; p < collend;) {
5297 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
5298 output += sprintf(output, "&#%d;", ch);
5299 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005300 p = collend;
5301 break;
5302 default:
5303 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5304 encoding, reason, s, length, &exc,
5305 collstart-s, collend-s, &newpos);
5306 if (repunicode == NULL)
5307 goto onError;
5308 /* generate replacement */
5309 repsize = PyUnicode_GET_SIZE(repunicode);
5310 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5311 Py_UNICODE ch = *uni2;
5312 if (Py_UNICODE_ISSPACE(ch))
5313 *output++ = ' ';
5314 else {
5315 decimal = Py_UNICODE_TODECIMAL(ch);
5316 if (decimal >= 0)
5317 *output++ = '0' + decimal;
5318 else if (0 < ch && ch < 256)
5319 *output++ = (char)ch;
5320 else {
5321 Py_DECREF(repunicode);
5322 raise_encode_exception(&exc, encoding,
5323 s, length, collstart-s, collend-s, reason);
5324 goto onError;
5325 }
5326 }
5327 }
5328 p = s + newpos;
5329 Py_DECREF(repunicode);
5330 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005331 }
5332 /* 0-terminate the output string */
5333 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005334 Py_XDECREF(exc);
5335 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005336 return 0;
5337
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005338 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005339 Py_XDECREF(exc);
5340 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005341 return -1;
5342}
5343
Guido van Rossumd57fd912000-03-10 22:53:23 +00005344/* --- Helpers ------------------------------------------------------------ */
5345
Eric Smitha9f7d622008-02-17 19:46:49 +00005346#include "stringlib/unicodedefs.h"
Fredrik Lundha50d2012006-05-26 17:04:58 +00005347#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005348
5349#include "stringlib/count.h"
5350#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005351#include "stringlib/partition.h"
Antoine Pitrou64672132010-01-13 07:55:48 +00005352#include "stringlib/split.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005353
/* helper macro to fixup start/end slice values
 *
 * Clamps `end` to at most `len`; a negative `end` or `start` is taken as
 * an offset from `len` and then clamped at 0.  `start` is not clamped
 * against `len`.
 *
 * `start` and `end` must be modifiable lvalues.  All three arguments are
 * evaluated more than once, so pass expressions without side effects.
 * The body is wrapped in do { } while (0) so that an invocation followed
 * by a semicolon is exactly one statement, e.g. in an un-braced if/else.
 */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Fredrik Lundhc8162812006-05-26 19:33:03 +00005368
Martin v. Löwis18e16552006-02-15 17:27:45 +00005369Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005370 PyObject *substr,
5371 Py_ssize_t start,
5372 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005373{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005374 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005375 PyUnicodeObject* str_obj;
5376 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005377
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005378 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5379 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005380 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005381 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5382 if (!sub_obj) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005383 Py_DECREF(str_obj);
5384 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005385 }
Tim Petersced69f82003-09-16 20:30:58 +00005386
Antoine Pitrou64672132010-01-13 07:55:48 +00005387 ADJUST_INDICES(start, end, str_obj->length);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005388 result = stringlib_count(
Antoine Pitrou64672132010-01-13 07:55:48 +00005389 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
5390 PY_SSIZE_T_MAX
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005391 );
5392
5393 Py_DECREF(sub_obj);
5394 Py_DECREF(str_obj);
5395
Guido van Rossumd57fd912000-03-10 22:53:23 +00005396 return result;
5397}
5398
Martin v. Löwis18e16552006-02-15 17:27:45 +00005399Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005400 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005401 Py_ssize_t start,
5402 Py_ssize_t end,
5403 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005404{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005405 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005406
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005407 str = PyUnicode_FromObject(str);
5408 if (!str)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005409 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005410 sub = PyUnicode_FromObject(sub);
5411 if (!sub) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005412 Py_DECREF(str);
5413 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005414 }
Tim Petersced69f82003-09-16 20:30:58 +00005415
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005416 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005417 result = stringlib_find_slice(
5418 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5419 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5420 start, end
5421 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005422 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005423 result = stringlib_rfind_slice(
5424 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5425 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5426 start, end
5427 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005428
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005429 Py_DECREF(str);
5430 Py_DECREF(sub);
5431
Guido van Rossumd57fd912000-03-10 22:53:23 +00005432 return result;
5433}
5434
Tim Petersced69f82003-09-16 20:30:58 +00005435static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005436int tailmatch(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005437 PyUnicodeObject *substring,
5438 Py_ssize_t start,
5439 Py_ssize_t end,
5440 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005441{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005442 if (substring->length == 0)
5443 return 1;
5444
Antoine Pitrou64672132010-01-13 07:55:48 +00005445 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005446 end -= substring->length;
5447 if (end < start)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005448 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005449
5450 if (direction > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005451 if (Py_UNICODE_MATCH(self, end, substring))
5452 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005453 } else {
5454 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005455 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005456 }
5457
5458 return 0;
5459}
5460
Martin v. Löwis18e16552006-02-15 17:27:45 +00005461Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005462 PyObject *substr,
5463 Py_ssize_t start,
5464 Py_ssize_t end,
5465 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005466{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005467 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005468
Guido van Rossumd57fd912000-03-10 22:53:23 +00005469 str = PyUnicode_FromObject(str);
5470 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005471 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005472 substr = PyUnicode_FromObject(substr);
5473 if (substr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005474 Py_DECREF(str);
5475 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005476 }
Tim Petersced69f82003-09-16 20:30:58 +00005477
Guido van Rossumd57fd912000-03-10 22:53:23 +00005478 result = tailmatch((PyUnicodeObject *)str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005479 (PyUnicodeObject *)substr,
5480 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005481 Py_DECREF(str);
5482 Py_DECREF(substr);
5483 return result;
5484}
5485
Guido van Rossumd57fd912000-03-10 22:53:23 +00005486/* Apply fixfct filter to the Unicode object self and return a
5487 reference to the modified object */
5488
Tim Petersced69f82003-09-16 20:30:58 +00005489static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005490PyObject *fixup(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005491 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005492{
5493
5494 PyUnicodeObject *u;
5495
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005496 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005497 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005498 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005499
5500 Py_UNICODE_COPY(u->str, self->str, self->length);
5501
Tim Peters7a29bd52001-09-12 03:03:31 +00005502 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005503 /* fixfct should return TRUE if it modified the buffer. If
5504 FALSE, return a reference to the original buffer instead
5505 (to save space, not time) */
5506 Py_INCREF(self);
5507 Py_DECREF(u);
5508 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005509 }
5510 return (PyObject*) u;
5511}
5512
Tim Petersced69f82003-09-16 20:30:58 +00005513static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005514int fixupper(PyUnicodeObject *self)
5515{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005516 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005517 Py_UNICODE *s = self->str;
5518 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005519
Guido van Rossumd57fd912000-03-10 22:53:23 +00005520 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005521 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005522
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005523 ch = Py_UNICODE_TOUPPER(*s);
5524 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005525 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005526 *s = ch;
5527 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005528 s++;
5529 }
5530
5531 return status;
5532}
5533
Tim Petersced69f82003-09-16 20:30:58 +00005534static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005535int fixlower(PyUnicodeObject *self)
5536{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005537 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005538 Py_UNICODE *s = self->str;
5539 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005540
Guido van Rossumd57fd912000-03-10 22:53:23 +00005541 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005542 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005543
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005544 ch = Py_UNICODE_TOLOWER(*s);
5545 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005546 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005547 *s = ch;
5548 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005549 s++;
5550 }
5551
5552 return status;
5553}
5554
Tim Petersced69f82003-09-16 20:30:58 +00005555static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005556int fixswapcase(PyUnicodeObject *self)
5557{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005558 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005559 Py_UNICODE *s = self->str;
5560 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005561
Guido van Rossumd57fd912000-03-10 22:53:23 +00005562 while (len-- > 0) {
5563 if (Py_UNICODE_ISUPPER(*s)) {
5564 *s = Py_UNICODE_TOLOWER(*s);
5565 status = 1;
5566 } else if (Py_UNICODE_ISLOWER(*s)) {
5567 *s = Py_UNICODE_TOUPPER(*s);
5568 status = 1;
5569 }
5570 s++;
5571 }
5572
5573 return status;
5574}
5575
Tim Petersced69f82003-09-16 20:30:58 +00005576static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005577int fixcapitalize(PyUnicodeObject *self)
5578{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005579 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005580 Py_UNICODE *s = self->str;
5581 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005582
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005583 if (len == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005584 return 0;
Ezio Melotti15d6b652011-08-15 09:22:24 +03005585 if (!Py_UNICODE_ISUPPER(*s)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005586 *s = Py_UNICODE_TOUPPER(*s);
5587 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005588 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005589 s++;
5590 while (--len > 0) {
Ezio Melotti15d6b652011-08-15 09:22:24 +03005591 if (!Py_UNICODE_ISLOWER(*s)) {
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005592 *s = Py_UNICODE_TOLOWER(*s);
5593 status = 1;
5594 }
5595 s++;
5596 }
5597 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005598}
5599
5600static
5601int fixtitle(PyUnicodeObject *self)
5602{
5603 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5604 register Py_UNICODE *e;
5605 int previous_is_cased;
5606
5607 /* Shortcut for single character strings */
5608 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005609 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5610 if (*p != ch) {
5611 *p = ch;
5612 return 1;
5613 }
5614 else
5615 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005616 }
Tim Petersced69f82003-09-16 20:30:58 +00005617
Guido van Rossumd57fd912000-03-10 22:53:23 +00005618 e = p + PyUnicode_GET_SIZE(self);
5619 previous_is_cased = 0;
5620 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005621 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005622
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005623 if (previous_is_cased)
5624 *p = Py_UNICODE_TOLOWER(ch);
5625 else
5626 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005627
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005628 if (Py_UNICODE_ISLOWER(ch) ||
5629 Py_UNICODE_ISUPPER(ch) ||
5630 Py_UNICODE_ISTITLE(ch))
5631 previous_is_cased = 1;
5632 else
5633 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005634 }
5635 return 1;
5636}
5637
Tim Peters8ce9f162004-08-27 01:49:32 +00005638PyObject *
5639PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005640{
Tim Peters8ce9f162004-08-27 01:49:32 +00005641 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005642 const Py_UNICODE blank = ' ';
5643 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00005644 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005645 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00005646 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5647 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005648 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5649 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005650 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005651 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005652 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005653
Benjamin Peterson7a91bf82014-02-15 13:02:52 -05005654 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00005655 if (fseq == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005656 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005657 }
5658
Tim Peters91879ab2004-08-27 22:35:44 +00005659 /* Grrrr. A codec may be invoked to convert str objects to
5660 * Unicode, and so it's possible to call back into Python code
5661 * during PyUnicode_FromObject(), and so it's possible for a sick
5662 * codec to change the size of fseq (if seq is a list). Therefore
5663 * we have to keep refetching the size -- can't assume seqlen
5664 * is invariant.
5665 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005666 seqlen = PySequence_Fast_GET_SIZE(fseq);
5667 /* If empty sequence, return u"". */
5668 if (seqlen == 0) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005669 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5670 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00005671 }
5672 /* If singleton sequence with an exact Unicode, return that. */
5673 if (seqlen == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005674 item = PySequence_Fast_GET_ITEM(fseq, 0);
5675 if (PyUnicode_CheckExact(item)) {
5676 Py_INCREF(item);
5677 res = (PyUnicodeObject *)item;
5678 goto Done;
5679 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005680 }
5681
Tim Peters05eba1f2004-08-27 21:32:02 +00005682 /* At least two items to join, or one that isn't exact Unicode. */
5683 if (seqlen > 1) {
5684 /* Set up sep and seplen -- they're needed. */
Benjamin Peterson857ce152009-01-31 16:29:18 +00005685 if (separator == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005686 sep = &blank;
5687 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005688 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005689 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005690 internal_separator = PyUnicode_FromObject(separator);
5691 if (internal_separator == NULL)
5692 goto onError;
5693 sep = PyUnicode_AS_UNICODE(internal_separator);
5694 seplen = PyUnicode_GET_SIZE(internal_separator);
5695 /* In case PyUnicode_FromObject() mutated seq. */
5696 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005697 }
5698 }
5699
5700 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005701 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005702 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005703 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005704 res_p = PyUnicode_AS_UNICODE(res);
5705 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005706
Tim Peters05eba1f2004-08-27 21:32:02 +00005707 for (i = 0; i < seqlen; ++i) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005708 Py_ssize_t itemlen;
5709 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005710
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005711 item = PySequence_Fast_GET_ITEM(fseq, i);
5712 /* Convert item to Unicode. */
5713 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5714 PyErr_Format(PyExc_TypeError,
5715 "sequence item %zd: expected string or Unicode,"
5716 " %.80s found",
5717 i, Py_TYPE(item)->tp_name);
5718 goto onError;
5719 }
5720 item = PyUnicode_FromObject(item);
5721 if (item == NULL)
5722 goto onError;
5723 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005724
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005725 /* In case PyUnicode_FromObject() mutated seq. */
5726 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters91879ab2004-08-27 22:35:44 +00005727
Tim Peters8ce9f162004-08-27 01:49:32 +00005728 /* Make sure we have enough space for the separator and the item. */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005729 itemlen = PyUnicode_GET_SIZE(item);
5730 new_res_used = res_used + itemlen;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005731 if (new_res_used < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005732 goto Overflow;
5733 if (i < seqlen - 1) {
5734 new_res_used += seplen;
5735 if (new_res_used < 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00005736 goto Overflow;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005737 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005738 if (new_res_used > res_alloc) {
5739 /* double allocated size until it's big enough */
5740 do {
5741 res_alloc += res_alloc;
5742 if (res_alloc <= 0)
5743 goto Overflow;
5744 } while (new_res_used > res_alloc);
5745 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5746 Py_DECREF(item);
5747 goto onError;
5748 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005749 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005750 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005751
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005752 /* Copy item, and maybe the separator. */
5753 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5754 res_p += itemlen;
5755 if (i < seqlen - 1) {
5756 Py_UNICODE_COPY(res_p, sep, seplen);
5757 res_p += seplen;
5758 }
5759 Py_DECREF(item);
5760 res_used = new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005761 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005762
Tim Peters05eba1f2004-08-27 21:32:02 +00005763 /* Shrink res to match the used area; this probably can't fail,
5764 * but it's cheap to check.
5765 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005766 if (_PyUnicode_Resize(&res, res_used) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005767 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005768
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005769 Done:
Tim Peters8ce9f162004-08-27 01:49:32 +00005770 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005771 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005772 return (PyObject *)res;
5773
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005774 Overflow:
Tim Peters8ce9f162004-08-27 01:49:32 +00005775 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00005776 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005777 Py_DECREF(item);
5778 /* fall through */
5779
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005780 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005781 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005782 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005783 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005784 return NULL;
5785}
5786
Tim Petersced69f82003-09-16 20:30:58 +00005787static
5788PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005789 Py_ssize_t left,
5790 Py_ssize_t right,
5791 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005792{
5793 PyUnicodeObject *u;
5794
5795 if (left < 0)
5796 left = 0;
5797 if (right < 0)
5798 right = 0;
5799
Tim Peters7a29bd52001-09-12 03:03:31 +00005800 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005801 Py_INCREF(self);
5802 return self;
5803 }
5804
Neal Norwitze7d8be82008-07-31 17:17:14 +00005805 if (left > PY_SSIZE_T_MAX - self->length ||
5806 right > PY_SSIZE_T_MAX - (left + self->length)) {
5807 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5808 return NULL;
5809 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005810 u = _PyUnicode_New(left + self->length + right);
5811 if (u) {
5812 if (left)
5813 Py_UNICODE_FILL(u->str, fill, left);
5814 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5815 if (right)
5816 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5817 }
5818
5819 return u;
5820}
5821
Antoine Pitrou64672132010-01-13 07:55:48 +00005822PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005823{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005824 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005825
5826 string = PyUnicode_FromObject(string);
5827 if (string == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005828 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005829
Antoine Pitrou64672132010-01-13 07:55:48 +00005830 list = stringlib_splitlines(
5831 (PyObject*) string, PyUnicode_AS_UNICODE(string),
5832 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005833
5834 Py_DECREF(string);
5835 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005836}
5837
Tim Petersced69f82003-09-16 20:30:58 +00005838static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005839PyObject *split(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005840 PyUnicodeObject *substring,
5841 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005842{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005843 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005844 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005845
Guido van Rossumd57fd912000-03-10 22:53:23 +00005846 if (substring == NULL)
Antoine Pitrou64672132010-01-13 07:55:48 +00005847 return stringlib_split_whitespace(
5848 (PyObject*) self, self->str, self->length, maxcount
5849 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005850
Antoine Pitrou64672132010-01-13 07:55:48 +00005851 return stringlib_split(
5852 (PyObject*) self, self->str, self->length,
5853 substring->str, substring->length,
5854 maxcount
5855 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005856}
5857
Tim Petersced69f82003-09-16 20:30:58 +00005858static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005859PyObject *rsplit(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005860 PyUnicodeObject *substring,
5861 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005862{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005863 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005864 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005865
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005866 if (substring == NULL)
Antoine Pitrou64672132010-01-13 07:55:48 +00005867 return stringlib_rsplit_whitespace(
5868 (PyObject*) self, self->str, self->length, maxcount
5869 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005870
Antoine Pitrou64672132010-01-13 07:55:48 +00005871 return stringlib_rsplit(
5872 (PyObject*) self, self->str, self->length,
5873 substring->str, substring->length,
5874 maxcount
5875 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005876}
5877
5878static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005879PyObject *replace(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005880 PyUnicodeObject *str1,
5881 PyUnicodeObject *str2,
5882 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005883{
5884 PyUnicodeObject *u;
5885
5886 if (maxcount < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005887 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrou64672132010-01-13 07:55:48 +00005888 else if (maxcount == 0 || self->length == 0)
5889 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005890
Fredrik Lundh347ee272006-05-24 16:35:18 +00005891 if (str1->length == str2->length) {
Antoine Pitrou5c767c22010-01-13 08:55:20 +00005892 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005893 /* same length */
Antoine Pitrou64672132010-01-13 07:55:48 +00005894 if (str1->length == 0)
5895 goto nothing;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005896 if (str1->length == 1) {
5897 /* replace characters */
5898 Py_UNICODE u1, u2;
5899 if (!findchar(self->str, self->length, str1->str[0]))
5900 goto nothing;
5901 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5902 if (!u)
5903 return NULL;
5904 Py_UNICODE_COPY(u->str, self->str, self->length);
5905 u1 = str1->str[0];
5906 u2 = str2->str[0];
5907 for (i = 0; i < u->length; i++)
5908 if (u->str[i] == u1) {
5909 if (--maxcount < 0)
5910 break;
5911 u->str[i] = u2;
5912 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005913 } else {
Antoine Pitrou64672132010-01-13 07:55:48 +00005914 i = stringlib_find(
5915 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005917 if (i < 0)
5918 goto nothing;
5919 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5920 if (!u)
5921 return NULL;
5922 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrou64672132010-01-13 07:55:48 +00005923
5924 /* change everything in-place, starting with this one */
5925 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5926 i += str1->length;
5927
5928 while ( --maxcount > 0) {
5929 i = stringlib_find(self->str+i, self->length-i,
5930 str1->str, str1->length,
5931 i);
5932 if (i == -1)
5933 break;
5934 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5935 i += str1->length;
5936 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005937 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005938 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005939
Brett Cannona7f13ee2010-05-04 01:16:51 +00005940 Py_ssize_t n, i, j;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005941 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005942 Py_UNICODE *p;
5943
5944 /* replace strings */
Antoine Pitrou64672132010-01-13 07:55:48 +00005945 n = stringlib_count(self->str, self->length, str1->str, str1->length,
5946 maxcount);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005947 if (n == 0)
5948 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005949 /* new_size = self->length + n * (str2->length - str1->length)); */
5950 delta = (str2->length - str1->length);
5951 if (delta == 0) {
5952 new_size = self->length;
5953 } else {
5954 product = n * (str2->length - str1->length);
5955 if ((product / (str2->length - str1->length)) != n) {
5956 PyErr_SetString(PyExc_OverflowError,
5957 "replace string is too long");
5958 return NULL;
5959 }
5960 new_size = self->length + product;
5961 if (new_size < 0) {
5962 PyErr_SetString(PyExc_OverflowError,
5963 "replace string is too long");
5964 return NULL;
5965 }
5966 }
5967 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005968 if (!u)
5969 return NULL;
5970 i = 0;
5971 p = u->str;
5972 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005973 while (n-- > 0) {
5974 /* look for next match */
Antoine Pitrou64672132010-01-13 07:55:48 +00005975 j = stringlib_find(self->str+i, self->length-i,
5976 str1->str, str1->length,
5977 i);
5978 if (j == -1)
5979 break;
5980 else if (j > i) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005981 /* copy unchanged part [i:j] */
5982 Py_UNICODE_COPY(p, self->str+i, j-i);
5983 p += j - i;
5984 }
5985 /* copy substitution string */
5986 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005987 Py_UNICODE_COPY(p, str2->str, str2->length);
5988 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005989 }
5990 i = j + str1->length;
5991 }
5992 if (i < self->length)
5993 /* copy tail [i:] */
5994 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005995 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005996 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00005997 while (n > 0) {
5998 Py_UNICODE_COPY(p, str2->str, str2->length);
5999 p += str2->length;
6000 if (--n <= 0)
6001 break;
6002 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006003 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00006004 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006005 }
6006 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006007 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00006008
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006009 nothing:
Fredrik Lundh347ee272006-05-24 16:35:18 +00006010 /* nothing to replace; return original string (when possible) */
6011 if (PyUnicode_CheckExact(self)) {
6012 Py_INCREF(self);
6013 return (PyObject *) self;
6014 }
6015 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006016}
6017
6018/* --- Unicode Object Methods --------------------------------------------- */
6019
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006020PyDoc_STRVAR(title__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006021 "S.title() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006022\n\
6023Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006024characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006025
6026static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006027unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006029 return fixup(self, fixtitle);
6030}
6031
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006032PyDoc_STRVAR(capitalize__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006033 "S.capitalize() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006034\n\
6035Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumaran5261b102010-07-05 12:04:07 +00006036have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006037
6038static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006039unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006040{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041 return fixup(self, fixcapitalize);
6042}
6043
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> unicode\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled code: split self on whitespace, run each word through
   fixup(..., fixcapitalize), then join the words again with
   PyUnicode_Join(NULL, ...).  Returns NULL if any step fails. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list slot */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *word = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                               fixcapitalize);
        if (word == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, word);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
6081
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006082/* Argument converter. Coerces to a single unicode character */
6083
6084static int
6085convert_uc(PyObject *obj, void *addr)
6086{
Benjamin Peterson857ce152009-01-31 16:29:18 +00006087 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6088 PyObject *uniobj;
6089 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006090
Benjamin Peterson857ce152009-01-31 16:29:18 +00006091 uniobj = PyUnicode_FromObject(obj);
6092 if (uniobj == NULL) {
6093 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006094 "The fill character cannot be converted to Unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00006095 return 0;
6096 }
6097 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6098 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006099 "The fill character must be exactly one character long");
Benjamin Peterson857ce152009-01-31 16:29:18 +00006100 Py_DECREF(uniobj);
6101 return 0;
6102 }
6103 unistr = PyUnicode_AS_UNICODE(uniobj);
6104 *fillcharloc = unistr[0];
6105 Py_DECREF(uniobj);
6106 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006107}
6108
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006109PyDoc_STRVAR(center__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006110 "S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006111\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006112Return S centered in a Unicode string of length width. Padding is\n\
6113done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006114
6115static PyObject *
6116unicode_center(PyUnicodeObject *self, PyObject *args)
6117{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006118 Py_ssize_t marg, left;
6119 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006120 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006121
Thomas Woutersde017742006-02-16 19:34:37 +00006122 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006123 return NULL;
6124
Tim Peters7a29bd52001-09-12 03:03:31 +00006125 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006126 Py_INCREF(self);
6127 return (PyObject*) self;
6128 }
6129
6130 marg = width - self->length;
6131 left = marg / 2 + (marg & width & 1);
6132
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006133 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006134}
6135
Marc-André Lemburge5034372000-08-08 08:04:29 +00006136#if 0
6137
6138/* This code should go into some future Unicode collation support
6139 module. The basic comparison should compare ordinals on a naive
Georg Brandl18187e22009-06-06 18:21:58 +00006140 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006141
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006142/* speedy UTF-16 code point order comparison */
6143/* gleaned from: */
6144/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6145
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006146static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006147{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006148 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006149 0, 0, 0, 0, 0, 0, 0, 0,
6150 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006151 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006152};
6153
Guido van Rossumd57fd912000-03-10 22:53:23 +00006154static int
6155unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6156{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006157 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006158
Guido van Rossumd57fd912000-03-10 22:53:23 +00006159 Py_UNICODE *s1 = str1->str;
6160 Py_UNICODE *s2 = str2->str;
6161
6162 len1 = str1->length;
6163 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006164
Guido van Rossumd57fd912000-03-10 22:53:23 +00006165 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006166 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006167
6168 c1 = *s1++;
6169 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006170
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006171 if (c1 > (1<<11) * 26)
6172 c1 += utf16Fixup[c1>>11];
6173 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006174 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006175 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006176
6177 if (c1 != c2)
6178 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006179
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006180 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006181 }
6182
6183 return (len1 < len2) ? -1 : (len1 != len2);
6184}
6185
Marc-André Lemburge5034372000-08-08 08:04:29 +00006186#else
6187
6188static int
6189unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6190{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006191 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006192
6193 Py_UNICODE *s1 = str1->str;
6194 Py_UNICODE *s2 = str2->str;
6195
6196 len1 = str1->length;
6197 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006198
Marc-André Lemburge5034372000-08-08 08:04:29 +00006199 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006200 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006201
Fredrik Lundh45714e92001-06-26 16:39:36 +00006202 c1 = *s1++;
6203 c2 = *s2++;
6204
6205 if (c1 != c2)
6206 return (c1 < c2) ? -1 : 1;
6207
Marc-André Lemburge5034372000-08-08 08:04:29 +00006208 len1--; len2--;
6209 }
6210
6211 return (len1 < len2) ? -1 : (len1 != len2);
6212}
6213
6214#endif
6215
Guido van Rossumd57fd912000-03-10 22:53:23 +00006216int PyUnicode_Compare(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006217 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006218{
6219 PyUnicodeObject *u = NULL, *v = NULL;
6220 int result;
6221
6222 /* Coerce the two arguments */
6223 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6224 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006225 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006226 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6227 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006228 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006229
Thomas Wouters7e474022000-07-16 12:04:32 +00006230 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006231 if (v == u) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006232 Py_DECREF(u);
6233 Py_DECREF(v);
6234 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006235 }
6236
6237 result = unicode_compare(u, v);
6238
6239 Py_DECREF(u);
6240 Py_DECREF(v);
6241 return result;
6242
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006243 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006244 Py_XDECREF(u);
6245 Py_XDECREF(v);
6246 return -1;
6247}
6248
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006249PyObject *PyUnicode_RichCompare(PyObject *left,
6250 PyObject *right,
6251 int op)
6252{
6253 int result;
6254
6255 result = PyUnicode_Compare(left, right);
6256 if (result == -1 && PyErr_Occurred())
6257 goto onError;
6258
6259 /* Convert the return value to a Boolean */
6260 switch (op) {
6261 case Py_EQ:
6262 result = (result == 0);
6263 break;
6264 case Py_NE:
6265 result = (result != 0);
6266 break;
6267 case Py_LE:
6268 result = (result <= 0);
6269 break;
6270 case Py_GE:
6271 result = (result >= 0);
6272 break;
6273 case Py_LT:
6274 result = (result == -1);
6275 break;
6276 case Py_GT:
6277 result = (result == 1);
6278 break;
6279 }
6280 return PyBool_FromLong(result);
6281
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006282 onError:
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006283
6284 /* Standard case
6285
6286 Type errors mean that PyUnicode_FromObject() could not convert
6287 one of the arguments (usually the right hand side) to Unicode,
6288 ie. we can't handle the comparison request. However, it is
6289 possible that the other object knows a comparison method, which
6290 is why we return Py_NotImplemented to give the other object a
6291 chance.
6292
6293 */
6294 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6295 PyErr_Clear();
6296 Py_INCREF(Py_NotImplemented);
6297 return Py_NotImplemented;
6298 }
6299 if (op != Py_EQ && op != Py_NE)
6300 return NULL;
6301
6302 /* Equality comparison.
6303
6304 This is a special case: we silence any PyExc_UnicodeDecodeError
6305 and instead turn it into a PyErr_UnicodeWarning.
6306
6307 */
6308 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6309 return NULL;
6310 PyErr_Clear();
Benjamin Peterson857ce152009-01-31 16:29:18 +00006311 if (PyErr_Warn(PyExc_UnicodeWarning,
6312 (op == Py_EQ) ?
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006313 "Unicode equal comparison "
6314 "failed to convert both arguments to Unicode - "
6315 "interpreting them as being unequal" :
6316 "Unicode unequal comparison "
6317 "failed to convert both arguments to Unicode - "
6318 "interpreting them as being unequal"
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006319 ) < 0)
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006320 return NULL;
6321 result = (op == Py_NE);
6322 return PyBool_FromLong(result);
6323}
6324
Guido van Rossum403d68b2000-03-13 15:55:09 +00006325int PyUnicode_Contains(PyObject *container,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006326 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006327{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006328 PyObject *str, *sub;
6329 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006330
6331 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006332 sub = PyUnicode_FromObject(element);
6333 if (!sub) {
Fredrik Lundh833bf942006-05-23 10:12:21 +00006334 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006335 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00006336
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006337 str = PyUnicode_FromObject(container);
6338 if (!str) {
6339 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00006340 return -1;
6341 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00006342
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006343 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00006344
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006345 Py_DECREF(str);
6346 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00006347
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006348 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006349}
6350
Guido van Rossumd57fd912000-03-10 22:53:23 +00006351/* Concat to string or Unicode object giving a new Unicode object. */
6352
6353PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006354 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006355{
6356 PyUnicodeObject *u = NULL, *v = NULL, *w;
6357
6358 /* Coerce the two arguments */
6359 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6360 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006361 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006362 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6363 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006364 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006365
6366 /* Shortcuts */
6367 if (v == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006368 Py_DECREF(v);
6369 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006370 }
6371 if (u == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006372 Py_DECREF(u);
6373 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006374 }
6375
6376 /* Concat the two Unicode strings */
6377 w = _PyUnicode_New(u->length + v->length);
6378 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006379 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006380 Py_UNICODE_COPY(w->str, u->str, u->length);
6381 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6382
6383 Py_DECREF(u);
6384 Py_DECREF(v);
6385 return (PyObject *)w;
6386
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006387 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006388 Py_XDECREF(u);
6389 Py_XDECREF(v);
6390 return NULL;
6391}
6392
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006393PyDoc_STRVAR(count__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006394 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006395\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00006396Return the number of non-overlapping occurrences of substring sub in\n\
6397Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006398interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006399
6400static PyObject *
6401unicode_count(PyUnicodeObject *self, PyObject *args)
6402{
6403 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006404 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006405 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006406 PyObject *result;
6407
Jesus Cea44e81682011-04-20 16:39:15 +02006408 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
6409 &start, &end))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006410 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006411
Antoine Pitrou64672132010-01-13 07:55:48 +00006412 ADJUST_INDICES(start, end, self->length);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006413 result = PyInt_FromSsize_t(
6414 stringlib_count(self->str + start, end - start,
Antoine Pitrou64672132010-01-13 07:55:48 +00006415 substring->str, substring->length,
6416 PY_SSIZE_T_MAX)
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006417 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006418
6419 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006420
Guido van Rossumd57fd912000-03-10 22:53:23 +00006421 return result;
6422}
6423
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006424PyDoc_STRVAR(encode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006425 "S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006426\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006427Encodes S using the codec registered for encoding. encoding defaults\n\
6428to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006429handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006430a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6431'xmlcharrefreplace' as well as any other name registered with\n\
6432codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006433
6434static PyObject *
Benjamin Peterson332d7212009-09-18 21:14:55 +00006435unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006436{
Benjamin Peterson332d7212009-09-18 21:14:55 +00006437 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00006438 char *encoding = NULL;
6439 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006440 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006441
Benjamin Peterson332d7212009-09-18 21:14:55 +00006442 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
6443 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006444 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006445 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006446 if (v == NULL)
6447 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006448 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006449 PyErr_Format(PyExc_TypeError,
6450 "encoder did not return a string/unicode object "
6451 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006452 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006453 Py_DECREF(v);
6454 return NULL;
6455 }
6456 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006457
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006458 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006459 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006460}
6461
6462PyDoc_STRVAR(decode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006463 "S.decode([encoding[,errors]]) -> string or unicode\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006464\n\
6465Decodes S using the codec registered for encoding. encoding defaults\n\
6466to the default encoding. errors may be given to set a different error\n\
6467handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6468a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
Éric Araujoa4c81b02012-02-20 02:07:31 +01006469as well as any other name registered with codecs.register_error that is\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006470able to handle UnicodeDecodeErrors.");
6471
6472static PyObject *
Benjamin Peterson332d7212009-09-18 21:14:55 +00006473unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006474{
Benjamin Peterson332d7212009-09-18 21:14:55 +00006475 static char *kwlist[] = {"encoding", "errors", 0};
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006476 char *encoding = NULL;
6477 char *errors = NULL;
6478 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006479
Benjamin Peterson332d7212009-09-18 21:14:55 +00006480 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",
6481 kwlist, &encoding, &errors))
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006482 return NULL;
6483 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006484 if (v == NULL)
6485 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006486 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006487 PyErr_Format(PyExc_TypeError,
6488 "decoder did not return a string/unicode object "
6489 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006490 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006491 Py_DECREF(v);
6492 return NULL;
6493 }
6494 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006495
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006496 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006497 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006498}
6499
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006500PyDoc_STRVAR(expandtabs__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006501 "S.expandtabs([tabsize]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006502\n\
6503Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006504If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006505
6506static PyObject*
6507unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6508{
6509 Py_UNICODE *e;
6510 Py_UNICODE *p;
6511 Py_UNICODE *q;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006512 Py_UNICODE *qe;
6513 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006514 PyUnicodeObject *u;
6515 int tabsize = 8;
6516
6517 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006518 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006519
Thomas Wouters7e474022000-07-16 12:04:32 +00006520 /* First pass: determine size of output string */
Guido van Rossum5bdff602008-03-11 21:18:06 +00006521 i = 0; /* chars up to and including most recent \n or \r */
6522 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6523 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006524 for (p = self->str; p < e; p++)
6525 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006526 if (tabsize > 0) {
6527 incr = tabsize - (j % tabsize); /* cannot overflow */
6528 if (j > PY_SSIZE_T_MAX - incr)
6529 goto overflow1;
6530 j += incr;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006531 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006532 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006533 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006534 if (j > PY_SSIZE_T_MAX - 1)
6535 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006536 j++;
6537 if (*p == '\n' || *p == '\r') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006538 if (i > PY_SSIZE_T_MAX - j)
6539 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006540 i += j;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006541 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006542 }
6543 }
6544
Guido van Rossum5bdff602008-03-11 21:18:06 +00006545 if (i > PY_SSIZE_T_MAX - j)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006546 goto overflow1;
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006547
Guido van Rossumd57fd912000-03-10 22:53:23 +00006548 /* Second pass: create output string and fill it */
6549 u = _PyUnicode_New(i + j);
6550 if (!u)
6551 return NULL;
6552
Guido van Rossum5bdff602008-03-11 21:18:06 +00006553 j = 0; /* same as in first pass */
6554 q = u->str; /* next output char */
6555 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006556
6557 for (p = self->str; p < e; p++)
6558 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006559 if (tabsize > 0) {
6560 i = tabsize - (j % tabsize);
6561 j += i;
6562 while (i--) {
6563 if (q >= qe)
6564 goto overflow2;
6565 *q++ = ' ';
Guido van Rossum5bdff602008-03-11 21:18:06 +00006566 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006567 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00006568 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006569 else {
6570 if (q >= qe)
6571 goto overflow2;
6572 *q++ = *p;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006573 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006574 if (*p == '\n' || *p == '\r')
6575 j = 0;
6576 }
6577
6578 return (PyObject*) u;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006579
6580 overflow2:
6581 Py_DECREF(u);
6582 overflow1:
6583 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6584 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006585}
6586
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006587PyDoc_STRVAR(find__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006588 "S.find(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006589\n\
6590Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran5e3a19d2011-07-27 23:36:51 +08006591such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006592arguments start and end are interpreted as in slice notation.\n\
6593\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006594Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006595
6596static PyObject *
6597unicode_find(PyUnicodeObject *self, PyObject *args)
6598{
Jesus Cea44e81682011-04-20 16:39:15 +02006599 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006600 Py_ssize_t start;
6601 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006602 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006603
Jesus Cea44e81682011-04-20 16:39:15 +02006604 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
6605 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006606 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006607
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006608 result = stringlib_find_slice(
6609 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6610 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6611 start, end
6612 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006613
6614 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006615
6616 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006617}
6618
6619static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006620unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006621{
6622 if (index < 0 || index >= self->length) {
6623 PyErr_SetString(PyExc_IndexError, "string index out of range");
6624 return NULL;
6625 }
6626
6627 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6628}
6629
6630static long
6631unicode_hash(PyUnicodeObject *self)
6632{
Fredrik Lundhdde61642000-07-10 18:27:47 +00006633 /* Since Unicode objects compare equal to their ASCII string
6634 counterparts, they should use the individual character values
6635 as basis for their hash value. This is needed to assure that
6636 strings and Unicode objects behave in the same way as
6637 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006638
Martin v. Löwis18e16552006-02-15 17:27:45 +00006639 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006640 register Py_UNICODE *p;
6641 register long x;
6642
Benjamin Petersonf51c3842012-04-09 14:53:07 -04006643#ifdef Py_DEBUG
Benjamin Peterson26da9202012-02-21 11:08:50 -05006644 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf51c3842012-04-09 14:53:07 -04006645#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006646 if (self->hash != -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006647 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006648 len = PyUnicode_GET_SIZE(self);
Barry Warsaw1e13eb02012-02-20 20:42:21 -05006649 /*
6650 We make the hash of the empty string be 0, rather than using
6651 (prefix ^ suffix), since this slightly obfuscates the hash secret
6652 */
6653 if (len == 0) {
6654 self->hash = 0;
6655 return 0;
6656 }
Fredrik Lundhdde61642000-07-10 18:27:47 +00006657 p = PyUnicode_AS_UNICODE(self);
Barry Warsaw1e13eb02012-02-20 20:42:21 -05006658 x = _Py_HashSecret.prefix;
6659 x ^= *p << 7;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006660 while (--len >= 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006661 x = (1000003*x) ^ *p++;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006662 x ^= PyUnicode_GET_SIZE(self);
Barry Warsaw1e13eb02012-02-20 20:42:21 -05006663 x ^= _Py_HashSecret.suffix;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006664 if (x == -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006665 x = -2;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006666 self->hash = x;
6667 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006668}
6669
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006670PyDoc_STRVAR(index__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006671 "S.index(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006672\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006673Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006674
6675static PyObject *
6676unicode_index(PyUnicodeObject *self, PyObject *args)
6677{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006678 Py_ssize_t result;
Jesus Cea44e81682011-04-20 16:39:15 +02006679 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006680 Py_ssize_t start;
6681 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006682
Jesus Cea44e81682011-04-20 16:39:15 +02006683 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
6684 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006685 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006686
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006687 result = stringlib_find_slice(
6688 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6689 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6690 start, end
6691 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006692
6693 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006694
Guido van Rossumd57fd912000-03-10 22:53:23 +00006695 if (result < 0) {
6696 PyErr_SetString(PyExc_ValueError, "substring not found");
6697 return NULL;
6698 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006699
Martin v. Löwis18e16552006-02-15 17:27:45 +00006700 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006701}
6702
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006703PyDoc_STRVAR(islower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006704 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006705\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006706Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006707at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006708
6709static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006710unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006711{
6712 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6713 register const Py_UNICODE *e;
6714 int cased;
6715
Guido van Rossumd57fd912000-03-10 22:53:23 +00006716 /* Shortcut for single character strings */
6717 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006718 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006719
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006720 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006721 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006722 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006723
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724 e = p + PyUnicode_GET_SIZE(self);
6725 cased = 0;
6726 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006727 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006728
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006729 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6730 return PyBool_FromLong(0);
6731 else if (!cased && Py_UNICODE_ISLOWER(ch))
6732 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006733 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006734 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006735}
6736
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006737PyDoc_STRVAR(isupper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006738 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006739\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006740Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006741at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006742
6743static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006744unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006745{
6746 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6747 register const Py_UNICODE *e;
6748 int cased;
6749
Guido van Rossumd57fd912000-03-10 22:53:23 +00006750 /* Shortcut for single character strings */
6751 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006752 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006753
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006754 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006755 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006756 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006757
Guido van Rossumd57fd912000-03-10 22:53:23 +00006758 e = p + PyUnicode_GET_SIZE(self);
6759 cased = 0;
6760 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006761 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006762
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006763 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6764 return PyBool_FromLong(0);
6765 else if (!cased && Py_UNICODE_ISUPPER(ch))
6766 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006767 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006768 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006769}
6770
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006771PyDoc_STRVAR(istitle__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006772 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006773\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006774Return True if S is a titlecased string and there is at least one\n\
6775character in S, i.e. upper- and titlecase characters may only\n\
6776follow uncased characters and lowercase characters only cased ones.\n\
6777Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006778
6779static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006780unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006781{
6782 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6783 register const Py_UNICODE *e;
6784 int cased, previous_is_cased;
6785
Guido van Rossumd57fd912000-03-10 22:53:23 +00006786 /* Shortcut for single character strings */
6787 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006788 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6789 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006790
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006791 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006792 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006793 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006794
Guido van Rossumd57fd912000-03-10 22:53:23 +00006795 e = p + PyUnicode_GET_SIZE(self);
6796 cased = 0;
6797 previous_is_cased = 0;
6798 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006799 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006800
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006801 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6802 if (previous_is_cased)
6803 return PyBool_FromLong(0);
6804 previous_is_cased = 1;
6805 cased = 1;
6806 }
6807 else if (Py_UNICODE_ISLOWER(ch)) {
6808 if (!previous_is_cased)
6809 return PyBool_FromLong(0);
6810 previous_is_cased = 1;
6811 cased = 1;
6812 }
6813 else
6814 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006815 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006816 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006817}
6818
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006819PyDoc_STRVAR(isspace__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006820 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006821\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006822Return True if all characters in S are whitespace\n\
6823and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006824
6825static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006826unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006827{
6828 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6829 register const Py_UNICODE *e;
6830
Guido van Rossumd57fd912000-03-10 22:53:23 +00006831 /* Shortcut for single character strings */
6832 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006833 Py_UNICODE_ISSPACE(*p))
6834 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006835
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006836 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006837 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006838 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006839
Guido van Rossumd57fd912000-03-10 22:53:23 +00006840 e = p + PyUnicode_GET_SIZE(self);
6841 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006842 if (!Py_UNICODE_ISSPACE(*p))
6843 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006844 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006845 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006846}
6847
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006848PyDoc_STRVAR(isalpha__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006849 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006850\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006851Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006852and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006853
6854static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006855unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006856{
6857 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6858 register const Py_UNICODE *e;
6859
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006860 /* Shortcut for single character strings */
6861 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006862 Py_UNICODE_ISALPHA(*p))
6863 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006864
6865 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006866 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006867 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006868
6869 e = p + PyUnicode_GET_SIZE(self);
6870 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006871 if (!Py_UNICODE_ISALPHA(*p))
6872 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006873 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006874 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006875}
6876
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006877PyDoc_STRVAR(isalnum__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006878 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006879\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006880Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006881and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006882
6883static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006884unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006885{
6886 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6887 register const Py_UNICODE *e;
6888
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006889 /* Shortcut for single character strings */
6890 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006891 Py_UNICODE_ISALNUM(*p))
6892 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006893
6894 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006895 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006896 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006897
6898 e = p + PyUnicode_GET_SIZE(self);
6899 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006900 if (!Py_UNICODE_ISALNUM(*p))
6901 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006902 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006903 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006904}
6905
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006906PyDoc_STRVAR(isdecimal__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006907 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006908\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006909Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006910False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006911
6912static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006913unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006914{
6915 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6916 register const Py_UNICODE *e;
6917
Guido van Rossumd57fd912000-03-10 22:53:23 +00006918 /* Shortcut for single character strings */
6919 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006920 Py_UNICODE_ISDECIMAL(*p))
6921 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006922
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006923 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006924 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006925 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006926
Guido van Rossumd57fd912000-03-10 22:53:23 +00006927 e = p + PyUnicode_GET_SIZE(self);
6928 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006929 if (!Py_UNICODE_ISDECIMAL(*p))
6930 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006931 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006932 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006933}
6934
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006935PyDoc_STRVAR(isdigit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006936 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006937\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006938Return True if all characters in S are digits\n\
6939and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006940
6941static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006942unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006943{
6944 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6945 register const Py_UNICODE *e;
6946
Guido van Rossumd57fd912000-03-10 22:53:23 +00006947 /* Shortcut for single character strings */
6948 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006949 Py_UNICODE_ISDIGIT(*p))
6950 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006951
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006952 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006953 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006954 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006955
Guido van Rossumd57fd912000-03-10 22:53:23 +00006956 e = p + PyUnicode_GET_SIZE(self);
6957 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006958 if (!Py_UNICODE_ISDIGIT(*p))
6959 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006960 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006961 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006962}
6963
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006964PyDoc_STRVAR(isnumeric__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006965 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006966\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006967Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006968False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006969
6970static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006971unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006972{
6973 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6974 register const Py_UNICODE *e;
6975
Guido van Rossumd57fd912000-03-10 22:53:23 +00006976 /* Shortcut for single character strings */
6977 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006978 Py_UNICODE_ISNUMERIC(*p))
6979 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006980
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006981 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006982 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006983 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006984
Guido van Rossumd57fd912000-03-10 22:53:23 +00006985 e = p + PyUnicode_GET_SIZE(self);
6986 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006987 if (!Py_UNICODE_ISNUMERIC(*p))
6988 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006989 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006990 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006991}
6992
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006993PyDoc_STRVAR(join__doc__,
Georg Brandl9b4e5822009-10-14 18:48:32 +00006994 "S.join(iterable) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006995\n\
6996Return a string which is the concatenation of the strings in the\n\
Georg Brandl9b4e5822009-10-14 18:48:32 +00006997iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006998
6999static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007000unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007001{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007002 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007003}
7004
Martin v. Löwis18e16552006-02-15 17:27:45 +00007005static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007006unicode_length(PyUnicodeObject *self)
7007{
7008 return self->length;
7009}
7010
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007011PyDoc_STRVAR(ljust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007012 "S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007013\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007014Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007015done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007016
7017static PyObject *
7018unicode_ljust(PyUnicodeObject *self, PyObject *args)
7019{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007020 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007021 Py_UNICODE fillchar = ' ';
7022
Martin v. Löwis412fb672006-04-13 06:34:32 +00007023 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007024 return NULL;
7025
Tim Peters7a29bd52001-09-12 03:03:31 +00007026 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007027 Py_INCREF(self);
7028 return (PyObject*) self;
7029 }
7030
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007031 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007032}
7033
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007034PyDoc_STRVAR(lower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007035 "S.lower() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007036\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007037Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007038
7039static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007040unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007041{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007042 return fixup(self, fixlower);
7043}
7044
/* Strip modes; the values double as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by strip mode. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string with its leading
   "|O:" (three characters) skipped. */
#define STRIPNAME(i) (&stripformat[i][3])
7053
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007054/* externally visible for str.strip(unicode) */
7055PyObject *
7056_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7057{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007058 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7059 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7060 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7061 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7062 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007063
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007064 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007065
Benjamin Peterson857ce152009-01-31 16:29:18 +00007066 i = 0;
7067 if (striptype != RIGHTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007068 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7069 i++;
7070 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00007071 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007072
Benjamin Peterson857ce152009-01-31 16:29:18 +00007073 j = len;
7074 if (striptype != LEFTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007075 do {
7076 j--;
7077 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7078 j++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007079 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007080
Benjamin Peterson857ce152009-01-31 16:29:18 +00007081 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007082 Py_INCREF(self);
7083 return (PyObject*)self;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007084 }
7085 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007086 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007087}
7088
Guido van Rossumd57fd912000-03-10 22:53:23 +00007089
7090static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007091do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007092{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007093 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7094 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007095
Benjamin Peterson857ce152009-01-31 16:29:18 +00007096 i = 0;
7097 if (striptype != RIGHTSTRIP) {
7098 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7099 i++;
7100 }
7101 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007102
Benjamin Peterson857ce152009-01-31 16:29:18 +00007103 j = len;
7104 if (striptype != LEFTSTRIP) {
7105 do {
7106 j--;
7107 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7108 j++;
7109 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007110
Benjamin Peterson857ce152009-01-31 16:29:18 +00007111 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7112 Py_INCREF(self);
7113 return (PyObject*)self;
7114 }
7115 else
7116 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007117}
7118
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007119
7120static PyObject *
7121do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7122{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007123 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007124
Benjamin Peterson857ce152009-01-31 16:29:18 +00007125 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7126 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007127
Benjamin Peterson857ce152009-01-31 16:29:18 +00007128 if (sep != NULL && sep != Py_None) {
7129 if (PyUnicode_Check(sep))
7130 return _PyUnicode_XStrip(self, striptype, sep);
7131 else if (PyString_Check(sep)) {
7132 PyObject *res;
7133 sep = PyUnicode_FromObject(sep);
7134 if (sep==NULL)
7135 return NULL;
7136 res = _PyUnicode_XStrip(self, striptype, sep);
7137 Py_DECREF(sep);
7138 return res;
7139 }
7140 else {
7141 PyErr_Format(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007142 "%s arg must be None, unicode or str",
7143 STRIPNAME(striptype));
Benjamin Peterson857ce152009-01-31 16:29:18 +00007144 return NULL;
7145 }
7146 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007147
Benjamin Peterson857ce152009-01-31 16:29:18 +00007148 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007149}
7150
7151
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007152PyDoc_STRVAR(strip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007153 "S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007154\n\
7155Return a copy of the string S with leading and trailing\n\
7156whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007157If chars is given and not None, remove characters in chars instead.\n\
7158If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007159
7160static PyObject *
7161unicode_strip(PyUnicodeObject *self, PyObject *args)
7162{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007163 if (PyTuple_GET_SIZE(args) == 0)
7164 return do_strip(self, BOTHSTRIP); /* Common case */
7165 else
7166 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007167}
7168
7169
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007170PyDoc_STRVAR(lstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007171 "S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007172\n\
7173Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007174If chars is given and not None, remove characters in chars instead.\n\
7175If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007176
7177static PyObject *
7178unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7179{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007180 if (PyTuple_GET_SIZE(args) == 0)
7181 return do_strip(self, LEFTSTRIP); /* Common case */
7182 else
7183 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007184}
7185
7186
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007187PyDoc_STRVAR(rstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007188 "S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007189\n\
7190Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007191If chars is given and not None, remove characters in chars instead.\n\
7192If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007193
7194static PyObject *
7195unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7196{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007197 if (PyTuple_GET_SIZE(args) == 0)
7198 return do_strip(self, RIGHTSTRIP); /* Common case */
7199 else
7200 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007201}
7202
7203
Guido van Rossumd57fd912000-03-10 22:53:23 +00007204static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007205unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007206{
7207 PyUnicodeObject *u;
7208 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007209 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007210 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007211
7212 if (len < 0)
7213 len = 0;
7214
Tim Peters7a29bd52001-09-12 03:03:31 +00007215 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007216 /* no repeat, return original string */
7217 Py_INCREF(str);
7218 return (PyObject*) str;
7219 }
Tim Peters8f422462000-09-09 06:13:41 +00007220
7221 /* ensure # of chars needed doesn't overflow int and # of bytes
7222 * needed doesn't overflow size_t
7223 */
7224 nchars = len * str->length;
7225 if (len && nchars / len != str->length) {
7226 PyErr_SetString(PyExc_OverflowError,
7227 "repeated string is too long");
7228 return NULL;
7229 }
7230 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7231 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7232 PyErr_SetString(PyExc_OverflowError,
7233 "repeated string is too long");
7234 return NULL;
7235 }
7236 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007237 if (!u)
7238 return NULL;
7239
7240 p = u->str;
7241
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007242 if (str->length == 1 && len > 0) {
7243 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007244 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007245 Py_ssize_t done = 0; /* number of characters copied this far */
7246 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007247 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007248 done = str->length;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007249 }
7250 while (done < nchars) {
Neal Norwitz4677fbf72008-03-25 04:18:18 +00007251 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007252 Py_UNICODE_COPY(p+done, p, n);
7253 done += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007254 }
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007255 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007256
7257 return (PyObject*) u;
7258}
7259
7260PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007261 PyObject *subobj,
7262 PyObject *replobj,
7263 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007264{
7265 PyObject *self;
7266 PyObject *str1;
7267 PyObject *str2;
7268 PyObject *result;
7269
7270 self = PyUnicode_FromObject(obj);
7271 if (self == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007272 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007273 str1 = PyUnicode_FromObject(subobj);
7274 if (str1 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007275 Py_DECREF(self);
7276 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007277 }
7278 str2 = PyUnicode_FromObject(replobj);
7279 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007280 Py_DECREF(self);
7281 Py_DECREF(str1);
7282 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007283 }
Tim Petersced69f82003-09-16 20:30:58 +00007284 result = replace((PyUnicodeObject *)self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007285 (PyUnicodeObject *)str1,
7286 (PyUnicodeObject *)str2,
7287 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007288 Py_DECREF(self);
7289 Py_DECREF(str1);
7290 Py_DECREF(str2);
7291 return result;
7292}
7293
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007294PyDoc_STRVAR(replace__doc__,
Ezio Melotti2f06b782010-06-26 18:44:42 +00007295 "S.replace(old, new[, count]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007296\n\
7297Return a copy of S with all occurrences of substring\n\
Georg Brandl30fadc12008-05-30 07:54:16 +00007298old replaced by new. If the optional argument count is\n\
7299given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007300
7301static PyObject*
7302unicode_replace(PyUnicodeObject *self, PyObject *args)
7303{
7304 PyUnicodeObject *str1;
7305 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007306 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007307 PyObject *result;
7308
Martin v. Löwis18e16552006-02-15 17:27:45 +00007309 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007310 return NULL;
7311 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7312 if (str1 == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007313 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007314 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007315 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007316 Py_DECREF(str1);
7317 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007318 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007319
7320 result = replace(self, str1, str2, maxcount);
7321
7322 Py_DECREF(str1);
7323 Py_DECREF(str2);
7324 return result;
7325}
7326
7327static
7328PyObject *unicode_repr(PyObject *unicode)
7329{
7330 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007331 PyUnicode_GET_SIZE(unicode),
7332 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007333}
7334
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007335PyDoc_STRVAR(rfind__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007336 "S.rfind(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007337\n\
7338Return the highest index in S where substring sub is found,\n\
Senthil Kumaran5e3a19d2011-07-27 23:36:51 +08007339such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007340arguments start and end are interpreted as in slice notation.\n\
7341\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007342Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007343
7344static PyObject *
7345unicode_rfind(PyUnicodeObject *self, PyObject *args)
7346{
Jesus Cea44e81682011-04-20 16:39:15 +02007347 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007348 Py_ssize_t start;
7349 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007350 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007351
Jesus Cea44e81682011-04-20 16:39:15 +02007352 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
7353 &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007354 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007355
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007356 result = stringlib_rfind_slice(
7357 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7358 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7359 start, end
7360 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007361
7362 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007363
7364 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007365}
7366
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007367PyDoc_STRVAR(rindex__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007368 "S.rindex(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007369\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007370Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007371
7372static PyObject *
7373unicode_rindex(PyUnicodeObject *self, PyObject *args)
7374{
Jesus Cea44e81682011-04-20 16:39:15 +02007375 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007376 Py_ssize_t start;
7377 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007378 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007379
Jesus Cea44e81682011-04-20 16:39:15 +02007380 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
7381 &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007382 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007383
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007384 result = stringlib_rfind_slice(
7385 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7386 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7387 start, end
7388 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007389
7390 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007391
Guido van Rossumd57fd912000-03-10 22:53:23 +00007392 if (result < 0) {
7393 PyErr_SetString(PyExc_ValueError, "substring not found");
7394 return NULL;
7395 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007396 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007397}
7398
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007399PyDoc_STRVAR(rjust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007400 "S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007401\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007402Return S right-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007403done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007404
7405static PyObject *
7406unicode_rjust(PyUnicodeObject *self, PyObject *args)
7407{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007408 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007409 Py_UNICODE fillchar = ' ';
7410
Martin v. Löwis412fb672006-04-13 06:34:32 +00007411 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007412 return NULL;
7413
Tim Peters7a29bd52001-09-12 03:03:31 +00007414 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007415 Py_INCREF(self);
7416 return (PyObject*) self;
7417 }
7418
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007419 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007420}
7421
Guido van Rossumd57fd912000-03-10 22:53:23 +00007422static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007423unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007424{
7425 /* standard clamping */
7426 if (start < 0)
7427 start = 0;
7428 if (end < 0)
7429 end = 0;
7430 if (end > self->length)
7431 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007432 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007433 /* full slice, return original string */
7434 Py_INCREF(self);
7435 return (PyObject*) self;
7436 }
7437 if (start > end)
7438 start = end;
7439 /* copy slice */
7440 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007441 end - start);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007442}
7443
7444PyObject *PyUnicode_Split(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007445 PyObject *sep,
7446 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007447{
7448 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007449
Guido van Rossumd57fd912000-03-10 22:53:23 +00007450 s = PyUnicode_FromObject(s);
7451 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007452 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007453 if (sep != NULL) {
7454 sep = PyUnicode_FromObject(sep);
7455 if (sep == NULL) {
7456 Py_DECREF(s);
7457 return NULL;
7458 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007459 }
7460
7461 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7462
7463 Py_DECREF(s);
7464 Py_XDECREF(sep);
7465 return result;
7466}
7467
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007468PyDoc_STRVAR(split__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007469 "S.split([sep [,maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007470\n\
7471Return a list of the words in S, using sep as the\n\
7472delimiter string. If maxsplit is given, at most maxsplit\n\
Georg Brandldfb77db2008-05-11 09:11:40 +00007473splits are done. If sep is not specified or is None, any\n\
Georg Brandlecbbd942008-05-11 20:53:55 +00007474whitespace string is a separator and empty strings are\n\
7475removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007476
7477static PyObject*
7478unicode_split(PyUnicodeObject *self, PyObject *args)
7479{
7480 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007481 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007482
Martin v. Löwis18e16552006-02-15 17:27:45 +00007483 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007484 return NULL;
7485
7486 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007487 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007488 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007489 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007490 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007491 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007492}
7493
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007494PyObject *
7495PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7496{
7497 PyObject* str_obj;
7498 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007499 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00007500
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007501 str_obj = PyUnicode_FromObject(str_in);
7502 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007503 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007504 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00007505 if (!sep_obj) {
7506 Py_DECREF(str_obj);
7507 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007508 }
7509
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007510 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00007511 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7512 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7513 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007514
Fredrik Lundhb9479482006-05-26 17:22:38 +00007515 Py_DECREF(sep_obj);
7516 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007517
7518 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007519}
7520
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007521
7522PyObject *
7523PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7524{
7525 PyObject* str_obj;
7526 PyObject* sep_obj;
7527 PyObject* out;
7528
7529 str_obj = PyUnicode_FromObject(str_in);
7530 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007531 return NULL;
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007532 sep_obj = PyUnicode_FromObject(sep_in);
7533 if (!sep_obj) {
7534 Py_DECREF(str_obj);
7535 return NULL;
7536 }
7537
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007538 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007539 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7540 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7541 );
7542
7543 Py_DECREF(sep_obj);
7544 Py_DECREF(str_obj);
7545
7546 return out;
7547}
7548
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007549PyDoc_STRVAR(partition__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007550 "S.partition(sep) -> (head, sep, tail)\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007551\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007552Search for the separator sep in S, and return the part before it,\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007553the separator itself, and the part after it. If the separator is not\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007554found, return S and two empty strings.");
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007555
7556static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00007557unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007558{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007559 return PyUnicode_Partition((PyObject *)self, separator);
7560}
7561
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007562PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti1fafaab2010-01-25 11:24:37 +00007563 "S.rpartition(sep) -> (head, sep, tail)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007564\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007565Search for the separator sep in S, starting at the end of S, and return\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007566the part before it, the separator itself, and the part after it. If the\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007567separator is not found, return two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007568
7569static PyObject*
7570unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7571{
7572 return PyUnicode_RPartition((PyObject *)self, separator);
7573}
7574
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007575PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007576 PyObject *sep,
7577 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007578{
7579 PyObject *result;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007580
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007581 s = PyUnicode_FromObject(s);
7582 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007583 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007584 if (sep != NULL) {
7585 sep = PyUnicode_FromObject(sep);
7586 if (sep == NULL) {
7587 Py_DECREF(s);
7588 return NULL;
7589 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007590 }
7591
7592 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7593
7594 Py_DECREF(s);
7595 Py_XDECREF(sep);
7596 return result;
7597}
7598
7599PyDoc_STRVAR(rsplit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007600 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007601\n\
7602Return a list of the words in S, using sep as the\n\
7603delimiter string, starting at the end of the string and\n\
7604working to the front. If maxsplit is given, at most maxsplit\n\
7605splits are done. If sep is not specified, any whitespace string\n\
7606is a separator.");
7607
7608static PyObject*
7609unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7610{
7611 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007612 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007613
Martin v. Löwis18e16552006-02-15 17:27:45 +00007614 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007615 return NULL;
7616
7617 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007618 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007619 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007620 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007621 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007622 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007623}
7624
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007625PyDoc_STRVAR(splitlines__doc__,
Raymond Hettingeraad5b022012-06-02 01:42:58 -04007626 "S.splitlines(keepends=False) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007627\n\
7628Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007629Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007630is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007631
7632static PyObject*
7633unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7634{
Guido van Rossum86662912000-04-11 15:38:46 +00007635 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007636
Guido van Rossum86662912000-04-11 15:38:46 +00007637 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007638 return NULL;
7639
Guido van Rossum86662912000-04-11 15:38:46 +00007640 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007641}
7642
7643static
7644PyObject *unicode_str(PyUnicodeObject *self)
7645{
Fred Drakee4315f52000-05-09 19:53:39 +00007646 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007647}
7648
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007649PyDoc_STRVAR(swapcase__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007650 "S.swapcase() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007651\n\
7652Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007653and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007654
7655static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007656unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007657{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007658 return fixup(self, fixswapcase);
7659}
7660
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007661PyDoc_STRVAR(translate__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007662 "S.translate(table) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007663\n\
7664Return a copy of the string S, where all characters have been mapped\n\
7665through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007666Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7667Unmapped characters are left untouched. Characters mapped to None\n\
7668are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007669
7670static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007671unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007672{
Tim Petersced69f82003-09-16 20:30:58 +00007673 return PyUnicode_TranslateCharmap(self->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007674 self->length,
7675 table,
7676 "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007677}
7678
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007679PyDoc_STRVAR(upper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007680 "S.upper() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007681\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007682Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007683
7684static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007685unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007686{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007687 return fixup(self, fixupper);
7688}
7689
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007690PyDoc_STRVAR(zfill__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007691 "S.zfill(width) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007692\n\
Georg Brandl98064072008-09-09 19:26:00 +00007693Pad a numeric string S with zeros on the left, to fill a field\n\
7694of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007695
7696static PyObject *
7697unicode_zfill(PyUnicodeObject *self, PyObject *args)
7698{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007699 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007700 PyUnicodeObject *u;
7701
Martin v. Löwis18e16552006-02-15 17:27:45 +00007702 Py_ssize_t width;
7703 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007704 return NULL;
7705
7706 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007707 if (PyUnicode_CheckExact(self)) {
7708 Py_INCREF(self);
7709 return (PyObject*) self;
7710 }
7711 else
7712 return PyUnicode_FromUnicode(
7713 PyUnicode_AS_UNICODE(self),
7714 PyUnicode_GET_SIZE(self)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007715 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007716 }
7717
7718 fill = width - self->length;
7719
7720 u = pad(self, fill, 0, '0');
7721
Walter Dörwald068325e2002-04-15 13:36:47 +00007722 if (u == NULL)
7723 return NULL;
7724
Guido van Rossumd57fd912000-03-10 22:53:23 +00007725 if (u->str[fill] == '+' || u->str[fill] == '-') {
7726 /* move sign to beginning of string */
7727 u->str[0] = u->str[fill];
7728 u->str[fill] = '0';
7729 }
7730
7731 return (PyObject*) u;
7732}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007733
#if 0
/* Compiled out.  Would return the file-level numfree counter as a
   Python int. */
static PyObject *
free_listsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyInt_FromLong(count);
}
#endif
7741
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007742PyDoc_STRVAR(startswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007743 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007744\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007745Return True if S starts with the specified prefix, False otherwise.\n\
7746With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007747With optional end, stop comparing S at that position.\n\
7748prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007749
7750static PyObject *
7751unicode_startswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007752 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007753{
Georg Brandl24250812006-06-09 18:45:48 +00007754 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007755 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007756 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007757 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007758 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007759
Jesus Cea44e81682011-04-20 16:39:15 +02007760 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007761 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007762 if (PyTuple_Check(subobj)) {
7763 Py_ssize_t i;
7764 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7765 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007766 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007767 if (substring == NULL)
7768 return NULL;
7769 result = tailmatch(self, substring, start, end, -1);
7770 Py_DECREF(substring);
7771 if (result) {
7772 Py_RETURN_TRUE;
7773 }
7774 }
7775 /* nothing matched */
7776 Py_RETURN_FALSE;
7777 }
7778 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottie3685f62011-04-26 05:12:51 +03007779 if (substring == NULL) {
7780 if (PyErr_ExceptionMatches(PyExc_TypeError))
7781 PyErr_Format(PyExc_TypeError, "startswith first arg must be str, "
7782 "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007783 return NULL;
Ezio Melottie3685f62011-04-26 05:12:51 +03007784 }
Georg Brandl24250812006-06-09 18:45:48 +00007785 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007786 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007787 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007788}
7789
7790
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007791PyDoc_STRVAR(endswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007792 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007793\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007794Return True if S ends with the specified suffix, False otherwise.\n\
7795With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007796With optional end, stop comparing S at that position.\n\
7797suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007798
7799static PyObject *
7800unicode_endswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007801 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007802{
Georg Brandl24250812006-06-09 18:45:48 +00007803 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007804 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007805 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007806 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007807 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007808
Jesus Cea44e81682011-04-20 16:39:15 +02007809 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007810 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007811 if (PyTuple_Check(subobj)) {
7812 Py_ssize_t i;
7813 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7814 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007815 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007816 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007817 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007818 result = tailmatch(self, substring, start, end, +1);
7819 Py_DECREF(substring);
7820 if (result) {
7821 Py_RETURN_TRUE;
7822 }
7823 }
7824 Py_RETURN_FALSE;
7825 }
7826 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottie3685f62011-04-26 05:12:51 +03007827 if (substring == NULL) {
7828 if (PyErr_ExceptionMatches(PyExc_TypeError))
7829 PyErr_Format(PyExc_TypeError, "endswith first arg must be str, "
7830 "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007831 return NULL;
Ezio Melottie3685f62011-04-26 05:12:51 +03007832 }
Georg Brandl24250812006-06-09 18:45:48 +00007833 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007834 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007835 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007836}
7837
7838
Eric Smitha9f7d622008-02-17 19:46:49 +00007839/* Implements do_string_format, which is unicode because of stringlib */
7840#include "stringlib/string_format.h"
7841
7842PyDoc_STRVAR(format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007843 "S.format(*args, **kwargs) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007844\n\
Eric Smith6c840852010-11-06 19:43:44 +00007845Return a formatted version of S, using substitutions from args and kwargs.\n\
7846The substitutions are identified by braces ('{' and '}').");
Eric Smitha9f7d622008-02-17 19:46:49 +00007847
Eric Smithdc13b792008-05-30 18:10:04 +00007848static PyObject *
7849unicode__format__(PyObject *self, PyObject *args)
7850{
7851 PyObject *format_spec;
7852 PyObject *result = NULL;
7853 PyObject *tmp = NULL;
7854
7855 /* If 2.x, convert format_spec to the same type as value */
7856 /* This is to allow things like u''.format('') */
7857 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7858 goto done;
7859 if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7860 PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007861 "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
Eric Smithdc13b792008-05-30 18:10:04 +00007862 goto done;
7863 }
7864 tmp = PyObject_Unicode(format_spec);
7865 if (tmp == NULL)
7866 goto done;
7867 format_spec = tmp;
7868
7869 result = _PyUnicode_FormatAdvanced(self,
7870 PyUnicode_AS_UNICODE(format_spec),
7871 PyUnicode_GET_SIZE(format_spec));
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007872 done:
Eric Smithdc13b792008-05-30 18:10:04 +00007873 Py_XDECREF(tmp);
7874 return result;
7875}
7876
Eric Smitha9f7d622008-02-17 19:46:49 +00007877PyDoc_STRVAR(p_format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007878 "S.__format__(format_spec) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007879\n\
Eric Smith6c840852010-11-06 19:43:44 +00007880Return a formatted version of S as described by format_spec.");
Eric Smitha9f7d622008-02-17 19:46:49 +00007881
Robert Schuppenies901c9972008-06-10 10:10:31 +00007882static PyObject *
7883unicode__sizeof__(PyUnicodeObject *v)
7884{
Robert Schuppenies9be2ec12008-07-10 15:24:04 +00007885 return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7886 sizeof(Py_UNICODE) * (v->length + 1));
Robert Schuppenies901c9972008-06-10 10:10:31 +00007887}
7888
7889PyDoc_STRVAR(sizeof__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007890 "S.__sizeof__() -> size of S in memory, in bytes\n\
Robert Schuppenies901c9972008-06-10 10:10:31 +00007891\n\
7892");
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007893
7894static PyObject *
7895unicode_getnewargs(PyUnicodeObject *v)
7896{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007897 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007898}
7899
7900
Guido van Rossumd57fd912000-03-10 22:53:23 +00007901static PyMethodDef unicode_methods[] = {
Benjamin Peterson332d7212009-09-18 21:14:55 +00007902 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007903 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7904 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007905 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007906 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7907 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7908 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7909 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7910 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7911 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7912 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00007913 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007914 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7915 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7916 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007917 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Benjamin Peterson332d7212009-09-18 21:14:55 +00007918 {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007919/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7920 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7921 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7922 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007923 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007924 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007925 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007926 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007927 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7928 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7929 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7930 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7931 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7932 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7933 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7934 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7935 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7936 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7937 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7938 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7939 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7940 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007941 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smitha9f7d622008-02-17 19:46:49 +00007942 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
7943 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
7944 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
7945 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Robert Schuppenies901c9972008-06-10 10:10:31 +00007946 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007947#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007948 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007949#endif
7950
7951#if 0
7952 /* This one is just used for debugging the implementation. */
Christian Heimes5b970ad2008-02-06 13:33:44 +00007953 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007954#endif
7955
Benjamin Peterson857ce152009-01-31 16:29:18 +00007956 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007957 {NULL, NULL}
7958};
7959
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007960static PyObject *
7961unicode_mod(PyObject *v, PyObject *w)
7962{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007963 if (!PyUnicode_Check(v)) {
7964 Py_INCREF(Py_NotImplemented);
7965 return Py_NotImplemented;
7966 }
7967 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007968}
7969
7970static PyNumberMethods unicode_as_number = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007971 0, /*nb_add*/
7972 0, /*nb_subtract*/
7973 0, /*nb_multiply*/
7974 0, /*nb_divide*/
7975 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007976};
7977
Guido van Rossumd57fd912000-03-10 22:53:23 +00007978static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007979 (lenfunc) unicode_length, /* sq_length */
7980 PyUnicode_Concat, /* sq_concat */
7981 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7982 (ssizeargfunc) unicode_getitem, /* sq_item */
7983 (ssizessizeargfunc) unicode_slice, /* sq_slice */
7984 0, /* sq_ass_item */
7985 0, /* sq_ass_slice */
7986 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007987};
7988
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007989static PyObject*
7990unicode_subscript(PyUnicodeObject* self, PyObject* item)
7991{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00007992 if (PyIndex_Check(item)) {
7993 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007994 if (i == -1 && PyErr_Occurred())
7995 return NULL;
7996 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007997 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007998 return unicode_getitem(self, i);
7999 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008000 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008001 Py_UNICODE* source_buf;
8002 Py_UNICODE* result_buf;
8003 PyObject* result;
8004
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008005 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008006 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008007 return NULL;
8008 }
8009
8010 if (slicelength <= 0) {
8011 return PyUnicode_FromUnicode(NULL, 0);
Thomas Wouters3ccec682007-08-28 15:28:19 +00008012 } else if (start == 0 && step == 1 && slicelength == self->length &&
8013 PyUnicode_CheckExact(self)) {
8014 Py_INCREF(self);
8015 return (PyObject *)self;
8016 } else if (step == 1) {
8017 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008018 } else {
8019 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Neal Norwitz419fd492008-03-17 20:22:43 +00008020 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8021 sizeof(Py_UNICODE));
Benjamin Peterson857ce152009-01-31 16:29:18 +00008022
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008023 if (result_buf == NULL)
8024 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008025
8026 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8027 result_buf[i] = source_buf[cur];
8028 }
Tim Petersced69f82003-09-16 20:30:58 +00008029
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008030 result = PyUnicode_FromUnicode(result_buf, slicelength);
Neal Norwitz419fd492008-03-17 20:22:43 +00008031 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008032 return result;
8033 }
8034 } else {
8035 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8036 return NULL;
8037 }
8038}
8039
8040static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00008041 (lenfunc)unicode_length, /* mp_length */
8042 (binaryfunc)unicode_subscript, /* mp_subscript */
8043 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008044};
8045
Martin v. Löwis18e16552006-02-15 17:27:45 +00008046static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008047unicode_buffer_getreadbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008048 Py_ssize_t index,
8049 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008050{
8051 if (index != 0) {
8052 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008053 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008054 return -1;
8055 }
8056 *ptr = (void *) self->str;
8057 return PyUnicode_GET_DATA_SIZE(self);
8058}
8059
Martin v. Löwis18e16552006-02-15 17:27:45 +00008060static Py_ssize_t
8061unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008062 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008063{
8064 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008065 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008066 return -1;
8067}
8068
8069static int
8070unicode_buffer_getsegcount(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008071 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008072{
8073 if (lenp)
8074 *lenp = PyUnicode_GET_DATA_SIZE(self);
8075 return 1;
8076}
8077
Martin v. Löwiseb079f12006-02-16 14:32:27 +00008078static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008079unicode_buffer_getcharbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008080 Py_ssize_t index,
8081 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008082{
8083 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00008084
Guido van Rossumd57fd912000-03-10 22:53:23 +00008085 if (index != 0) {
8086 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008087 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008088 return -1;
8089 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008090 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008091 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008092 return -1;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008093 *ptr = (void *) PyString_AS_STRING(str);
8094 return PyString_GET_SIZE(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008095}
8096
8097/* Helpers for PyUnicode_Format() */
8098
8099static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008100getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008101{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008102 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008103 if (argidx < arglen) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008104 (*p_argidx)++;
8105 if (arglen < 0)
8106 return args;
8107 else
8108 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008109 }
8110 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008111 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008112 return NULL;
8113}
8114
/* Bit flags combined into the `flags` argument of the format helpers.
   F_ALT is the one tested by formatfloat() and formatint() (alternate
   form, "#").  NOTE(review): the remaining names presumably mirror the
   printf-style '-', '+', ' ' and '0' flags -- confirm in
   PyUnicode_Format(). */
#define F_LJUST (1 << 0)
#define F_SIGN  (1 << 1)
#define F_BLANK (1 << 2)
#define F_ALT   (1 << 3)
#define F_ZERO  (1 << 4)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008120
Martin v. Löwis18e16552006-02-15 17:27:45 +00008121static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008122strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008123{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008124 register Py_ssize_t i;
8125 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008126 for (i = len - 1; i >= 0; i--)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008127 buffer[i] = (Py_UNICODE) charbuffer[i];
Guido van Rossumd57fd912000-03-10 22:53:23 +00008128
Guido van Rossumd57fd912000-03-10 22:53:23 +00008129 return len;
8130}
8131
Neal Norwitzfc76d632006-01-10 06:03:13 +00008132static int
Neal Norwitzfc76d632006-01-10 06:03:13 +00008133longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8134{
Tim Peters15231542006-02-16 01:08:01 +00008135 Py_ssize_t result;
8136
Neal Norwitzfc76d632006-01-10 06:03:13 +00008137 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008138 result = strtounicode(buffer, (char *)buffer);
8139 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008140}
8141
Guido van Rossum078151d2002-08-11 04:24:12 +00008142/* XXX To save some code duplication, formatfloat/long/int could have been
8143 shared with stringobject.c, converting from 8-bit to Unicode after the
8144 formatting is done. */
8145
Mark Dickinson18cfada2009-11-23 18:46:41 +00008146/* Returns a new reference to a PyUnicode object, or NULL on failure. */
8147
8148static PyObject *
8149formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008150{
Mark Dickinson18cfada2009-11-23 18:46:41 +00008151 char *p;
8152 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008153 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008154
Guido van Rossumd57fd912000-03-10 22:53:23 +00008155 x = PyFloat_AsDouble(v);
8156 if (x == -1.0 && PyErr_Occurred())
Mark Dickinson18cfada2009-11-23 18:46:41 +00008157 return NULL;
8158
Guido van Rossumd57fd912000-03-10 22:53:23 +00008159 if (prec < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008160 prec = 6;
Mark Dickinsond4814bf2009-03-29 16:24:29 +00008161
Mark Dickinson18cfada2009-11-23 18:46:41 +00008162 p = PyOS_double_to_string(x, type, prec,
8163 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
8164 if (p == NULL)
8165 return NULL;
8166 result = PyUnicode_FromStringAndSize(p, strlen(p));
8167 PyMem_Free(p);
8168 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008169}
8170
Tim Peters38fd5b62000-09-21 05:43:11 +00008171static PyObject*
8172formatlong(PyObject *val, int flags, int prec, int type)
8173{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008174 char *buf;
8175 int i, len;
8176 PyObject *str; /* temporary string object. */
8177 PyUnicodeObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008178
Benjamin Peterson857ce152009-01-31 16:29:18 +00008179 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8180 if (!str)
8181 return NULL;
8182 result = _PyUnicode_New(len);
8183 if (!result) {
8184 Py_DECREF(str);
8185 return NULL;
8186 }
8187 for (i = 0; i < len; i++)
8188 result->str[i] = buf[i];
8189 result->str[len] = 0;
8190 Py_DECREF(str);
8191 return (PyObject*)result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008192}
8193
Guido van Rossumd57fd912000-03-10 22:53:23 +00008194static int
8195formatint(Py_UNICODE *buf,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008196 size_t buflen,
8197 int flags,
8198 int prec,
8199 int type,
8200 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008201{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008202 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008203 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8204 * + 1 + 1
8205 * = 24
8206 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008207 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008208 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008209 long x;
8210
8211 x = PyInt_AsLong(v);
8212 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008213 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008214 if (x < 0 && type == 'u') {
8215 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008216 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008217 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8218 sign = "-";
8219 else
8220 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008221 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008222 prec = 1;
8223
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008224 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8225 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008226 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008227 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008228 PyErr_SetString(PyExc_OverflowError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008229 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008230 return -1;
8231 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008232
8233 if ((flags & F_ALT) &&
8234 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00008235 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008236 * of issues that cause pain:
8237 * - when 0 is being converted, the C standard leaves off
8238 * the '0x' or '0X', which is inconsistent with other
8239 * %#x/%#X conversions and inconsistent with Python's
8240 * hex() function
8241 * - there are platforms that violate the standard and
8242 * convert 0 with the '0x' or '0X'
8243 * (Metrowerks, Compaq Tru64)
8244 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008245 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008246 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008247 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008248 * We can achieve the desired consistency by inserting our
8249 * own '0x' or '0X' prefix, and substituting %x/%X in place
8250 * of %#x/%#X.
8251 *
8252 * Note that this is the same approach as used in
8253 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008254 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008255 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8256 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008257 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008258 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008259 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8260 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008261 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008262 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008263 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008264 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008265 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008266 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008267}
8268
8269static int
8270formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008271 size_t buflen,
8272 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008273{
Ezio Melotti32125152010-02-25 17:36:04 +00008274 PyObject *unistr;
8275 char *str;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008276 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008277 if (PyUnicode_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008278 if (PyUnicode_GET_SIZE(v) != 1)
8279 goto onError;
8280 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008281 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008282
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008283 else if (PyString_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008284 if (PyString_GET_SIZE(v) != 1)
8285 goto onError;
Ezio Melotti32125152010-02-25 17:36:04 +00008286 /* #7649: "u'%c' % char" should behave like "u'%s' % char" and fail
8287 with a UnicodeDecodeError if 'char' is not decodable with the
8288 default encoding (usually ASCII, but it might be something else) */
8289 str = PyString_AS_STRING(v);
8290 if ((unsigned char)str[0] > 0x7F) {
8291 /* the char is not ASCII; try to decode the string using the
8292 default encoding and return -1 to let the UnicodeDecodeError
8293 be raised if the string can't be decoded */
8294 unistr = PyUnicode_Decode(str, 1, NULL, "strict");
8295 if (unistr == NULL)
8296 return -1;
8297 buf[0] = PyUnicode_AS_UNICODE(unistr)[0];
8298 Py_DECREF(unistr);
8299 }
8300 else
8301 buf[0] = (Py_UNICODE)str[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008302 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008303
8304 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008305 /* Integer input truncated to a character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008306 long x;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008307 x = PyInt_AsLong(v);
8308 if (x == -1 && PyErr_Occurred())
8309 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008310#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008311 if (x < 0 || x > 0x10ffff) {
8312 PyErr_SetString(PyExc_OverflowError,
8313 "%c arg not in range(0x110000) "
8314 "(wide Python build)");
8315 return -1;
8316 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008317#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008318 if (x < 0 || x > 0xffff) {
8319 PyErr_SetString(PyExc_OverflowError,
8320 "%c arg not in range(0x10000) "
8321 "(narrow Python build)");
8322 return -1;
8323 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008324#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008325 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008326 }
8327 buf[1] = '\0';
8328 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008329
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008330 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008331 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008332 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008333 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008334}
8335
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length (in Py_UNICODE units) of the scratch buffer
   in which ints and chars are formatted.  XXX This is a magic number.
   Each formatting routine does bounds checking to ensure no overflow, but
   a better solution may be to malloc a buffer of appropriate size for
   each format.  For now, the current solution is sufficient.

   The expansion is fully parenthesised so that the macro behaves as a
   single primary expression wherever it is used (e.g. `sizeof FORMATBUFLEN`).
*/
#define FORMATBUFLEN ((size_t)120)
8345
Guido van Rossumd57fd912000-03-10 22:53:23 +00008346PyObject *PyUnicode_Format(PyObject *format,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008347 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008348{
8349 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008350 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008351 int args_owned = 0;
8352 PyUnicodeObject *result = NULL;
8353 PyObject *dict = NULL;
8354 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008355
Guido van Rossumd57fd912000-03-10 22:53:23 +00008356 if (format == NULL || args == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008357 PyErr_BadInternalCall();
8358 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008359 }
8360 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008361 if (uformat == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008362 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008363 fmt = PyUnicode_AS_UNICODE(uformat);
8364 fmtcnt = PyUnicode_GET_SIZE(uformat);
8365
8366 reslen = rescnt = fmtcnt + 100;
8367 result = _PyUnicode_New(reslen);
8368 if (result == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008369 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008370 res = PyUnicode_AS_UNICODE(result);
8371
8372 if (PyTuple_Check(args)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008373 arglen = PyTuple_Size(args);
8374 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008375 }
8376 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008377 arglen = -1;
8378 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008379 }
Benjamin Petersonda2c7eb2013-03-23 22:32:00 -05008380 if (Py_TYPE(args)->tp_as_mapping && Py_TYPE(args)->tp_as_mapping->mp_subscript &&
8381 !PyTuple_Check(args) && !PyObject_TypeCheck(args, &PyBaseString_Type))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008382 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008383
8384 while (--fmtcnt >= 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008385 if (*fmt != '%') {
8386 if (--rescnt < 0) {
8387 rescnt = fmtcnt + 100;
8388 reslen += rescnt;
8389 if (_PyUnicode_Resize(&result, reslen) < 0)
8390 goto onError;
8391 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8392 --rescnt;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008393 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008394 *res++ = *fmt++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008395 }
8396 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008397 /* Got a format specifier */
8398 int flags = 0;
8399 Py_ssize_t width = -1;
8400 int prec = -1;
8401 Py_UNICODE c = '\0';
8402 Py_UNICODE fill;
8403 int isnumok;
8404 PyObject *v = NULL;
8405 PyObject *temp = NULL;
8406 Py_UNICODE *pbuf;
8407 Py_UNICODE sign;
8408 Py_ssize_t len;
Mark Dickinson18cfada2009-11-23 18:46:41 +00008409 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008410
8411 fmt++;
8412 if (*fmt == '(') {
8413 Py_UNICODE *keystart;
8414 Py_ssize_t keylen;
8415 PyObject *key;
8416 int pcount = 1;
8417
8418 if (dict == NULL) {
8419 PyErr_SetString(PyExc_TypeError,
8420 "format requires a mapping");
8421 goto onError;
8422 }
8423 ++fmt;
8424 --fmtcnt;
8425 keystart = fmt;
8426 /* Skip over balanced parentheses */
8427 while (pcount > 0 && --fmtcnt >= 0) {
8428 if (*fmt == ')')
8429 --pcount;
8430 else if (*fmt == '(')
8431 ++pcount;
8432 fmt++;
8433 }
8434 keylen = fmt - keystart - 1;
8435 if (fmtcnt < 0 || pcount > 0) {
8436 PyErr_SetString(PyExc_ValueError,
8437 "incomplete format key");
8438 goto onError;
8439 }
8440#if 0
8441 /* keys are converted to strings using UTF-8 and
8442 then looked up since Python uses strings to hold
8443 variables names etc. in its namespaces and we
8444 wouldn't want to break common idioms. */
8445 key = PyUnicode_EncodeUTF8(keystart,
8446 keylen,
8447 NULL);
8448#else
8449 key = PyUnicode_FromUnicode(keystart, keylen);
8450#endif
8451 if (key == NULL)
8452 goto onError;
8453 if (args_owned) {
8454 Py_DECREF(args);
8455 args_owned = 0;
8456 }
8457 args = PyObject_GetItem(dict, key);
8458 Py_DECREF(key);
8459 if (args == NULL) {
8460 goto onError;
8461 }
8462 args_owned = 1;
8463 arglen = -1;
8464 argidx = -2;
8465 }
8466 while (--fmtcnt >= 0) {
8467 switch (c = *fmt++) {
8468 case '-': flags |= F_LJUST; continue;
8469 case '+': flags |= F_SIGN; continue;
8470 case ' ': flags |= F_BLANK; continue;
8471 case '#': flags |= F_ALT; continue;
8472 case '0': flags |= F_ZERO; continue;
8473 }
8474 break;
8475 }
8476 if (c == '*') {
8477 v = getnextarg(args, arglen, &argidx);
8478 if (v == NULL)
8479 goto onError;
8480 if (!PyInt_Check(v)) {
8481 PyErr_SetString(PyExc_TypeError,
8482 "* wants int");
8483 goto onError;
8484 }
Serhiy Storchaka74f49ab2013-01-19 12:55:39 +02008485 width = PyInt_AsSsize_t(v);
8486 if (width == -1 && PyErr_Occurred())
8487 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008488 if (width < 0) {
8489 flags |= F_LJUST;
8490 width = -width;
8491 }
8492 if (--fmtcnt >= 0)
8493 c = *fmt++;
8494 }
8495 else if (c >= '0' && c <= '9') {
8496 width = c - '0';
8497 while (--fmtcnt >= 0) {
8498 c = *fmt++;
8499 if (c < '0' || c > '9')
8500 break;
Mark Dickinson75d36002012-10-28 10:00:46 +00008501 if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008502 PyErr_SetString(PyExc_ValueError,
8503 "width too big");
8504 goto onError;
8505 }
8506 width = width*10 + (c - '0');
8507 }
8508 }
8509 if (c == '.') {
8510 prec = 0;
8511 if (--fmtcnt >= 0)
8512 c = *fmt++;
8513 if (c == '*') {
8514 v = getnextarg(args, arglen, &argidx);
8515 if (v == NULL)
8516 goto onError;
8517 if (!PyInt_Check(v)) {
8518 PyErr_SetString(PyExc_TypeError,
8519 "* wants int");
8520 goto onError;
8521 }
Serhiy Storchaka74f49ab2013-01-19 12:55:39 +02008522 prec = _PyInt_AsInt(v);
8523 if (prec == -1 && PyErr_Occurred())
8524 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008525 if (prec < 0)
8526 prec = 0;
8527 if (--fmtcnt >= 0)
8528 c = *fmt++;
8529 }
8530 else if (c >= '0' && c <= '9') {
8531 prec = c - '0';
8532 while (--fmtcnt >= 0) {
Stefan Krah0b9201f2010-07-19 18:06:46 +00008533 c = *fmt++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008534 if (c < '0' || c > '9')
8535 break;
Mark Dickinson75d36002012-10-28 10:00:46 +00008536 if (prec > (INT_MAX - ((int)c - '0')) / 10) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008537 PyErr_SetString(PyExc_ValueError,
8538 "prec too big");
8539 goto onError;
8540 }
8541 prec = prec*10 + (c - '0');
8542 }
8543 }
8544 } /* prec */
8545 if (fmtcnt >= 0) {
8546 if (c == 'h' || c == 'l' || c == 'L') {
8547 if (--fmtcnt >= 0)
8548 c = *fmt++;
8549 }
8550 }
8551 if (fmtcnt < 0) {
8552 PyErr_SetString(PyExc_ValueError,
8553 "incomplete format");
8554 goto onError;
8555 }
8556 if (c != '%') {
8557 v = getnextarg(args, arglen, &argidx);
8558 if (v == NULL)
8559 goto onError;
8560 }
8561 sign = 0;
8562 fill = ' ';
8563 switch (c) {
8564
8565 case '%':
8566 pbuf = formatbuf;
8567 /* presume that buffer length is at least 1 */
8568 pbuf[0] = '%';
8569 len = 1;
8570 break;
8571
8572 case 's':
8573 case 'r':
Victor Stinner95affc42010-03-22 12:24:37 +00008574 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008575 temp = v;
8576 Py_INCREF(temp);
8577 }
8578 else {
8579 PyObject *unicode;
8580 if (c == 's')
8581 temp = PyObject_Unicode(v);
8582 else
8583 temp = PyObject_Repr(v);
8584 if (temp == NULL)
8585 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008586 if (PyUnicode_Check(temp))
8587 /* nothing to do */;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008588 else if (PyString_Check(temp)) {
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008589 /* convert to string to Unicode */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008590 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8591 PyString_GET_SIZE(temp),
8592 NULL,
8593 "strict");
8594 Py_DECREF(temp);
8595 temp = unicode;
8596 if (temp == NULL)
8597 goto onError;
8598 }
8599 else {
8600 Py_DECREF(temp);
8601 PyErr_SetString(PyExc_TypeError,
8602 "%s argument has non-string str()");
8603 goto onError;
8604 }
8605 }
8606 pbuf = PyUnicode_AS_UNICODE(temp);
8607 len = PyUnicode_GET_SIZE(temp);
8608 if (prec >= 0 && len > prec)
8609 len = prec;
8610 break;
8611
8612 case 'i':
8613 case 'd':
8614 case 'u':
8615 case 'o':
8616 case 'x':
8617 case 'X':
8618 if (c == 'i')
8619 c = 'd';
8620 isnumok = 0;
8621 if (PyNumber_Check(v)) {
8622 PyObject *iobj=NULL;
8623
8624 if (PyInt_Check(v) || (PyLong_Check(v))) {
8625 iobj = v;
8626 Py_INCREF(iobj);
8627 }
8628 else {
8629 iobj = PyNumber_Int(v);
8630 if (iobj==NULL) iobj = PyNumber_Long(v);
8631 }
8632 if (iobj!=NULL) {
8633 if (PyInt_Check(iobj)) {
8634 isnumok = 1;
8635 pbuf = formatbuf;
8636 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8637 flags, prec, c, iobj);
8638 Py_DECREF(iobj);
8639 if (len < 0)
8640 goto onError;
8641 sign = 1;
8642 }
8643 else if (PyLong_Check(iobj)) {
8644 isnumok = 1;
8645 temp = formatlong(iobj, flags, prec, c);
8646 Py_DECREF(iobj);
8647 if (!temp)
8648 goto onError;
8649 pbuf = PyUnicode_AS_UNICODE(temp);
8650 len = PyUnicode_GET_SIZE(temp);
8651 sign = 1;
8652 }
8653 else {
8654 Py_DECREF(iobj);
8655 }
8656 }
8657 }
8658 if (!isnumok) {
8659 PyErr_Format(PyExc_TypeError,
8660 "%%%c format: a number is required, "
8661 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8662 goto onError;
8663 }
8664 if (flags & F_ZERO)
8665 fill = '0';
8666 break;
8667
8668 case 'e':
8669 case 'E':
8670 case 'f':
8671 case 'F':
8672 case 'g':
8673 case 'G':
Mark Dickinson18cfada2009-11-23 18:46:41 +00008674 temp = formatfloat(v, flags, prec, c);
8675 if (temp == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008676 goto onError;
Mark Dickinson18cfada2009-11-23 18:46:41 +00008677 pbuf = PyUnicode_AS_UNICODE(temp);
8678 len = PyUnicode_GET_SIZE(temp);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008679 sign = 1;
8680 if (flags & F_ZERO)
8681 fill = '0';
8682 break;
8683
8684 case 'c':
8685 pbuf = formatbuf;
8686 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8687 if (len < 0)
8688 goto onError;
8689 break;
8690
8691 default:
8692 PyErr_Format(PyExc_ValueError,
8693 "unsupported format character '%c' (0x%x) "
8694 "at index %zd",
8695 (31<=c && c<=126) ? (char)c : '?',
8696 (int)c,
8697 (Py_ssize_t)(fmt - 1 -
8698 PyUnicode_AS_UNICODE(uformat)));
Benjamin Peterson857ce152009-01-31 16:29:18 +00008699 goto onError;
8700 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008701 if (sign) {
8702 if (*pbuf == '-' || *pbuf == '+') {
8703 sign = *pbuf++;
8704 len--;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008705 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008706 else if (flags & F_SIGN)
8707 sign = '+';
8708 else if (flags & F_BLANK)
8709 sign = ' ';
8710 else
8711 sign = 0;
8712 }
8713 if (width < len)
8714 width = len;
8715 if (rescnt - (sign != 0) < width) {
8716 reslen -= rescnt;
8717 rescnt = width + fmtcnt + 100;
8718 reslen += rescnt;
8719 if (reslen < 0) {
8720 Py_XDECREF(temp);
8721 PyErr_NoMemory();
8722 goto onError;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008723 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008724 if (_PyUnicode_Resize(&result, reslen) < 0) {
8725 Py_XDECREF(temp);
8726 goto onError;
8727 }
8728 res = PyUnicode_AS_UNICODE(result)
8729 + reslen - rescnt;
8730 }
8731 if (sign) {
8732 if (fill != ' ')
8733 *res++ = sign;
8734 rescnt--;
8735 if (width > len)
8736 width--;
8737 }
8738 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8739 assert(pbuf[0] == '0');
8740 assert(pbuf[1] == c);
8741 if (fill != ' ') {
8742 *res++ = *pbuf++;
8743 *res++ = *pbuf++;
8744 }
8745 rescnt -= 2;
8746 width -= 2;
8747 if (width < 0)
8748 width = 0;
8749 len -= 2;
8750 }
8751 if (width > len && !(flags & F_LJUST)) {
8752 do {
8753 --rescnt;
8754 *res++ = fill;
8755 } while (--width > len);
8756 }
8757 if (fill == ' ') {
8758 if (sign)
8759 *res++ = sign;
8760 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8761 assert(pbuf[0] == '0');
8762 assert(pbuf[1] == c);
8763 *res++ = *pbuf++;
8764 *res++ = *pbuf++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008765 }
8766 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008767 Py_UNICODE_COPY(res, pbuf, len);
8768 res += len;
8769 rescnt -= len;
8770 while (--width >= len) {
8771 --rescnt;
8772 *res++ = ' ';
8773 }
8774 if (dict && (argidx < arglen) && c != '%') {
8775 PyErr_SetString(PyExc_TypeError,
8776 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008777 Py_XDECREF(temp);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008778 goto onError;
8779 }
8780 Py_XDECREF(temp);
8781 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008782 } /* until end */
8783 if (argidx < arglen && !dict) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008784 PyErr_SetString(PyExc_TypeError,
8785 "not all arguments converted during string formatting");
8786 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008787 }
8788
Thomas Woutersa96affe2006-03-12 00:29:36 +00008789 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008790 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008791 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008792 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008793 }
8794 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008795 return (PyObject *)result;
8796
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008797 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008798 Py_XDECREF(result);
8799 Py_DECREF(uformat);
8800 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008801 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008802 }
8803 return NULL;
8804}
8805
8806static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008807 (readbufferproc) unicode_buffer_getreadbuf,
8808 (writebufferproc) unicode_buffer_getwritebuf,
8809 (segcountproc) unicode_buffer_getsegcount,
8810 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008811};
8812
Jeremy Hylton938ace62002-07-17 16:30:39 +00008813static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008814unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8815
Tim Peters6d6c1a32001-08-02 04:15:00 +00008816static PyObject *
8817unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8818{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008819 PyObject *x = NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008820 static char *kwlist[] = {"string", "encoding", "errors", 0};
8821 char *encoding = NULL;
8822 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00008823
Benjamin Peterson857ce152009-01-31 16:29:18 +00008824 if (type != &PyUnicode_Type)
8825 return unicode_subtype_new(type, args, kwds);
8826 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008827 kwlist, &x, &encoding, &errors))
Benjamin Peterson857ce152009-01-31 16:29:18 +00008828 return NULL;
8829 if (x == NULL)
8830 return (PyObject *)_PyUnicode_New(0);
8831 if (encoding == NULL && errors == NULL)
8832 return PyObject_Unicode(x);
8833 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008834 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008835}
8836
Guido van Rossume023fe02001-08-30 03:12:59 +00008837static PyObject *
8838unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8839{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008840 PyUnicodeObject *tmp, *pnew;
8841 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008842
Benjamin Peterson857ce152009-01-31 16:29:18 +00008843 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8844 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8845 if (tmp == NULL)
8846 return NULL;
8847 assert(PyUnicode_Check(tmp));
8848 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8849 if (pnew == NULL) {
8850 Py_DECREF(tmp);
8851 return NULL;
8852 }
8853 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8854 if (pnew->str == NULL) {
8855 _Py_ForgetReference((PyObject *)pnew);
8856 PyObject_Del(pnew);
8857 Py_DECREF(tmp);
8858 return PyErr_NoMemory();
8859 }
8860 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8861 pnew->length = n;
8862 pnew->hash = tmp->hash;
8863 Py_DECREF(tmp);
8864 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008865}
8866
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008867PyDoc_STRVAR(unicode_doc,
Chris Jerdonekad4b0002012-10-07 20:37:54 -07008868 "unicode(object='') -> unicode object\n\
8869unicode(string[, encoding[, errors]]) -> unicode object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00008870\n\
8871Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008872encoding defaults to the current default string encoding.\n\
8873errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008874
Guido van Rossumd57fd912000-03-10 22:53:23 +00008875PyTypeObject PyUnicode_Type = {
Martin v. Löwis68192102007-07-21 06:55:02 +00008876 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00008877 "unicode", /* tp_name */
8878 sizeof(PyUnicodeObject), /* tp_size */
8879 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008880 /* Slots */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008881 (destructor)unicode_dealloc, /* tp_dealloc */
8882 0, /* tp_print */
8883 0, /* tp_getattr */
8884 0, /* tp_setattr */
8885 0, /* tp_compare */
8886 unicode_repr, /* tp_repr */
8887 &unicode_as_number, /* tp_as_number */
8888 &unicode_as_sequence, /* tp_as_sequence */
8889 &unicode_as_mapping, /* tp_as_mapping */
8890 (hashfunc) unicode_hash, /* tp_hash*/
8891 0, /* tp_call*/
8892 (reprfunc) unicode_str, /* tp_str */
8893 PyObject_GenericGetAttr, /* tp_getattro */
8894 0, /* tp_setattro */
8895 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008896 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008897 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008898 unicode_doc, /* tp_doc */
8899 0, /* tp_traverse */
8900 0, /* tp_clear */
8901 PyUnicode_RichCompare, /* tp_richcompare */
8902 0, /* tp_weaklistoffset */
8903 0, /* tp_iter */
8904 0, /* tp_iternext */
8905 unicode_methods, /* tp_methods */
8906 0, /* tp_members */
8907 0, /* tp_getset */
8908 &PyBaseString_Type, /* tp_base */
8909 0, /* tp_dict */
8910 0, /* tp_descr_get */
8911 0, /* tp_descr_set */
8912 0, /* tp_dictoffset */
8913 0, /* tp_init */
8914 0, /* tp_alloc */
8915 unicode_new, /* tp_new */
8916 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008917};
8918
8919/* Initialize the Unicode implementation */
8920
Thomas Wouters78890102000-07-22 19:25:51 +00008921void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008922{
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008923 /* XXX - move this array to unicodectype.c ? */
8924 Py_UNICODE linebreak[] = {
8925 0x000A, /* LINE FEED */
8926 0x000D, /* CARRIAGE RETURN */
8927 0x001C, /* FILE SEPARATOR */
8928 0x001D, /* GROUP SEPARATOR */
8929 0x001E, /* RECORD SEPARATOR */
8930 0x0085, /* NEXT LINE */
8931 0x2028, /* LINE SEPARATOR */
8932 0x2029, /* PARAGRAPH SEPARATOR */
8933 };
8934
Fred Drakee4315f52000-05-09 19:53:39 +00008935 /* Init the implementation */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02008936 if (!unicode_empty) {
8937 unicode_empty = _PyUnicode_New(0);
8938 if (!unicode_empty)
8939 return;
8940 }
Neal Norwitze1fdb322006-07-21 05:32:28 +00008941
Guido van Rossumcacfc072002-05-24 19:01:59 +00008942 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008943 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008944
8945 /* initialize the linebreak bloom filter */
8946 bloom_linebreak = make_bloom_mask(
8947 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8948 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00008949
8950 PyType_Ready(&EncodingMapType);
Benjamin Peterson6da3ed62012-10-30 23:21:10 -04008951
8952 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
8953 Py_FatalError("Can't initialize field name iterator type");
8954
8955 if (PyType_Ready(&PyFormatterIter_Type) < 0)
8956 Py_FatalError("Can't initialize formatter iter type");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008957}
8958
8959/* Finalize the Unicode implementation */
8960
Christian Heimes3b718a72008-02-14 12:47:33 +00008961int
8962PyUnicode_ClearFreeList(void)
8963{
8964 int freelist_size = numfree;
8965 PyUnicodeObject *u;
8966
8967 for (u = free_list; u != NULL;) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008968 PyUnicodeObject *v = u;
8969 u = *(PyUnicodeObject **)u;
8970 if (v->str)
8971 PyObject_DEL(v->str);
8972 Py_XDECREF(v->defenc);
8973 PyObject_Del(v);
8974 numfree--;
Christian Heimes3b718a72008-02-14 12:47:33 +00008975 }
8976 free_list = NULL;
8977 assert(numfree == 0);
8978 return freelist_size;
8979}
8980
Guido van Rossumd57fd912000-03-10 22:53:23 +00008981void
Thomas Wouters78890102000-07-22 19:25:51 +00008982_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008983{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008984 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008985
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02008986 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008987
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02008988 for (i = 0; i < 256; i++)
8989 Py_CLEAR(unicode_latin1[i]);
8990
Christian Heimes3b718a72008-02-14 12:47:33 +00008991 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00008992}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008993
Anthony Baxterac6bd462006-04-13 02:06:09 +00008994#ifdef __cplusplus
8995}
8996#endif