blob: 9368a3a14f594155ceea99b6afa41cd1c5980159 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson1c5d21d2009-01-31 22:33:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Guido van Rossumd57fd912000-03-10 22:53:23 +000052/* Limit for the Unicode object free list */
53
Christian Heimes5b970ad2008-02-06 13:33:44 +000054#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000055
/* Limit for the Unicode object free list keep-alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list whose length (counted in Py_UNICODE
   units) is less than this limit.  This reduces malloc() overhead for
   small Unicode objects.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT * sizeof(Py_UNICODE) +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature!  If you get core dumps when
   using Unicode objects, turn this feature off.

*/
72
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
75/* Endianness switches; defaults to little endian */
76
77#ifdef WORDS_BIGENDIAN
78# define BYTEORDER_IS_BIG_ENDIAN
79#else
80# define BYTEORDER_IS_LITTLE_ENDIAN
81#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
Serhiy Storchakac59c85c2013-01-26 12:13:40 +020085NOTE: In the interpreter's initialization phase, some globals are currently
86 initialized dynamically as needed. In the process Unicode objects may
87 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000088
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Anthony Baxterac6bd462006-04-13 02:06:09 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Guido van Rossumd57fd912000-03-10 22:53:23 +000096/* Free list for Unicode objects */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +020097static PyUnicodeObject *free_list = NULL;
98static int numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000099
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000100/* The empty Unicode object is shared to improve performance. */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200101static PyUnicodeObject *unicode_empty = NULL;
102
103#define _Py_RETURN_UNICODE_EMPTY() \
104 do { \
105 if (unicode_empty != NULL) \
106 Py_INCREF(unicode_empty); \
107 else { \
108 unicode_empty = _PyUnicode_New(0); \
109 if (unicode_empty != NULL) \
110 Py_INCREF(unicode_empty); \
111 } \
112 return (PyObject *)unicode_empty; \
113 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000114
115/* Single character Unicode strings in the Latin-1 range are being
116 shared as well. */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200117static PyUnicodeObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000118
Fred Drakee4315f52000-05-09 19:53:39 +0000119/* Default encoding to use and assume when NULL is passed as encoding
120 parameter; it is initialized by _PyUnicode_Init().
121
122 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000123 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000124
125*/
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200126static char unicode_default_encoding[100 + 1] = "ascii";
Fred Drakee4315f52000-05-09 19:53:39 +0000127
Christian Heimes4d4f2702008-01-30 11:32:37 +0000128/* Fast detection of the most frequent whitespace characters */
/* Indexed by an ASCII code (0x00 - 0x7F); an entry is 1 when that
   character is whitespace and 0 otherwise.  Sixteen entries per row. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
158
159/* Same for linebreaks */
/* Indexed by an ASCII code (0x00 - 0x7F); an entry is 1 when that
   character is a line break and 0 otherwise.  The table is only ever
   read (see BLOOM_LINEBREAK), hence const.  Sixteen entries per row. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
186
187
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000188Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000189PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000190{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000191#ifdef Py_UNICODE_WIDE
Benjamin Peterson857ce152009-01-31 16:29:18 +0000192 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000193#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000194 /* This is actually an illegal character, so it should
195 not be passed to unichr. */
196 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000197#endif
198}
199
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000200/* --- Bloom Filters ----------------------------------------------------- */
201
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low-order
   bits of each Unicode character (ch & (BLOOM_WIDTH - 1)) as the bit
   index. */
205
206/* the linebreak mask is set up by Unicode_Init below */
207
Antoine Pitrou10042922010-01-13 14:01:26 +0000208#if LONG_BIT >= 128
209#define BLOOM_WIDTH 128
210#elif LONG_BIT >= 64
211#define BLOOM_WIDTH 64
212#elif LONG_BIT >= 32
213#define BLOOM_WIDTH 32
214#else
215#error "LONG_BIT is smaller than 32"
216#endif
217
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000218#define BLOOM_MASK unsigned long
219
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200220static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000221
Antoine Pitrou10042922010-01-13 14:01:26 +0000222#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
223#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000224
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000225#define BLOOM_LINEBREAK(ch) \
226 ((ch) < 128U ? ascii_linebreak[(ch)] : \
227 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000228
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000229Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000230{
231 /* calculate simple bloom-style bitmask for a given unicode string */
232
Antoine Pitrou10042922010-01-13 14:01:26 +0000233 BLOOM_MASK mask;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000234 Py_ssize_t i;
235
236 mask = 0;
237 for (i = 0; i < len; i++)
Antoine Pitrou64672132010-01-13 07:55:48 +0000238 BLOOM_ADD(mask, ptr[i]);
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000239
240 return mask;
241}
242
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000243Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000244{
245 Py_ssize_t i;
246
247 for (i = 0; i < setlen; i++)
248 if (set[i] == chr)
249 return 1;
250
Fredrik Lundh77633512006-05-23 19:47:35 +0000251 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000252}
253
/* Non-zero if 'chr' is one of the 'setlen' characters in 'set'.  The
   bloom 'mask' is tested first so that unicode_member() is only called
   for characters that pass the filter.  The whole expansion is
   parenthesized so the macro can be negated or combined with other
   operators safely.  Note: 'chr' is evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                            \
    (BLOOM((mask), (chr)) && unicode_member((chr), (set), (setlen)))
256
Guido van Rossumd57fd912000-03-10 22:53:23 +0000257/* --- Unicode Object ----------------------------------------------------- */
258
259static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000260int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000261 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262{
263 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000264
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000265 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 if (unicode->length == length)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000267 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000268
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000269 /* Resizing shared object (unicode_empty or single character
270 objects) in-place is not allowed. Use PyUnicode_Resize()
271 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000272
Benjamin Peterson857ce152009-01-31 16:29:18 +0000273 if (unicode == unicode_empty ||
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000274 (unicode->length == 1 &&
275 unicode->str[0] < 256U &&
276 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000278 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279 return -1;
280 }
281
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000282 /* We allocate one more byte to make sure the string is Ux0000 terminated.
283 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000284 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000285 it contains). */
286
Guido van Rossumd57fd912000-03-10 22:53:23 +0000287 oldstr = unicode->str;
Neal Norwitz419fd492008-03-17 20:22:43 +0000288 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000289 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000291 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000292 PyErr_NoMemory();
293 return -1;
294 }
295 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000296 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000298 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000300 if (unicode->defenc) {
Georg Brandld070cc52010-08-01 21:06:46 +0000301 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000302 }
303 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000304
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305 return 0;
306}
307
308/* We allocate one more byte to make sure the string is
Georg Brandle27d0442010-08-01 20:54:30 +0000309 Ux0000 terminated; some code relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000310
311 XXX This allocator could further be enhanced by assuring that the
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000312 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000313
314*/
315
316static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000317PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000318{
319 register PyUnicodeObject *unicode;
320
Andrew Dalkee0df7622006-05-27 11:04:36 +0000321 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000322 if (length == 0 && unicode_empty != NULL) {
323 Py_INCREF(unicode_empty);
324 return unicode_empty;
325 }
326
Neal Norwitze7d8be82008-07-31 17:17:14 +0000327 /* Ensure we won't overflow the size. */
328 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
329 return (PyUnicodeObject *)PyErr_NoMemory();
330 }
331
Guido van Rossumd57fd912000-03-10 22:53:23 +0000332 /* Unicode freelist & memory allocation */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000333 if (free_list) {
334 unicode = free_list;
335 free_list = *(PyUnicodeObject **)unicode;
336 numfree--;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000337 if (unicode->str) {
338 /* Keep-Alive optimization: we only upsize the buffer,
339 never downsize it. */
340 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000341 unicode_resize(unicode, length) < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000342 PyObject_DEL(unicode->str);
343 unicode->str = NULL;
344 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000345 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000346 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000347 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
348 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000349 }
350 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000351 }
352 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000353 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000354 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000355 if (unicode == NULL)
356 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000357 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
358 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000359 }
360
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000361 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000362 PyErr_NoMemory();
363 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000364 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000365 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000366 * the caller fails before initializing str -- unicode_resize()
367 * reads str[0], and the Keep-Alive optimization can keep memory
368 * allocated for str alive across a call to unicode_dealloc(unicode).
369 * We don't want unicode_resize to read uninitialized memory in
370 * that case.
371 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000372 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000373 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000374 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000375 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000376 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000377 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000378
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000379 onError:
Amaury Forgeot d'Arc06847b12008-07-31 23:39:05 +0000380 /* XXX UNREF/NEWREF interface should be more symmetrical */
381 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000382 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000383 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000384 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000385}
386
387static
Guido van Rossum9475a232001-10-05 20:51:39 +0000388void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000390 if (PyUnicode_CheckExact(unicode) &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000391 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000392 /* Keep-Alive optimization */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000393 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
394 PyObject_DEL(unicode->str);
395 unicode->str = NULL;
396 unicode->length = 0;
397 }
398 if (unicode->defenc) {
Georg Brandld070cc52010-08-01 21:06:46 +0000399 Py_CLEAR(unicode->defenc);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000400 }
401 /* Add to free list */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000402 *(PyUnicodeObject **)unicode = free_list;
403 free_list = unicode;
404 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000405 }
406 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000407 PyObject_DEL(unicode->str);
408 Py_XDECREF(unicode->defenc);
409 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000410 }
411}
412
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000413static
414int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000415{
416 register PyUnicodeObject *v;
417
418 /* Argument checks */
419 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000420 PyErr_BadInternalCall();
421 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000422 }
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000423 v = *unicode;
Christian Heimese93237d2007-12-19 02:37:44 +0000424 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000425 PyErr_BadInternalCall();
426 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000427 }
428
429 /* Resizing unicode_empty and single character objects is not
430 possible since these are being shared. We simply return a fresh
431 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000432 if (v->length != length &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000433 (v == unicode_empty || v->length == 1)) {
434 PyUnicodeObject *w = _PyUnicode_New(length);
435 if (w == NULL)
436 return -1;
437 Py_UNICODE_COPY(w->str, v->str,
438 length < v->length ? length : v->length);
439 Py_DECREF(*unicode);
440 *unicode = w;
441 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000442 }
443
444 /* Note that we don't have to modify *unicode for unshared Unicode
445 objects, since we can modify them in-place. */
446 return unicode_resize(v, length);
447}
448
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000449int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
450{
451 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
452}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000453
Guido van Rossumd57fd912000-03-10 22:53:23 +0000454PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000455 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000456{
457 PyUnicodeObject *unicode;
458
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000459 /* If the Unicode data is known at construction time, we can apply
460 some optimizations which share commonly used objects. */
461 if (u != NULL) {
462
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000463 /* Optimization for empty strings */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200464 if (size == 0)
465 _Py_RETURN_UNICODE_EMPTY();
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000466
467 /* Single character Unicode objects in the Latin-1 range are
468 shared when using this constructor */
469 if (size == 1 && *u < 256) {
470 unicode = unicode_latin1[*u];
471 if (!unicode) {
472 unicode = _PyUnicode_New(1);
473 if (!unicode)
474 return NULL;
475 unicode->str[0] = *u;
476 unicode_latin1[*u] = unicode;
477 }
478 Py_INCREF(unicode);
479 return (PyObject *)unicode;
480 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000481 }
Tim Petersced69f82003-09-16 20:30:58 +0000482
Guido van Rossumd57fd912000-03-10 22:53:23 +0000483 unicode = _PyUnicode_New(size);
484 if (!unicode)
485 return NULL;
486
487 /* Copy the Unicode data into the new object */
488 if (u != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000489 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000490
491 return (PyObject *)unicode;
492}
493
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000494PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
495{
496 PyUnicodeObject *unicode;
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000497
Benjamin Peterson857ce152009-01-31 16:29:18 +0000498 if (size < 0) {
499 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000500 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson857ce152009-01-31 16:29:18 +0000501 return NULL;
502 }
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000503
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000504 /* If the Unicode data is known at construction time, we can apply
505 some optimizations which share commonly used objects.
506 Also, this means the input must be UTF-8, so fall back to the
507 UTF-8 decoder at the end. */
508 if (u != NULL) {
509
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000510 /* Optimization for empty strings */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200511 if (size == 0)
512 _Py_RETURN_UNICODE_EMPTY();
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000513
514 /* Single characters are shared when using this constructor.
515 Restrict to ASCII, since the input must be UTF-8. */
516 if (size == 1 && Py_CHARMASK(*u) < 128) {
517 unicode = unicode_latin1[Py_CHARMASK(*u)];
518 if (!unicode) {
519 unicode = _PyUnicode_New(1);
520 if (!unicode)
521 return NULL;
522 unicode->str[0] = Py_CHARMASK(*u);
523 unicode_latin1[Py_CHARMASK(*u)] = unicode;
524 }
525 Py_INCREF(unicode);
526 return (PyObject *)unicode;
527 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000528
529 return PyUnicode_DecodeUTF8(u, size, NULL);
530 }
531
532 unicode = _PyUnicode_New(size);
533 if (!unicode)
534 return NULL;
535
536 return (PyObject *)unicode;
537}
538
539PyObject *PyUnicode_FromString(const char *u)
540{
541 size_t size = strlen(u);
542 if (size > PY_SSIZE_T_MAX) {
543 PyErr_SetString(PyExc_OverflowError, "input too long");
544 return NULL;
545 }
546
547 return PyUnicode_FromStringAndSize(u, size);
548}
549
/* _Py_UNICODE_NEXT is a private macro used to retrieve the character pointed
 * by 'ptr', possibly combining surrogate pairs on narrow builds.
 * 'ptr' and 'end' must be Py_UNICODE*, with 'ptr' pointing at the character
 * that should be returned and 'end' pointing to the end of the buffer.
 * ('end' is used on narrow builds to detect a lone surrogate at the
 * end of the buffer that should be returned unchanged.)
 * The ptr and end arguments should be side-effect free and ptr must be an
 * lvalue.
 * The type of the returned char is always Py_UCS4.
 *
 * On narrow builds a surrogate pair is only combined when both of its
 * halves lie before 'end'; the macro never reads at or beyond 'end' to
 * look for a low surrogate.
 *
 * Note: the macro advances ptr to next char, so it might have side-effects
 *       (especially if used with other macros).
 */

/* helper macros used by _Py_UNICODE_NEXT */
#define _Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define _Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value. */
#define _Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)

#ifdef Py_UNICODE_WIDE
#define _Py_UNICODE_NEXT(ptr, end) (*(ptr)++)
#else
#define _Py_UNICODE_NEXT(ptr, end)                                      \
     ((((end) - (ptr) >= 2) &&                                          \
       _Py_UNICODE_IS_HIGH_SURROGATE(*(ptr)) &&                         \
       _Py_UNICODE_IS_LOW_SURROGATE((ptr)[1])) ?                        \
      ((ptr) += 2, _Py_UNICODE_JOIN_SURROGATES((ptr)[-2], (ptr)[-1])) : \
      (Py_UCS4)*(ptr)++)
#endif
580
Guido van Rossumd57fd912000-03-10 22:53:23 +0000581#ifdef HAVE_WCHAR_H
582
Mark Dickinson6b265f12009-03-18 16:07:26 +0000583#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
584# define CONVERT_WCHAR_TO_SURROGATES
585#endif
586
587#ifdef CONVERT_WCHAR_TO_SURROGATES
588
589/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
590 to convert from UTF32 to UTF16. */
591
592PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
593 Py_ssize_t size)
594{
595 PyUnicodeObject *unicode;
596 register Py_ssize_t i;
597 Py_ssize_t alloc;
598 const wchar_t *orig_w;
599
600 if (w == NULL) {
601 PyErr_BadInternalCall();
602 return NULL;
603 }
604
605 alloc = size;
606 orig_w = w;
607 for (i = size; i > 0; i--) {
608 if (*w > 0xFFFF)
609 alloc++;
610 w++;
611 }
612 w = orig_w;
613 unicode = _PyUnicode_New(alloc);
614 if (!unicode)
615 return NULL;
616
617 /* Copy the wchar_t data into the new object */
618 {
619 register Py_UNICODE *u;
620 u = PyUnicode_AS_UNICODE(unicode);
621 for (i = size; i > 0; i--) {
622 if (*w > 0xFFFF) {
623 wchar_t ordinal = *w++;
624 ordinal -= 0x10000;
625 *u++ = 0xD800 | (ordinal >> 10);
626 *u++ = 0xDC00 | (ordinal & 0x3FF);
627 }
628 else
629 *u++ = *w++;
630 }
631 }
632 return (PyObject *)unicode;
633}
634
635#else
636
Guido van Rossumd57fd912000-03-10 22:53:23 +0000637PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000638 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000639{
640 PyUnicodeObject *unicode;
641
642 if (w == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000643 PyErr_BadInternalCall();
644 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000645 }
646
647 unicode = _PyUnicode_New(size);
648 if (!unicode)
649 return NULL;
650
651 /* Copy the wchar_t data into the new object */
652#ifdef HAVE_USABLE_WCHAR_T
653 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000654#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000655 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000656 register Py_UNICODE *u;
657 register Py_ssize_t i;
658 u = PyUnicode_AS_UNICODE(unicode);
659 for (i = size; i > 0; i--)
660 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000661 }
662#endif
663
664 return (PyObject *)unicode;
665}
666
Mark Dickinson6b265f12009-03-18 16:07:26 +0000667#endif /* CONVERT_WCHAR_TO_SURROGATES */
668
669#undef CONVERT_WCHAR_TO_SURROGATES
670
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000671static void
672makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
673{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000674 *fmt++ = '%';
675 if (width) {
676 if (zeropad)
677 *fmt++ = '0';
678 fmt += sprintf(fmt, "%d", width);
679 }
680 if (precision)
681 fmt += sprintf(fmt, ".%d", precision);
682 if (longflag)
683 *fmt++ = 'l';
684 else if (size_tflag) {
685 char *f = PY_FORMAT_SIZE_T;
686 while (*f)
687 *fmt++ = *f++;
688 }
689 *fmt++ = c;
690 *fmt = '\0';
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000691}
692
/* Append the null-terminated char string 'string' to the output,
   widening each byte through unsigned char.  The macro relies on two
   variables of the enclosing function: 'copy' (the read cursor, left
   pointing at the terminator) and 's' (the write cursor, advanced past
   the appended characters). */
#define appendstring(string)                            \
    do {                                                \
        copy = (string);                                \
        while (*copy) {                                 \
            *s++ = (unsigned char)*copy;                \
            copy++;                                     \
        }                                               \
    } while (0)
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000699
700PyObject *
701PyUnicode_FromFormatV(const char *format, va_list vargs)
702{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000703 va_list count;
704 Py_ssize_t callcount = 0;
705 PyObject **callresults = NULL;
706 PyObject **callresult = NULL;
707 Py_ssize_t n = 0;
708 int width = 0;
709 int precision = 0;
710 int zeropad;
711 const char* f;
712 Py_UNICODE *s;
713 PyObject *string;
714 /* used by sprintf */
715 char buffer[21];
716 /* use abuffer instead of buffer, if we need more space
717 * (which can happen if there's a format specifier with width). */
718 char *abuffer = NULL;
719 char *realbuffer;
720 Py_ssize_t abuffersize = 0;
721 char fmt[60]; /* should be enough for %0width.precisionld */
722 const char *copy;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000723
724#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson857ce152009-01-31 16:29:18 +0000725 Py_MEMCPY(count, vargs, sizeof(va_list));
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000726#else
727#ifdef __va_copy
Benjamin Peterson857ce152009-01-31 16:29:18 +0000728 __va_copy(count, vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000729#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000730 count = vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000731#endif
732#endif
Walter Dörwalded960ac2009-05-03 22:36:33 +0000733 /* step 1: count the number of %S/%R/%s format specifications
734 * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
735 * objects once during step 3 and put the result in an array) */
Benjamin Peterson857ce152009-01-31 16:29:18 +0000736 for (f = format; *f; f++) {
Walter Dörwalded960ac2009-05-03 22:36:33 +0000737 if (*f == '%') {
Serhiy Storchaka0e0282e2015-01-27 22:17:56 +0200738 f++;
739 while (*f && *f != '%' && !isalpha((unsigned)*f))
740 f++;
Serhiy Storchaka227526d2015-01-31 01:15:29 +0200741 if (!*f)
742 break;
Serhiy Storchaka0e0282e2015-01-27 22:17:56 +0200743 if (*f == 's' || *f=='S' || *f=='R')
Walter Dörwalded960ac2009-05-03 22:36:33 +0000744 ++callcount;
745 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000746 }
747 /* step 2: allocate memory for the results of
Walter Dörwalded960ac2009-05-03 22:36:33 +0000748 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson857ce152009-01-31 16:29:18 +0000749 if (callcount) {
750 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
751 if (!callresults) {
752 PyErr_NoMemory();
753 return NULL;
754 }
755 callresult = callresults;
756 }
757 /* step 3: figure out how large a buffer we need */
758 for (f = format; *f; f++) {
759 if (*f == '%') {
Serhiy Storchaka0e0282e2015-01-27 22:17:56 +0200760 const char* p = f++;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000761 width = 0;
762 while (isdigit((unsigned)*f))
763 width = (width*10) + *f++ - '0';
Serhiy Storchaka0e0282e2015-01-27 22:17:56 +0200764 precision = 0;
765 if (*f == '.') {
766 f++;
767 while (isdigit((unsigned)*f))
768 precision = (precision*10) + *f++ - '0';
769 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000770
Benjamin Peterson857ce152009-01-31 16:29:18 +0000771 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
772 * they don't affect the amount of space we reserve.
773 */
774 if ((*f == 'l' || *f == 'z') &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000775 (f[1] == 'd' || f[1] == 'u'))
776 ++f;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000777
Benjamin Peterson857ce152009-01-31 16:29:18 +0000778 switch (*f) {
779 case 'c':
Serhiy Storchakaba908c72013-06-23 20:22:09 +0300780 {
781 int ordinal = va_arg(count, int);
782#ifdef Py_UNICODE_WIDE
783 if (ordinal < 0 || ordinal > 0x10ffff) {
784 PyErr_SetString(PyExc_OverflowError,
785 "%c arg not in range(0x110000) "
786 "(wide Python build)");
787 goto fail;
788 }
789#else
790 if (ordinal < 0 || ordinal > 0xffff) {
791 PyErr_SetString(PyExc_OverflowError,
792 "%c arg not in range(0x10000) "
793 "(narrow Python build)");
794 goto fail;
795 }
796#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +0000797 /* fall through... */
Serhiy Storchakaba908c72013-06-23 20:22:09 +0300798 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000799 case '%':
800 n++;
801 break;
802 case 'd': case 'u': case 'i': case 'x':
803 (void) va_arg(count, int);
Serhiy Storchaka0e0282e2015-01-27 22:17:56 +0200804 if (width < precision)
805 width = precision;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000806 /* 20 bytes is enough to hold a 64-bit
807 integer. Decimal takes the most space.
808 This isn't enough for octal.
809 If a width is specified we need more
810 (which we allocate later). */
811 if (width < 20)
812 width = 20;
813 n += width;
814 if (abuffersize < width)
815 abuffersize = width;
816 break;
817 case 's':
818 {
819 /* UTF-8 */
Georg Brandlba68a992009-05-05 09:19:43 +0000820 const char *s = va_arg(count, const char*);
Walter Dörwalded960ac2009-05-03 22:36:33 +0000821 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
822 if (!str)
823 goto fail;
824 n += PyUnicode_GET_SIZE(str);
825 /* Remember the str and switch to the next slot */
826 *callresult++ = str;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000827 break;
828 }
829 case 'U':
830 {
831 PyObject *obj = va_arg(count, PyObject *);
832 assert(obj && PyUnicode_Check(obj));
833 n += PyUnicode_GET_SIZE(obj);
834 break;
835 }
836 case 'V':
837 {
838 PyObject *obj = va_arg(count, PyObject *);
839 const char *str = va_arg(count, const char *);
840 assert(obj || str);
841 assert(!obj || PyUnicode_Check(obj));
842 if (obj)
843 n += PyUnicode_GET_SIZE(obj);
844 else
845 n += strlen(str);
846 break;
847 }
848 case 'S':
849 {
850 PyObject *obj = va_arg(count, PyObject *);
851 PyObject *str;
852 assert(obj);
853 str = PyObject_Str(obj);
854 if (!str)
855 goto fail;
Victor Stinner2af8d2f2014-07-30 00:39:05 +0200856 n += PyString_GET_SIZE(str);
Benjamin Peterson857ce152009-01-31 16:29:18 +0000857 /* Remember the str and switch to the next slot */
858 *callresult++ = str;
859 break;
860 }
861 case 'R':
862 {
863 PyObject *obj = va_arg(count, PyObject *);
864 PyObject *repr;
865 assert(obj);
866 repr = PyObject_Repr(obj);
867 if (!repr)
868 goto fail;
869 n += PyUnicode_GET_SIZE(repr);
870 /* Remember the repr and switch to the next slot */
871 *callresult++ = repr;
872 break;
873 }
874 case 'p':
875 (void) va_arg(count, int);
876 /* maximum 64-bit pointer representation:
877 * 0xffffffffffffffff
878 * so 19 characters is enough.
879 * XXX I count 18 -- what's the extra for?
880 */
881 n += 19;
882 break;
883 default:
884 /* if we stumble upon an unknown
885 formatting code, copy the rest of
886 the format string to the output
887 string. (we cannot just skip the
888 code, since there's no way to know
889 what's in the argument list) */
890 n += strlen(p);
891 goto expand;
892 }
893 } else
894 n++;
895 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000896 expand:
Benjamin Peterson857ce152009-01-31 16:29:18 +0000897 if (abuffersize > 20) {
Serhiy Storchaka5ec0bbf2015-01-30 23:35:03 +0200898 /* add 1 for sprintf's trailing null byte */
899 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson857ce152009-01-31 16:29:18 +0000900 if (!abuffer) {
901 PyErr_NoMemory();
902 goto fail;
903 }
904 realbuffer = abuffer;
905 }
906 else
907 realbuffer = buffer;
908 /* step 4: fill the buffer */
909 /* Since we've analyzed how much space we need for the worst case,
910 we don't have to resize the string.
911 There can be no errors beyond this point. */
912 string = PyUnicode_FromUnicode(NULL, n);
913 if (!string)
914 goto fail;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000915
Benjamin Peterson857ce152009-01-31 16:29:18 +0000916 s = PyUnicode_AS_UNICODE(string);
917 callresult = callresults;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000918
Benjamin Peterson857ce152009-01-31 16:29:18 +0000919 for (f = format; *f; f++) {
920 if (*f == '%') {
921 const char* p = f++;
922 int longflag = 0;
923 int size_tflag = 0;
924 zeropad = (*f == '0');
925 /* parse the width.precision part */
926 width = 0;
927 while (isdigit((unsigned)*f))
928 width = (width*10) + *f++ - '0';
929 precision = 0;
930 if (*f == '.') {
931 f++;
932 while (isdigit((unsigned)*f))
933 precision = (precision*10) + *f++ - '0';
934 }
935 /* handle the long flag, but only for %ld and %lu.
936 others can be added when necessary. */
937 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
938 longflag = 1;
939 ++f;
940 }
941 /* handle the size_t flag. */
942 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
943 size_tflag = 1;
944 ++f;
945 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000946
Benjamin Peterson857ce152009-01-31 16:29:18 +0000947 switch (*f) {
948 case 'c':
949 *s++ = va_arg(vargs, int);
950 break;
951 case 'd':
952 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
953 if (longflag)
954 sprintf(realbuffer, fmt, va_arg(vargs, long));
955 else if (size_tflag)
956 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
957 else
958 sprintf(realbuffer, fmt, va_arg(vargs, int));
959 appendstring(realbuffer);
960 break;
961 case 'u':
962 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
963 if (longflag)
964 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
965 else if (size_tflag)
966 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
967 else
968 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
969 appendstring(realbuffer);
970 break;
971 case 'i':
972 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
973 sprintf(realbuffer, fmt, va_arg(vargs, int));
974 appendstring(realbuffer);
975 break;
976 case 'x':
977 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
978 sprintf(realbuffer, fmt, va_arg(vargs, int));
979 appendstring(realbuffer);
980 break;
981 case 's':
982 {
Walter Dörwalded960ac2009-05-03 22:36:33 +0000983 /* unused, since we already have the result */
984 (void) va_arg(vargs, char *);
985 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
986 PyUnicode_GET_SIZE(*callresult));
987 s += PyUnicode_GET_SIZE(*callresult);
988 /* We're done with the unicode()/repr() => forget it */
989 Py_DECREF(*callresult);
990 /* switch to next unicode()/repr() result */
991 ++callresult;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000992 break;
993 }
994 case 'U':
995 {
996 PyObject *obj = va_arg(vargs, PyObject *);
997 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
998 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
999 s += size;
1000 break;
1001 }
1002 case 'V':
1003 {
1004 PyObject *obj = va_arg(vargs, PyObject *);
1005 const char *str = va_arg(vargs, const char *);
1006 if (obj) {
1007 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1008 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1009 s += size;
1010 } else {
1011 appendstring(str);
1012 }
1013 break;
1014 }
1015 case 'S':
1016 case 'R':
1017 {
Victor Stinner2af8d2f2014-07-30 00:39:05 +02001018 const char *str = PyString_AS_STRING(*callresult);
Benjamin Peterson857ce152009-01-31 16:29:18 +00001019 /* unused, since we already have the result */
1020 (void) va_arg(vargs, PyObject *);
Victor Stinner2af8d2f2014-07-30 00:39:05 +02001021 appendstring(str);
Benjamin Peterson857ce152009-01-31 16:29:18 +00001022 /* We're done with the unicode()/repr() => forget it */
1023 Py_DECREF(*callresult);
1024 /* switch to next unicode()/repr() result */
1025 ++callresult;
1026 break;
1027 }
1028 case 'p':
1029 sprintf(buffer, "%p", va_arg(vargs, void*));
1030 /* %p is ill-defined: ensure leading 0x. */
1031 if (buffer[1] == 'X')
1032 buffer[1] = 'x';
1033 else if (buffer[1] != 'x') {
1034 memmove(buffer+2, buffer, strlen(buffer)+1);
1035 buffer[0] = '0';
1036 buffer[1] = 'x';
1037 }
1038 appendstring(buffer);
1039 break;
1040 case '%':
1041 *s++ = '%';
1042 break;
1043 default:
1044 appendstring(p);
1045 goto end;
1046 }
1047 } else
1048 *s++ = *f;
1049 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001050
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001051 end:
Benjamin Peterson857ce152009-01-31 16:29:18 +00001052 if (callresults)
1053 PyObject_Free(callresults);
1054 if (abuffer)
1055 PyObject_Free(abuffer);
1056 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1057 return string;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001058 fail:
Benjamin Peterson857ce152009-01-31 16:29:18 +00001059 if (callresults) {
1060 PyObject **callresult2 = callresults;
1061 while (callresult2 < callresult) {
1062 Py_DECREF(*callresult2);
1063 ++callresult2;
1064 }
1065 PyObject_Free(callresults);
1066 }
1067 if (abuffer)
1068 PyObject_Free(abuffer);
1069 return NULL;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001070}
1071
1072#undef appendstring
1073
1074PyObject *
1075PyUnicode_FromFormat(const char *format, ...)
1076{
Benjamin Peterson857ce152009-01-31 16:29:18 +00001077 PyObject* ret;
1078 va_list vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001079
1080#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson857ce152009-01-31 16:29:18 +00001081 va_start(vargs, format);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001082#else
Benjamin Peterson857ce152009-01-31 16:29:18 +00001083 va_start(vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001084#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00001085 ret = PyUnicode_FromFormatV(format, vargs);
1086 va_end(vargs);
1087 return ret;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001088}
1089
Martin v. Löwis18e16552006-02-15 17:27:45 +00001090Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001091 wchar_t *w,
1092 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001093{
1094 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001095 PyErr_BadInternalCall();
1096 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001097 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001098
1099 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001100 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001101 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001102
Guido van Rossumd57fd912000-03-10 22:53:23 +00001103#ifdef HAVE_USABLE_WCHAR_T
1104 memcpy(w, unicode->str, size * sizeof(wchar_t));
1105#else
1106 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001107 register Py_UNICODE *u;
1108 register Py_ssize_t i;
1109 u = PyUnicode_AS_UNICODE(unicode);
1110 for (i = size; i > 0; i--)
1111 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001112 }
1113#endif
1114
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001115 if (size > PyUnicode_GET_SIZE(unicode))
1116 return PyUnicode_GET_SIZE(unicode);
1117 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001118 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001119}
1120
1121#endif
1122
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001123PyObject *PyUnicode_FromOrdinal(int ordinal)
1124{
Hye-Shik Chang40574832004-04-06 07:24:51 +00001125 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001126
1127#ifdef Py_UNICODE_WIDE
1128 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001129 PyErr_SetString(PyExc_ValueError,
1130 "unichr() arg not in range(0x110000) "
1131 "(wide Python build)");
1132 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001133 }
1134#else
1135 if (ordinal < 0 || ordinal > 0xffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001136 PyErr_SetString(PyExc_ValueError,
1137 "unichr() arg not in range(0x10000) "
1138 "(narrow Python build)");
1139 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001140 }
1141#endif
1142
Hye-Shik Chang40574832004-04-06 07:24:51 +00001143 s[0] = (Py_UNICODE)ordinal;
1144 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001145}
1146
Guido van Rossumd57fd912000-03-10 22:53:23 +00001147PyObject *PyUnicode_FromObject(register PyObject *obj)
1148{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001149 /* XXX Perhaps we should make this API an alias of
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001150 PyObject_Unicode() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001151 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001152 Py_INCREF(obj);
1153 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001154 }
1155 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001156 /* For a Unicode subtype that's not a Unicode object,
1157 return a true Unicode object with the same data. */
1158 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1159 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001160 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001161 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1162}
1163
1164PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001165 const char *encoding,
1166 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001167{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001168 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001169 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001170 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001171
Guido van Rossumd57fd912000-03-10 22:53:23 +00001172 if (obj == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001173 PyErr_BadInternalCall();
1174 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001175 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001176
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001177#if 0
1178 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001179 that no encodings is given and then redirect to
1180 PyObject_Unicode() which then applies the additional logic for
1181 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001182
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001183 NOTE: This API should really only be used for object which
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001184 represent *encoded* Unicode !
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001185
1186 */
Benjamin Peterson857ce152009-01-31 16:29:18 +00001187 if (PyUnicode_Check(obj)) {
1188 if (encoding) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001189 PyErr_SetString(PyExc_TypeError,
1190 "decoding Unicode is not supported");
1191 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001192 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001193 return PyObject_Unicode(obj);
1194 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001195#else
1196 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001197 PyErr_SetString(PyExc_TypeError,
1198 "decoding Unicode is not supported");
1199 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001200 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001201#endif
1202
1203 /* Coerce object */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001204 if (PyString_Check(obj)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001205 s = PyString_AS_STRING(obj);
1206 len = PyString_GET_SIZE(obj);
Christian Heimes1a6387e2008-03-26 12:49:49 +00001207 }
Christian Heimes3497f942008-05-26 12:29:14 +00001208 else if (PyByteArray_Check(obj)) {
Christian Heimes1a6387e2008-03-26 12:49:49 +00001209 /* Python 2.x specific */
1210 PyErr_Format(PyExc_TypeError,
1211 "decoding bytearray is not supported");
1212 return NULL;
1213 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001214 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001215 /* Overwrite the error message with something more useful in
1216 case of a TypeError. */
1217 if (PyErr_ExceptionMatches(PyExc_TypeError))
1218 PyErr_Format(PyExc_TypeError,
1219 "coercing to Unicode: need string or buffer, "
1220 "%.80s found",
1221 Py_TYPE(obj)->tp_name);
1222 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001223 }
Tim Petersced69f82003-09-16 20:30:58 +00001224
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001225 /* Convert to Unicode */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02001226 if (len == 0)
1227 _Py_RETURN_UNICODE_EMPTY();
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001228
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02001229 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001230 return v;
1231
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001232 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001233 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001234}
1235
1236PyObject *PyUnicode_Decode(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001237 Py_ssize_t size,
1238 const char *encoding,
1239 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001240{
1241 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001242
1243 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001244 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001245
1246 /* Shortcuts for common default encodings */
1247 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001248 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001249 else if (strcmp(encoding, "latin-1") == 0)
1250 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001251#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1252 else if (strcmp(encoding, "mbcs") == 0)
1253 return PyUnicode_DecodeMBCS(s, size, errors);
1254#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001255 else if (strcmp(encoding, "ascii") == 0)
1256 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001257
1258 /* Decode via the codec registry */
1259 buffer = PyBuffer_FromMemory((void *)s, size);
1260 if (buffer == NULL)
1261 goto onError;
Serhiy Storchakac7797dc2015-05-31 20:21:00 +03001262 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001263 if (unicode == NULL)
1264 goto onError;
1265 if (!PyUnicode_Check(unicode)) {
1266 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001267 "decoder did not return an unicode object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001268 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001269 Py_DECREF(unicode);
1270 goto onError;
1271 }
1272 Py_DECREF(buffer);
1273 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001274
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001275 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001276 Py_XDECREF(buffer);
1277 return NULL;
1278}
1279
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001280PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1281 const char *encoding,
1282 const char *errors)
1283{
1284 PyObject *v;
1285
1286 if (!PyUnicode_Check(unicode)) {
1287 PyErr_BadArgument();
1288 goto onError;
1289 }
1290
1291 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001292 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001293
1294 /* Decode via the codec registry */
Serhiy Storchakac7797dc2015-05-31 20:21:00 +03001295 v = _PyCodec_DecodeText(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001296 if (v == NULL)
1297 goto onError;
1298 return v;
1299
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001300 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001301 return NULL;
1302}
1303
Guido van Rossumd57fd912000-03-10 22:53:23 +00001304PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001305 Py_ssize_t size,
1306 const char *encoding,
1307 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001308{
1309 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001310
Guido van Rossumd57fd912000-03-10 22:53:23 +00001311 unicode = PyUnicode_FromUnicode(s, size);
1312 if (unicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001313 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001314 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1315 Py_DECREF(unicode);
1316 return v;
1317}
1318
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001319PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1320 const char *encoding,
1321 const char *errors)
1322{
1323 PyObject *v;
1324
1325 if (!PyUnicode_Check(unicode)) {
1326 PyErr_BadArgument();
1327 goto onError;
1328 }
1329
1330 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001331 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001332
1333 /* Encode via the codec registry */
Serhiy Storchakac7797dc2015-05-31 20:21:00 +03001334 v = _PyCodec_EncodeText(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001335 if (v == NULL)
1336 goto onError;
1337 return v;
1338
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001339 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001340 return NULL;
1341}
1342
Guido van Rossumd57fd912000-03-10 22:53:23 +00001343PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1344 const char *encoding,
1345 const char *errors)
1346{
1347 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001348
Guido van Rossumd57fd912000-03-10 22:53:23 +00001349 if (!PyUnicode_Check(unicode)) {
1350 PyErr_BadArgument();
1351 goto onError;
1352 }
Fred Drakee4315f52000-05-09 19:53:39 +00001353
Tim Petersced69f82003-09-16 20:30:58 +00001354 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001355 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001356
1357 /* Shortcuts for common default encodings */
1358 if (errors == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001359 if (strcmp(encoding, "utf-8") == 0)
1360 return PyUnicode_AsUTF8String(unicode);
1361 else if (strcmp(encoding, "latin-1") == 0)
1362 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001363#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001364 else if (strcmp(encoding, "mbcs") == 0)
1365 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001366#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001367 else if (strcmp(encoding, "ascii") == 0)
1368 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001369 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001370
1371 /* Encode via the codec registry */
Serhiy Storchakac7797dc2015-05-31 20:21:00 +03001372 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001373 if (v == NULL)
1374 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001375 if (!PyString_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001376 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001377 "encoder did not return a string object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001378 Py_TYPE(v)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001379 Py_DECREF(v);
1380 goto onError;
1381 }
1382 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001383
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001384 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001385 return NULL;
1386}
1387
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001388PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001389 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001390{
1391 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1392
1393 if (v)
1394 return v;
1395 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1396 if (v && errors == NULL)
1397 ((PyUnicodeObject *)unicode)->defenc = v;
1398 return v;
1399}
1400
Guido van Rossumd57fd912000-03-10 22:53:23 +00001401Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1402{
1403 if (!PyUnicode_Check(unicode)) {
1404 PyErr_BadArgument();
1405 goto onError;
1406 }
1407 return PyUnicode_AS_UNICODE(unicode);
1408
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001409 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001410 return NULL;
1411}
1412
Martin v. Löwis18e16552006-02-15 17:27:45 +00001413Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001414{
1415 if (!PyUnicode_Check(unicode)) {
1416 PyErr_BadArgument();
1417 goto onError;
1418 }
1419 return PyUnicode_GET_SIZE(unicode);
1420
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001421 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001422 return -1;
1423}
1424
Thomas Wouters78890102000-07-22 19:25:51 +00001425const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001426{
1427 return unicode_default_encoding;
1428}
1429
1430int PyUnicode_SetDefaultEncoding(const char *encoding)
1431{
1432 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001433
Fred Drakee4315f52000-05-09 19:53:39 +00001434 /* Make sure the encoding is valid. As side effect, this also
1435 loads the encoding into the codec registry cache. */
1436 v = _PyCodec_Lookup(encoding);
1437 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001438 goto onError;
Fred Drakee4315f52000-05-09 19:53:39 +00001439 Py_DECREF(v);
1440 strncpy(unicode_default_encoding,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001441 encoding,
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02001442 sizeof(unicode_default_encoding) - 1);
Fred Drakee4315f52000-05-09 19:53:39 +00001443 return 0;
1444
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001445 onError:
Fred Drakee4315f52000-05-09 19:53:39 +00001446 return -1;
1447}
1448
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001449/* error handling callback helper:
1450 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001451 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001452 and adjust various state variables.
1453 return 0 on success, -1 on error
1454*/
1455
1456static
1457int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001458 const char *encoding, const char *reason,
1459 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1460 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1461 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001462{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001463 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001464
1465 PyObject *restuple = NULL;
1466 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001467 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1468 Py_ssize_t requiredsize;
1469 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001470 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001471 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001472 int res = -1;
1473
1474 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001475 *errorHandler = PyCodec_LookupError(errors);
1476 if (*errorHandler == NULL)
1477 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001478 }
1479
1480 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001481 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001482 encoding, input, insize, *startinpos, *endinpos, reason);
1483 if (*exceptionObject == NULL)
1484 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001485 }
1486 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001487 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1488 goto onError;
1489 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1490 goto onError;
1491 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1492 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001493 }
1494
1495 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1496 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001497 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001498 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00001499 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001500 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001501 }
1502 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001503 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001504 if (newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001505 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001506 if (newpos<0 || newpos>insize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001507 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1508 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001509 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001510
1511 /* need more space? (at least enough for what we
1512 have+the replacement+the rest of the string (starting
1513 at the new input position), so we won't have to check space
1514 when there are no errors in the rest of the string) */
1515 repptr = PyUnicode_AS_UNICODE(repunicode);
1516 repsize = PyUnicode_GET_SIZE(repunicode);
Benjamin Petersoned4c1302014-09-29 18:18:57 -04001517 requiredsize = *outpos;
1518 if (requiredsize > PY_SSIZE_T_MAX - repsize)
1519 goto overflow;
1520 requiredsize += repsize;
1521 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
1522 goto overflow;
1523 requiredsize += insize - newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001524 if (requiredsize > outsize) {
Benjamin Petersoned4c1302014-09-29 18:18:57 -04001525 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001526 requiredsize = 2*outsize;
1527 if (_PyUnicode_Resize(output, requiredsize) < 0)
1528 goto onError;
1529 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001530 }
1531 *endinpos = newpos;
1532 *inptr = input + newpos;
1533 Py_UNICODE_COPY(*outptr, repptr, repsize);
1534 *outptr += repsize;
1535 *outpos += repsize;
1536 /* we made it! */
1537 res = 0;
1538
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001539 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001540 Py_XDECREF(restuple);
1541 return res;
Benjamin Petersoned4c1302014-09-29 18:18:57 -04001542
1543 overflow:
1544 PyErr_SetString(PyExc_OverflowError,
1545 "decoded result is too long for a Python string");
1546 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001547}
1548
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001549/* --- UTF-7 Codec -------------------------------------------------------- */
1550
Antoine Pitrou653dece2009-05-04 18:32:32 +00001551/* See RFC2152 for details. We encode conservatively and decode liberally. */
1552
1553/* Three simple macros defining base-64. */
1554
1555/* Is c a base-64 character? */
1556
#define IS_BASE64(c)                    \
    ((c) == '+' || (c) == '/' ||        \
     ((c) >= '0' && (c) <= '9') ||      \
     ((c) >= 'a' && (c) <= 'z') ||      \
     ((c) >= 'A' && (c) <= 'Z'))
Antoine Pitrou653dece2009-05-04 18:32:32 +00001562
1563/* given that c is a base-64 character, what is its base-64 value? */
1564
#define FROM_BASE64(c)                                  \
    ((c) == '+' ? 62 :                                  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     63)
1570
1571/* What is the base-64 character of the bottom 6 bits of n? */
1572
#define TO_BASE64(n)                            \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"               \
     "abcdefghijklmnopqrstuvwxyz"               \
     "0123456789+/"[(n) & 63])
1575
/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */
1580
#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) < 128)
1583
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001597
/* One entry per ASCII code (0..127):
   0 = Set D, 1 = Set O, 2 = whitespace, 3 = must be base64 encoded. */
static char utf7_category[128] = {
    /* 0x00-0x0F: control codes; ht (9), nl (10) and cr (13) are whitespace */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    /* 0x10-0x1F: control codes */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0x20-0x2F:  sp !  "  #  $  %  &  '  (  )  *  +  ,  -  .  /  */
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0,
    /* 0x30-0x3F:  0  1  2  3  4  5  6  7  8  9  :  ;  <  =  >  ?  */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    /* 0x40-0x4F:  @  A  B  C  D  E  F  G  H  I  J  K  L  M  N  O  */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F:  P  Q  R  S  T  U  V  W  X  Y  Z  [  \  ]  ^  _  */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    /* 0x60-0x6F:  `  a  b  c  d  e  f  g  h  i  j  k  l  m  n  o  */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F:  p  q  r  s  t  u  v  w  x  y  z  {  |  }  ~ del */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3,
};
1617
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible. */
Marc-André Lemburge115ec82005-10-19 22:33:31 +00001623
/* Non-zero when character c may be emitted as itself by the UTF-7 encoder.
   directO / directWS select whether category-1 and category-2 characters
   (as classified by utf7_category) are written directly.  Both flags are
   parenthesized so that a compound expression passed as an argument
   (e.g. `a || b`) is evaluated as a unit.  The range test on c comes
   first so that utf7_category is only indexed with 1..127. */
#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001629
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001630PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001631 Py_ssize_t size,
1632 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001633{
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001634 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1635}
1636
/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate). */
1643
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001644PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001645 Py_ssize_t size,
1646 const char *errors,
1647 Py_ssize_t *consumed)
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001648{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001649 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001650 Py_ssize_t startinpos;
1651 Py_ssize_t endinpos;
1652 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001653 const char *e;
1654 PyUnicodeObject *unicode;
1655 Py_UNICODE *p;
1656 const char *errmsg = "";
1657 int inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001658 Py_UNICODE *shiftOutStart;
1659 unsigned int base64bits = 0;
1660 unsigned long base64buffer = 0;
1661 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001662 PyObject *errorHandler = NULL;
1663 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001664
1665 unicode = _PyUnicode_New(size);
1666 if (!unicode)
1667 return NULL;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001668 if (size == 0) {
1669 if (consumed)
1670 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001671 return (PyObject *)unicode;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001672 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001673
1674 p = unicode->str;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001675 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001676 e = s + size;
1677
1678 while (s < e) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001679 Py_UNICODE ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001680
Antoine Pitrou653dece2009-05-04 18:32:32 +00001681 if (inShift) { /* in a base-64 section */
1682 if (IS_BASE64(ch)) { /* consume a base-64 character */
1683 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1684 base64bits += 6;
1685 s++;
1686 if (base64bits >= 16) {
1687 /* we have enough bits for a UTF-16 value */
1688 Py_UNICODE outCh = (Py_UNICODE)
1689 (base64buffer >> (base64bits-16));
1690 base64bits -= 16;
1691 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchakaf1056722013-10-19 20:37:49 +03001692 assert(outCh <= 0xffff);
Antoine Pitrou653dece2009-05-04 18:32:32 +00001693 if (surrogate) {
1694 /* expecting a second surrogate */
1695 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1696#ifdef Py_UNICODE_WIDE
1697 *p++ = (((surrogate & 0x3FF)<<10)
1698 | (outCh & 0x3FF)) + 0x10000;
1699#else
1700 *p++ = surrogate;
1701 *p++ = outCh;
1702#endif
1703 surrogate = 0;
Antoine Pitrou30402542011-11-15 01:49:40 +01001704 continue;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001705 }
1706 else {
Antoine Pitrou30402542011-11-15 01:49:40 +01001707 *p++ = surrogate;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001708 surrogate = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001709 }
1710 }
Antoine Pitrou30402542011-11-15 01:49:40 +01001711 if (outCh >= 0xD800 && outCh <= 0xDBFF) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001712 /* first surrogate */
1713 surrogate = outCh;
1714 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001715 else {
1716 *p++ = outCh;
1717 }
1718 }
1719 }
1720 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001721 inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001722 if (base64bits > 0) { /* left-over bits */
1723 if (base64bits >= 6) {
1724 /* We've seen at least one base-64 character */
Serhiy Storchakae12f6322015-10-02 13:14:53 +03001725 s++;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001726 errmsg = "partial character in shift sequence";
1727 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001728 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001729 else {
1730 /* Some bits remain; they should be zero */
1731 if (base64buffer != 0) {
Serhiy Storchakae12f6322015-10-02 13:14:53 +03001732 s++;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001733 errmsg = "non-zero padding bits in shift sequence";
1734 goto utf7Error;
1735 }
1736 }
1737 }
Serhiy Storchakae12f6322015-10-02 13:14:53 +03001738 if (surrogate && DECODE_DIRECT(ch))
1739 *p++ = surrogate;
1740 surrogate = 0;
1741 if (ch == '-') {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001742 /* '-' is absorbed; other terminating
1743 characters are preserved */
Serhiy Storchakae12f6322015-10-02 13:14:53 +03001744 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001745 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001746 }
1747 }
1748 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001749 startinpos = s-starts;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001750 s++; /* consume '+' */
1751 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001752 s++;
1753 *p++ = '+';
Antoine Pitrou653dece2009-05-04 18:32:32 +00001754 }
1755 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001756 inShift = 1;
Serhiy Storchakae12f6322015-10-02 13:14:53 +03001757 surrogate = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001758 shiftOutStart = p;
1759 base64bits = 0;
Serhiy Storchakaf1056722013-10-19 20:37:49 +03001760 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001761 }
1762 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001763 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001764 *p++ = ch;
1765 s++;
1766 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001767 else {
1768 startinpos = s-starts;
1769 s++;
1770 errmsg = "unexpected special character";
1771 goto utf7Error;
1772 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001773 continue;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001774utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001775 outpos = p-PyUnicode_AS_UNICODE(unicode);
1776 endinpos = s-starts;
1777 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001778 errors, &errorHandler,
1779 "utf7", errmsg,
1780 starts, size, &startinpos, &endinpos, &exc, &s,
1781 &unicode, &outpos, &p))
1782 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001783 }
1784
Antoine Pitrou653dece2009-05-04 18:32:32 +00001785 /* end of string */
1786
1787 if (inShift && !consumed) { /* in shift sequence, no more to follow */
1788 /* if we're in an inconsistent state, that's an error */
Serhiy Storchakae12f6322015-10-02 13:14:53 +03001789 inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001790 if (surrogate ||
1791 (base64bits >= 6) ||
1792 (base64bits > 0 && base64buffer != 0)) {
1793 outpos = p-PyUnicode_AS_UNICODE(unicode);
1794 endinpos = size;
1795 if (unicode_decode_call_errorhandler(
1796 errors, &errorHandler,
1797 "utf7", "unterminated shift sequence",
1798 starts, size, &startinpos, &endinpos, &exc, &s,
1799 &unicode, &outpos, &p))
1800 goto onError;
1801 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001802 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001803
1804 /* return state */
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001805 if (consumed) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001806 if (inShift) {
1807 p = shiftOutStart; /* back off output */
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001808 *consumed = startinpos;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001809 }
1810 else {
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001811 *consumed = s-starts;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001812 }
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001813 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001814
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001815 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001816 goto onError;
1817
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001818 Py_XDECREF(errorHandler);
1819 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001820 return (PyObject *)unicode;
1821
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001822 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001823 Py_XDECREF(errorHandler);
1824 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001825 Py_DECREF(unicode);
1826 return NULL;
1827}
1828
1829
1830PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001831 Py_ssize_t size,
Antoine Pitrou653dece2009-05-04 18:32:32 +00001832 int base64SetO,
1833 int base64WhiteSpace,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001834 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001835{
1836 PyObject *v;
1837 /* It might be possible to tighten this worst case */
Alexandre Vassalottifd009162009-07-07 02:17:30 +00001838 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001839 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001840 Py_ssize_t i = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001841 unsigned int base64bits = 0;
1842 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001843 char * out;
1844 char * start;
1845
Alexandre Vassalottifd009162009-07-07 02:17:30 +00001846 if (allocated / 8 != size)
Neal Norwitze7d8be82008-07-31 17:17:14 +00001847 return PyErr_NoMemory();
1848
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001849 if (size == 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00001850 return PyString_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001851
Antoine Pitrou653dece2009-05-04 18:32:32 +00001852 v = PyString_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001853 if (v == NULL)
1854 return NULL;
1855
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001856 start = out = PyString_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001857 for (;i < size; ++i) {
1858 Py_UNICODE ch = s[i];
1859
Antoine Pitrou653dece2009-05-04 18:32:32 +00001860 if (inShift) {
1861 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1862 /* shifting out */
1863 if (base64bits) { /* output remaining bits */
1864 *out++ = TO_BASE64(base64buffer << (6-base64bits));
1865 base64buffer = 0;
1866 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001867 }
1868 inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001869 /* Characters not in the BASE64 set implicitly unshift the sequence
1870 so no '-' is required, except if the character is itself a '-' */
1871 if (IS_BASE64(ch) || ch == '-') {
1872 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001873 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001874 *out++ = (char) ch;
1875 }
1876 else {
1877 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00001878 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001879 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001880 else { /* not in a shift sequence */
1881 if (ch == '+') {
1882 *out++ = '+';
1883 *out++ = '-';
1884 }
1885 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1886 *out++ = (char) ch;
1887 }
1888 else {
1889 *out++ = '+';
1890 inShift = 1;
1891 goto encode_char;
1892 }
1893 }
1894 continue;
1895encode_char:
1896#ifdef Py_UNICODE_WIDE
1897 if (ch >= 0x10000) {
1898 /* code first surrogate */
1899 base64bits += 16;
1900 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
1901 while (base64bits >= 6) {
1902 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1903 base64bits -= 6;
1904 }
1905 /* prepare second surrogate */
1906 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
1907 }
1908#endif
1909 base64bits += 16;
1910 base64buffer = (base64buffer << 16) | ch;
1911 while (base64bits >= 6) {
1912 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1913 base64bits -= 6;
1914 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001915 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001916 if (base64bits)
1917 *out++= TO_BASE64(base64buffer << (6-base64bits) );
1918 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001919 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001920
Benjamin Petersonbea424a2010-04-03 00:57:33 +00001921 if (_PyString_Resize(&v, out - start))
1922 return NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001923 return v;
1924}
1925
/* The UTF-7 helper macros are not needed past this point. */
#undef ENCODE_DIRECT
#undef DECODE_DIRECT
#undef TO_BASE64
#undef FROM_BASE64
#undef IS_BASE64
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001931
Guido van Rossumd57fd912000-03-10 22:53:23 +00001932/* --- UTF-8 Codec -------------------------------------------------------- */
1933
/* Total length in bytes of the UTF-8 sequence introduced by each possible
   first byte; 0 marks a byte that cannot start a sequence.  See RFC 3629
   for details. */
static char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F: ASCII */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
1955
Guido van Rossumd57fd912000-03-10 22:53:23 +00001956PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001957 Py_ssize_t size,
1958 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001959{
Walter Dörwald69652032004-09-07 20:24:22 +00001960 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1961}
1962
1963PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001964 Py_ssize_t size,
1965 const char *errors,
1966 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001967{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001968 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001969 int n;
Ezio Melottie57e50c2010-06-05 17:51:07 +00001970 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001971 Py_ssize_t startinpos;
1972 Py_ssize_t endinpos;
1973 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001974 const char *e;
1975 PyUnicodeObject *unicode;
1976 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001977 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001978 PyObject *errorHandler = NULL;
1979 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001980
1981 /* Note: size will always be longer than the resulting Unicode
1982 character count */
1983 unicode = _PyUnicode_New(size);
1984 if (!unicode)
1985 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001986 if (size == 0) {
1987 if (consumed)
1988 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001989 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001990 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001991
1992 /* Unpack UTF-8 encoded data */
1993 p = unicode->str;
1994 e = s + size;
1995
1996 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001997 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001998
1999 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002000 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002001 s++;
2002 continue;
2003 }
2004
2005 n = utf8_code_length[ch];
2006
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002007 if (s + n > e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002008 if (consumed)
2009 break;
2010 else {
2011 errmsg = "unexpected end of data";
2012 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00002013 endinpos = startinpos+1;
2014 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2015 endinpos++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002016 goto utf8Error;
2017 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00002018 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002019
2020 switch (n) {
2021
2022 case 0:
Ezio Melottie57e50c2010-06-05 17:51:07 +00002023 errmsg = "invalid start byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002024 startinpos = s-starts;
2025 endinpos = startinpos+1;
2026 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002027
2028 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002029 errmsg = "internal error";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002030 startinpos = s-starts;
2031 endinpos = startinpos+1;
2032 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002033
2034 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002035 if ((s[1] & 0xc0) != 0x80) {
Ezio Melottie57e50c2010-06-05 17:51:07 +00002036 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002037 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00002038 endinpos = startinpos + 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002039 goto utf8Error;
2040 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002041 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melottie57e50c2010-06-05 17:51:07 +00002042 assert ((ch > 0x007F) && (ch <= 0x07FF));
2043 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002044 break;
2045
2046 case 3:
Ezio Melottie57e50c2010-06-05 17:51:07 +00002047 /* XXX: surrogates shouldn't be valid UTF-8!
2048 see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2049 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
2050 Uncomment the 2 lines below to make them invalid,
Serhiy Storchakae8c9e142015-01-18 11:42:50 +02002051 code points: d800-dfff; UTF-8: \xed\xa0\x80-\xed\xbf\xbf. */
Tim Petersced69f82003-09-16 20:30:58 +00002052 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melottie57e50c2010-06-05 17:51:07 +00002053 (s[2] & 0xc0) != 0x80 ||
2054 ((unsigned char)s[0] == 0xE0 &&
2055 (unsigned char)s[1] < 0xA0)/* ||
2056 ((unsigned char)s[0] == 0xED &&
2057 (unsigned char)s[1] > 0x9F)*/) {
2058 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002059 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00002060 endinpos = startinpos + 1;
2061
2062 /* if s[1] first two bits are 1 and 0, then the invalid
2063 continuation byte is s[2], so increment endinpos by 1,
2064 if not, s[1] is invalid and endinpos doesn't need to
2065 be incremented. */
2066 if ((s[1] & 0xC0) == 0x80)
2067 endinpos++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002068 goto utf8Error;
2069 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002070 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melottie57e50c2010-06-05 17:51:07 +00002071 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2072 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002073 break;
2074
2075 case 4:
2076 if ((s[1] & 0xc0) != 0x80 ||
2077 (s[2] & 0xc0) != 0x80 ||
Ezio Melottie57e50c2010-06-05 17:51:07 +00002078 (s[3] & 0xc0) != 0x80 ||
2079 ((unsigned char)s[0] == 0xF0 &&
2080 (unsigned char)s[1] < 0x90) ||
2081 ((unsigned char)s[0] == 0xF4 &&
2082 (unsigned char)s[1] > 0x8F)) {
2083 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002084 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00002085 endinpos = startinpos + 1;
2086 if ((s[1] & 0xC0) == 0x80) {
2087 endinpos++;
2088 if ((s[2] & 0xC0) == 0x80)
2089 endinpos++;
2090 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002091 goto utf8Error;
2092 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002093 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melottie57e50c2010-06-05 17:51:07 +00002094 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2095 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2096
Fredrik Lundh8f455852001-06-27 18:59:43 +00002097#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002098 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002099#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002100 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002101
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002102 /* translate from 10000..10FFFF to 0..FFFF */
2103 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002104
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002105 /* high surrogate = top 10 bits added to D800 */
2106 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002107
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002108 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002109 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002110#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002111 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002112 }
2113 s += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002114 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002115
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002116 utf8Error:
2117 outpos = p-PyUnicode_AS_UNICODE(unicode);
2118 if (unicode_decode_call_errorhandler(
2119 errors, &errorHandler,
2120 "utf8", errmsg,
2121 starts, size, &startinpos, &endinpos, &exc, &s,
2122 &unicode, &outpos, &p))
2123 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002124 }
Walter Dörwald69652032004-09-07 20:24:22 +00002125 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002126 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002127
2128 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002129 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002130 goto onError;
2131
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002132 Py_XDECREF(errorHandler);
2133 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002134 return (PyObject *)unicode;
2135
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002136 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002137 Py_XDECREF(errorHandler);
2138 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002139 Py_DECREF(unicode);
2140 return NULL;
2141}
2142
Tim Peters602f7402002-04-27 18:03:26 +00002143/* Allocation strategy: if the string is short, convert into a stack buffer
2144 and allocate exactly as much space needed at the end. Else allocate the
2145 maximum possible needed (4 result bytes per Unicode character), and return
2146 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002147*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002148PyObject *
2149PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002150 Py_ssize_t size,
2151 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002152{
Tim Peters602f7402002-04-27 18:03:26 +00002153#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002154
Martin v. Löwis18e16552006-02-15 17:27:45 +00002155 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00002156 PyObject *v; /* result string object */
2157 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002158 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002159 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002160 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002161
Tim Peters602f7402002-04-27 18:03:26 +00002162 assert(s != NULL);
2163 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002164
Tim Peters602f7402002-04-27 18:03:26 +00002165 if (size <= MAX_SHORT_UNICHARS) {
2166 /* Write into the stack buffer; nallocated can't overflow.
2167 * At the end, we'll allocate exactly as much heap space as it
2168 * turns out we need.
2169 */
2170 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2171 v = NULL; /* will allocate after we're done */
2172 p = stackbuf;
2173 }
2174 else {
2175 /* Overallocate on the heap, and give the excess back at the end. */
2176 nallocated = size * 4;
2177 if (nallocated / 4 != size) /* overflow! */
2178 return PyErr_NoMemory();
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002179 v = PyString_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00002180 if (v == NULL)
2181 return NULL;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002182 p = PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002183 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002184
Tim Peters602f7402002-04-27 18:03:26 +00002185 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002186 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002187
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002188 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002189 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002190 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002191
Guido van Rossumd57fd912000-03-10 22:53:23 +00002192 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002193 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002194 *p++ = (char)(0xc0 | (ch >> 6));
2195 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002196 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002197 else {
Tim Peters602f7402002-04-27 18:03:26 +00002198 /* Encode UCS2 Unicode ordinals */
2199 if (ch < 0x10000) {
2200 /* Special case: check for high surrogate */
2201 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2202 Py_UCS4 ch2 = s[i];
2203 /* Check for low surrogate and combine the two to
2204 form a UCS4 value */
2205 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002206 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002207 i++;
2208 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002209 }
Tim Peters602f7402002-04-27 18:03:26 +00002210 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002211 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002212 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002213 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2214 *p++ = (char)(0x80 | (ch & 0x3f));
2215 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00002216 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002217 encodeUCS4:
Tim Peters602f7402002-04-27 18:03:26 +00002218 /* Encode UCS4 Unicode ordinals */
2219 *p++ = (char)(0xf0 | (ch >> 18));
2220 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2221 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2222 *p++ = (char)(0x80 | (ch & 0x3f));
2223 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002224 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002225
Tim Peters602f7402002-04-27 18:03:26 +00002226 if (v == NULL) {
2227 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002228 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002229 assert(nneeded <= nallocated);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002230 v = PyString_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002231 }
2232 else {
Benjamin Peterson857ce152009-01-31 16:29:18 +00002233 /* Cut back to size actually needed. */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002234 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002235 assert(nneeded <= nallocated);
Benjamin Petersonbea424a2010-04-03 00:57:33 +00002236 if (_PyString_Resize(&v, nneeded))
2237 return NULL;
Tim Peters602f7402002-04-27 18:03:26 +00002238 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002239 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002240
Tim Peters602f7402002-04-27 18:03:26 +00002241#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002242}
2243
Guido van Rossumd57fd912000-03-10 22:53:23 +00002244PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2245{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002246 if (!PyUnicode_Check(unicode)) {
2247 PyErr_BadArgument();
2248 return NULL;
2249 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002250 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002251 PyUnicode_GET_SIZE(unicode),
2252 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002253}
2254
Walter Dörwald6e390802007-08-17 16:41:28 +00002255/* --- UTF-32 Codec ------------------------------------------------------- */
2256
2257PyObject *
2258PyUnicode_DecodeUTF32(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002259 Py_ssize_t size,
2260 const char *errors,
2261 int *byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002262{
2263 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2264}
2265
2266PyObject *
2267PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002268 Py_ssize_t size,
2269 const char *errors,
2270 int *byteorder,
2271 Py_ssize_t *consumed)
Walter Dörwald6e390802007-08-17 16:41:28 +00002272{
2273 const char *starts = s;
2274 Py_ssize_t startinpos;
2275 Py_ssize_t endinpos;
2276 Py_ssize_t outpos;
2277 PyUnicodeObject *unicode;
2278 Py_UNICODE *p;
2279#ifndef Py_UNICODE_WIDE
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002280 int pairs = 0;
Benjamin Peterson8e5effa2010-06-12 17:47:06 +00002281 const unsigned char *qq;
Walter Dörwald6e390802007-08-17 16:41:28 +00002282#else
2283 const int pairs = 0;
2284#endif
Benjamin Peterson8e5effa2010-06-12 17:47:06 +00002285 const unsigned char *q, *e;
Walter Dörwald6e390802007-08-17 16:41:28 +00002286 int bo = 0; /* assume native ordering by default */
2287 const char *errmsg = "";
Walter Dörwald20b40d32007-08-17 16:52:50 +00002288 /* Offsets from q for retrieving bytes in the right order. */
2289#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2290 int iorder[] = {0, 1, 2, 3};
2291#else
2292 int iorder[] = {3, 2, 1, 0};
2293#endif
Walter Dörwald9ab80a92007-08-17 16:58:43 +00002294 PyObject *errorHandler = NULL;
2295 PyObject *exc = NULL;
Chris Jerdonekad4b0002012-10-07 20:37:54 -07002296
Walter Dörwald6e390802007-08-17 16:41:28 +00002297 q = (unsigned char *)s;
2298 e = q + size;
2299
2300 if (byteorder)
2301 bo = *byteorder;
2302
2303 /* Check for BOM marks (U+FEFF) in the input and adjust current
2304 byte order setting accordingly. In native mode, the leading BOM
2305 mark is skipped, in all other modes, it is copied to the output
2306 stream as-is (giving a ZWNBSP character). */
2307 if (bo == 0) {
2308 if (size >= 4) {
2309 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002310 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002311#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002312 if (bom == 0x0000FEFF) {
2313 q += 4;
2314 bo = -1;
2315 }
2316 else if (bom == 0xFFFE0000) {
2317 q += 4;
2318 bo = 1;
2319 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002320#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002321 if (bom == 0x0000FEFF) {
2322 q += 4;
2323 bo = 1;
2324 }
2325 else if (bom == 0xFFFE0000) {
2326 q += 4;
2327 bo = -1;
2328 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002329#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002330 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002331 }
2332
2333 if (bo == -1) {
2334 /* force LE */
2335 iorder[0] = 0;
2336 iorder[1] = 1;
2337 iorder[2] = 2;
2338 iorder[3] = 3;
2339 }
2340 else if (bo == 1) {
2341 /* force BE */
2342 iorder[0] = 3;
2343 iorder[1] = 2;
2344 iorder[2] = 1;
2345 iorder[3] = 0;
2346 }
2347
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002348 /* On narrow builds we split characters outside the BMP into two
Serhiy Storchakae8c9e142015-01-18 11:42:50 +02002349 code points => count how much extra space we need. */
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002350#ifndef Py_UNICODE_WIDE
Serhiy Storchakac9631a12013-01-08 22:43:18 +02002351 for (qq = q; e - qq >= 4; qq += 4)
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002352 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2353 pairs++;
2354#endif
2355
2356 /* This might be one to much, because of a BOM */
2357 unicode = _PyUnicode_New((size+3)/4+pairs);
2358 if (!unicode)
2359 return NULL;
2360 if (size == 0)
2361 return (PyObject *)unicode;
2362
2363 /* Unpack UTF-32 encoded data */
2364 p = unicode->str;
2365
Walter Dörwald6e390802007-08-17 16:41:28 +00002366 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002367 Py_UCS4 ch;
2368 /* remaining bytes at the end? (size should be divisible by 4) */
2369 if (e-q<4) {
2370 if (consumed)
2371 break;
2372 errmsg = "truncated data";
2373 startinpos = ((const char *)q)-starts;
2374 endinpos = ((const char *)e)-starts;
2375 goto utf32Error;
2376 /* The remaining input chars are ignored if the callback
2377 chooses to skip the input */
2378 }
2379 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2380 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002381
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002382 if (ch >= 0x110000)
2383 {
Serhiy Storchakae8c9e142015-01-18 11:42:50 +02002384 errmsg = "code point not in range(0x110000)";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002385 startinpos = ((const char *)q)-starts;
2386 endinpos = startinpos+4;
2387 goto utf32Error;
2388 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002389#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002390 if (ch >= 0x10000)
2391 {
2392 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2393 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2394 }
2395 else
Walter Dörwald6e390802007-08-17 16:41:28 +00002396#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002397 *p++ = ch;
2398 q += 4;
2399 continue;
2400 utf32Error:
2401 outpos = p-PyUnicode_AS_UNICODE(unicode);
2402 if (unicode_decode_call_errorhandler(
2403 errors, &errorHandler,
2404 "utf32", errmsg,
Georg Brandle9741f32009-09-17 11:28:09 +00002405 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002406 &unicode, &outpos, &p))
2407 goto onError;
Walter Dörwald6e390802007-08-17 16:41:28 +00002408 }
2409
2410 if (byteorder)
2411 *byteorder = bo;
2412
2413 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002414 *consumed = (const char *)q-starts;
Walter Dörwald6e390802007-08-17 16:41:28 +00002415
2416 /* Adjust length */
2417 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2418 goto onError;
2419
2420 Py_XDECREF(errorHandler);
2421 Py_XDECREF(exc);
2422 return (PyObject *)unicode;
2423
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002424 onError:
Walter Dörwald6e390802007-08-17 16:41:28 +00002425 Py_DECREF(unicode);
2426 Py_XDECREF(errorHandler);
2427 Py_XDECREF(exc);
2428 return NULL;
2429}
2430
2431PyObject *
2432PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002433 Py_ssize_t size,
2434 const char *errors,
2435 int byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002436{
2437 PyObject *v;
2438 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002439 Py_ssize_t nsize, bytesize;
Walter Dörwald6e390802007-08-17 16:41:28 +00002440#ifndef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002441 Py_ssize_t i, pairs;
Walter Dörwald6e390802007-08-17 16:41:28 +00002442#else
2443 const int pairs = 0;
2444#endif
2445 /* Offsets from p for storing byte pairs in the right order. */
2446#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2447 int iorder[] = {0, 1, 2, 3};
2448#else
2449 int iorder[] = {3, 2, 1, 0};
2450#endif
2451
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002452#define STORECHAR(CH) \
2453 do { \
2454 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2455 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2456 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2457 p[iorder[0]] = (CH) & 0xff; \
2458 p += 4; \
Walter Dörwald6e390802007-08-17 16:41:28 +00002459 } while(0)
2460
Serhiy Storchakae8c9e142015-01-18 11:42:50 +02002461 /* In narrow builds we can output surrogate pairs as one code point,
Walter Dörwald6e390802007-08-17 16:41:28 +00002462 so we need less space. */
2463#ifndef Py_UNICODE_WIDE
2464 for (i = pairs = 0; i < size-1; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002465 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2466 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2467 pairs++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002468#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002469 nsize = (size - pairs + (byteorder == 0));
2470 bytesize = nsize * 4;
2471 if (bytesize / 4 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002472 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002473 v = PyString_FromStringAndSize(NULL, bytesize);
Walter Dörwald6e390802007-08-17 16:41:28 +00002474 if (v == NULL)
2475 return NULL;
2476
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002477 p = (unsigned char *)PyString_AS_STRING(v);
Walter Dörwald6e390802007-08-17 16:41:28 +00002478 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002479 STORECHAR(0xFEFF);
Walter Dörwald6e390802007-08-17 16:41:28 +00002480 if (size == 0)
2481 return v;
2482
2483 if (byteorder == -1) {
2484 /* force LE */
2485 iorder[0] = 0;
2486 iorder[1] = 1;
2487 iorder[2] = 2;
2488 iorder[3] = 3;
2489 }
2490 else if (byteorder == 1) {
2491 /* force BE */
2492 iorder[0] = 3;
2493 iorder[1] = 2;
2494 iorder[2] = 1;
2495 iorder[3] = 0;
2496 }
2497
2498 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002499 Py_UCS4 ch = *s++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002500#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002501 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2502 Py_UCS4 ch2 = *s;
2503 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2504 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2505 s++;
2506 size--;
2507 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00002508 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002509#endif
2510 STORECHAR(ch);
2511 }
2512 return v;
2513#undef STORECHAR
2514}
2515
2516PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2517{
2518 if (!PyUnicode_Check(unicode)) {
2519 PyErr_BadArgument();
2520 return NULL;
2521 }
2522 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002523 PyUnicode_GET_SIZE(unicode),
2524 NULL,
2525 0);
Walter Dörwald6e390802007-08-17 16:41:28 +00002526}
2527
Guido van Rossumd57fd912000-03-10 22:53:23 +00002528/* --- UTF-16 Codec ------------------------------------------------------- */
2529
Tim Peters772747b2001-08-09 22:21:55 +00002530PyObject *
2531PyUnicode_DecodeUTF16(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002532 Py_ssize_t size,
2533 const char *errors,
2534 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002535{
Walter Dörwald69652032004-09-07 20:24:22 +00002536 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2537}
2538
2539PyObject *
2540PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002541 Py_ssize_t size,
2542 const char *errors,
2543 int *byteorder,
2544 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002545{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002546 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002547 Py_ssize_t startinpos;
2548 Py_ssize_t endinpos;
2549 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002550 PyUnicodeObject *unicode;
2551 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002552 const unsigned char *q, *e;
2553 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002554 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002555 /* Offsets from q for retrieving byte pairs in the right order. */
2556#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2557 int ihi = 1, ilo = 0;
2558#else
2559 int ihi = 0, ilo = 1;
2560#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002561 PyObject *errorHandler = NULL;
2562 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002563
2564 /* Note: size will always be longer than the resulting Unicode
2565 character count */
2566 unicode = _PyUnicode_New(size);
2567 if (!unicode)
2568 return NULL;
2569 if (size == 0)
2570 return (PyObject *)unicode;
2571
2572 /* Unpack UTF-16 encoded data */
2573 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002574 q = (unsigned char *)s;
2575 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002576
2577 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002578 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002579
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002580 /* Check for BOM marks (U+FEFF) in the input and adjust current
2581 byte order setting accordingly. In native mode, the leading BOM
2582 mark is skipped, in all other modes, it is copied to the output
2583 stream as-is (giving a ZWNBSP character). */
2584 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002585 if (size >= 2) {
2586 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002587#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002588 if (bom == 0xFEFF) {
2589 q += 2;
2590 bo = -1;
2591 }
2592 else if (bom == 0xFFFE) {
2593 q += 2;
2594 bo = 1;
2595 }
Tim Petersced69f82003-09-16 20:30:58 +00002596#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002597 if (bom == 0xFEFF) {
2598 q += 2;
2599 bo = 1;
2600 }
2601 else if (bom == 0xFFFE) {
2602 q += 2;
2603 bo = -1;
2604 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002605#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002606 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002607 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002608
Tim Peters772747b2001-08-09 22:21:55 +00002609 if (bo == -1) {
2610 /* force LE */
2611 ihi = 1;
2612 ilo = 0;
2613 }
2614 else if (bo == 1) {
2615 /* force BE */
2616 ihi = 0;
2617 ilo = 1;
2618 }
2619
2620 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002621 Py_UNICODE ch;
2622 /* remaining bytes at the end? (size should be even) */
2623 if (e-q<2) {
2624 if (consumed)
2625 break;
2626 errmsg = "truncated data";
2627 startinpos = ((const char *)q)-starts;
2628 endinpos = ((const char *)e)-starts;
2629 goto utf16Error;
2630 /* The remaining input chars are ignored if the callback
2631 chooses to skip the input */
2632 }
2633 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002634
Benjamin Peterson857ce152009-01-31 16:29:18 +00002635 q += 2;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002636
2637 if (ch < 0xD800 || ch > 0xDFFF) {
2638 *p++ = ch;
2639 continue;
2640 }
2641
2642 /* UTF-16 code pair: */
Antoine Pitrou715a63b2012-07-21 00:52:06 +02002643 if (e - q < 2) {
Serhiy Storchakac4b82c02013-01-08 23:12:00 +02002644 q -= 2;
2645 if (consumed)
2646 break;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002647 errmsg = "unexpected end of data";
Serhiy Storchakac4b82c02013-01-08 23:12:00 +02002648 startinpos = ((const char *)q)-starts;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002649 endinpos = ((const char *)e)-starts;
2650 goto utf16Error;
2651 }
2652 if (0xD800 <= ch && ch <= 0xDBFF) {
2653 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2654 q += 2;
2655 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002656#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002657 *p++ = ch;
2658 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002659#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002660 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002661#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002662 continue;
2663 }
2664 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002665 errmsg = "illegal UTF-16 surrogate";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002666 startinpos = (((const char *)q)-4)-starts;
2667 endinpos = startinpos+2;
2668 goto utf16Error;
2669 }
2670
Benjamin Peterson857ce152009-01-31 16:29:18 +00002671 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002672 errmsg = "illegal encoding";
2673 startinpos = (((const char *)q)-2)-starts;
2674 endinpos = startinpos+2;
2675 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002676
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002677 utf16Error:
2678 outpos = p-PyUnicode_AS_UNICODE(unicode);
2679 if (unicode_decode_call_errorhandler(
2680 errors, &errorHandler,
2681 "utf16", errmsg,
2682 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2683 &unicode, &outpos, &p))
2684 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002685 }
2686
2687 if (byteorder)
2688 *byteorder = bo;
2689
Walter Dörwald69652032004-09-07 20:24:22 +00002690 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002691 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00002692
Guido van Rossumd57fd912000-03-10 22:53:23 +00002693 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002694 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002695 goto onError;
2696
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002697 Py_XDECREF(errorHandler);
2698 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002699 return (PyObject *)unicode;
2700
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002701 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002702 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002703 Py_XDECREF(errorHandler);
2704 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002705 return NULL;
2706}
2707
Tim Peters772747b2001-08-09 22:21:55 +00002708PyObject *
2709PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002710 Py_ssize_t size,
2711 const char *errors,
2712 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002713{
2714 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002715 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002716 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002717#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002718 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002719#else
2720 const int pairs = 0;
2721#endif
Tim Peters772747b2001-08-09 22:21:55 +00002722 /* Offsets from p for storing byte pairs in the right order. */
2723#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2724 int ihi = 1, ilo = 0;
2725#else
2726 int ihi = 0, ilo = 1;
2727#endif
2728
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002729#define STORECHAR(CH) \
2730 do { \
2731 p[ihi] = ((CH) >> 8) & 0xff; \
2732 p[ilo] = (CH) & 0xff; \
2733 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00002734 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002735
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002736#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002737 for (i = pairs = 0; i < size; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002738 if (s[i] >= 0x10000)
2739 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002740#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002741 /* 2 * (size + pairs + (byteorder == 0)) */
2742 if (size > PY_SSIZE_T_MAX ||
2743 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002744 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002745 nsize = size + pairs + (byteorder == 0);
2746 bytesize = nsize * 2;
2747 if (bytesize / 2 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002748 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002749 v = PyString_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002750 if (v == NULL)
2751 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002752
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002753 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002754 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002755 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002756 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002757 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002758
2759 if (byteorder == -1) {
2760 /* force LE */
2761 ihi = 1;
2762 ilo = 0;
2763 }
2764 else if (byteorder == 1) {
2765 /* force BE */
2766 ihi = 0;
2767 ilo = 1;
2768 }
2769
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002770 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002771 Py_UNICODE ch = *s++;
2772 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002773#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002774 if (ch >= 0x10000) {
2775 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2776 ch = 0xD800 | ((ch-0x10000) >> 10);
2777 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002778#endif
Tim Peters772747b2001-08-09 22:21:55 +00002779 STORECHAR(ch);
2780 if (ch2)
2781 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002782 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002783 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002784#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002785}
2786
2787PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2788{
2789 if (!PyUnicode_Check(unicode)) {
2790 PyErr_BadArgument();
2791 return NULL;
2792 }
2793 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002794 PyUnicode_GET_SIZE(unicode),
2795 NULL,
2796 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002797}
2798
2799/* --- Unicode Escape Codec ----------------------------------------------- */
2800
/* File-local pointer to the character-name lookup C API used by the
   \N{name} escape in PyUnicode_DecodeUnicodeEscape.  It starts out NULL;
   that function tests it for NULL and loads the unicode data module when
   it is.  NOTE(review): presumably the loaded pointer is kept here for
   later calls -- confirm against the loading code. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00002801static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002802
Guido van Rossumd57fd912000-03-10 22:53:23 +00002803PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002804 Py_ssize_t size,
2805 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002806{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002807 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002808 Py_ssize_t startinpos;
2809 Py_ssize_t endinpos;
2810 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002811 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002812 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002813 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002814 char* message;
2815 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002816 PyObject *errorHandler = NULL;
2817 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002818
Guido van Rossumd57fd912000-03-10 22:53:23 +00002819 /* Escaped strings will always be longer than the resulting
2820 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002821 length after conversion to the true value.
2822 (but if the error callback returns a long replacement string
2823 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002824 v = _PyUnicode_New(size);
2825 if (v == NULL)
2826 goto onError;
2827 if (size == 0)
2828 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002829
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002830 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002831 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002832
Guido van Rossumd57fd912000-03-10 22:53:23 +00002833 while (s < end) {
2834 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002835 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002836 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002837
2838 /* Non-escape characters are interpreted as Unicode ordinals */
2839 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002840 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002841 continue;
2842 }
2843
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002844 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002845 /* \ - Escapes */
2846 s++;
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002847 c = *s++;
2848 if (s > end)
2849 c = '\0'; /* Invalid after \ */
2850 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002851
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002852 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002853 case '\n': break;
2854 case '\\': *p++ = '\\'; break;
2855 case '\'': *p++ = '\''; break;
2856 case '\"': *p++ = '\"'; break;
2857 case 'b': *p++ = '\b'; break;
2858 case 'f': *p++ = '\014'; break; /* FF */
2859 case 't': *p++ = '\t'; break;
2860 case 'n': *p++ = '\n'; break;
2861 case 'r': *p++ = '\r'; break;
2862 case 'v': *p++ = '\013'; break; /* VT */
2863 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2864
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002865 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002866 case '0': case '1': case '2': case '3':
2867 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002868 x = s[-1] - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002869 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002870 x = (x<<3) + *s++ - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002871 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002872 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002873 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002874 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002875 break;
2876
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002877 /* hex escapes */
2878 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002879 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002880 digits = 2;
2881 message = "truncated \\xXX escape";
2882 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002883
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002884 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002885 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002886 digits = 4;
2887 message = "truncated \\uXXXX escape";
2888 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002889
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002890 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002891 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002892 digits = 8;
2893 message = "truncated \\UXXXXXXXX escape";
2894 hexescape:
2895 chr = 0;
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002896 if (end - s < digits) {
2897 /* count only hex digits */
2898 for (; s < end; ++s) {
2899 c = (unsigned char)*s;
2900 if (!Py_ISXDIGIT(c))
2901 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002902 }
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002903 goto error;
2904 }
2905 for (; digits--; ++s) {
2906 c = (unsigned char)*s;
2907 if (!Py_ISXDIGIT(c))
2908 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002909 chr = (chr<<4) & ~0xF;
2910 if (c >= '0' && c <= '9')
2911 chr += c - '0';
2912 else if (c >= 'a' && c <= 'f')
2913 chr += 10 + c - 'a';
2914 else
2915 chr += 10 + c - 'A';
2916 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002917 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002918 /* _decoding_error will have already written into the
2919 target buffer. */
2920 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002921 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002922 /* when we get here, chr is a 32-bit unicode character */
2923 if (chr <= 0xffff)
2924 /* UCS-2 character */
2925 *p++ = (Py_UNICODE) chr;
2926 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002927 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002928 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002929#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002930 *p++ = chr;
2931#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002932 chr -= 0x10000L;
2933 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002934 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002935#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002936 } else {
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002937 message = "illegal Unicode character";
2938 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002939 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002940 break;
2941
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002942 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00002943 case 'N':
2944 message = "malformed \\N character escape";
2945 if (ucnhash_CAPI == NULL) {
2946 /* load the unicode data module */
Larry Hastings402b73f2010-03-25 00:54:54 +00002947 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002948 if (ucnhash_CAPI == NULL)
2949 goto ucnhashError;
2950 }
2951 if (*s == '{') {
2952 const char *start = s+1;
2953 /* look for the closing brace */
2954 while (*s != '}' && s < end)
2955 s++;
2956 if (s > start && s < end && *s == '}') {
2957 /* found a name. look it up in the unicode database */
2958 message = "unknown Unicode character name";
2959 s++;
Serhiy Storchaka1d3acd42013-01-21 11:48:24 +02002960 if (s - start - 1 <= INT_MAX &&
2961 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002962 goto store;
2963 }
2964 }
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002965 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002966
2967 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002968 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002969 message = "\\ at end of string";
2970 s--;
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002971 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00002972 }
2973 else {
2974 *p++ = '\\';
2975 *p++ = (unsigned char)s[-1];
2976 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002977 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002978 }
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002979 continue;
2980
2981 error:
2982 endinpos = s-starts;
2983 outpos = p-PyUnicode_AS_UNICODE(v);
2984 if (unicode_decode_call_errorhandler(
2985 errors, &errorHandler,
2986 "unicodeescape", message,
2987 starts, size, &startinpos, &endinpos, &exc, &s,
2988 &v, &outpos, &p))
2989 goto onError;
2990 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002991 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002992 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002993 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002994 Py_XDECREF(errorHandler);
2995 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002996 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002997
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002998 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002999 PyErr_SetString(
3000 PyExc_UnicodeError,
3001 "\\N escapes not supported (can't load unicodedata module)"
3002 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003003 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003004 Py_XDECREF(errorHandler);
3005 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003006 return NULL;
3007
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003008 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003009 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003010 Py_XDECREF(errorHandler);
3011 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003012 return NULL;
3013}
3014
3015/* Return a Unicode-Escape string version of the Unicode object.
3016
3017 If quotes is true, the string is enclosed in u"" or u'' quotes as
3018 appropriate.
3019
3020*/
3021
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00003022Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003023 Py_ssize_t size,
3024 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00003025{
3026 /* like wcschr, but doesn't stop at NULL characters */
3027
3028 while (size-- > 0) {
3029 if (*s == ch)
3030 return s;
3031 s++;
3032 }
3033
3034 return NULL;
3035}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003036
Guido van Rossumd57fd912000-03-10 22:53:23 +00003037static
3038PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003039 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003040 int quotes)
3041{
3042 PyObject *repr;
3043 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003044
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003045 static const char *hexdigit = "0123456789abcdef";
Neal Norwitze7d8be82008-07-31 17:17:14 +00003046#ifdef Py_UNICODE_WIDE
3047 const Py_ssize_t expandsize = 10;
3048#else
3049 const Py_ssize_t expandsize = 6;
3050#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003051
Neal Norwitz17753ec2006-08-21 22:21:19 +00003052 /* XXX(nnorwitz): rather than over-allocating, it would be
3053 better to choose a different scheme. Perhaps scan the
3054 first N-chars of the string and allocate based on that size.
3055 */
3056 /* Initial allocation is based on the longest-possible unichr
3057 escape.
3058
3059 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3060 unichr, so in this case it's the longest unichr escape. In
3061 narrow (UTF-16) builds this is five chars per source unichr
3062 since there are two unichrs in the surrogate pair, so in narrow
3063 (UTF-16) builds it's not the longest unichr escape.
3064
3065 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3066 so in the narrow (UTF-16) build case it's the longest unichr
3067 escape.
3068 */
3069
Neal Norwitze7d8be82008-07-31 17:17:14 +00003070 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003071 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00003072
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003073 repr = PyString_FromStringAndSize(NULL,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003074 2
3075 + expandsize*size
3076 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003077 if (repr == NULL)
3078 return NULL;
3079
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003080 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003081
3082 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003083 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00003084 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00003085 !findchar(s, size, '"')) ? '"' : '\'';
3086 }
3087 while (size-- > 0) {
3088 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003089
Hye-Shik Chang835b2432005-12-17 04:38:31 +00003090 /* Escape quotes and backslashes */
3091 if ((quotes &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003092 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003093 *p++ = '\\';
3094 *p++ = (char) ch;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003095 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003096 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003097
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003098#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003099 /* Map 21-bit characters to '\U00xxxxxx' */
3100 else if (ch >= 0x10000) {
3101 *p++ = '\\';
3102 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003103 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
3104 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
3105 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
3106 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
3107 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
3108 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
3109 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003110 *p++ = hexdigit[ch & 0x0000000F];
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003111 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003112 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003113#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003114 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3115 else if (ch >= 0xD800 && ch < 0xDC00) {
3116 Py_UNICODE ch2;
3117 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003118
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003119 ch2 = *s++;
3120 size--;
Georg Brandle27d0442010-08-01 20:54:30 +00003121 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003122 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3123 *p++ = '\\';
3124 *p++ = 'U';
3125 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
3126 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
3127 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
3128 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
3129 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
3130 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
3131 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
3132 *p++ = hexdigit[ucs & 0x0000000F];
3133 continue;
3134 }
3135 /* Fall through: isolated surrogates are copied as-is */
3136 s--;
3137 size++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003138 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003139#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003140
Guido van Rossumd57fd912000-03-10 22:53:23 +00003141 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003142 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003143 *p++ = '\\';
3144 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003145 *p++ = hexdigit[(ch >> 12) & 0x000F];
3146 *p++ = hexdigit[(ch >> 8) & 0x000F];
3147 *p++ = hexdigit[(ch >> 4) & 0x000F];
3148 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003149 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003150
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003151 /* Map special whitespace to '\t', \n', '\r' */
3152 else if (ch == '\t') {
3153 *p++ = '\\';
3154 *p++ = 't';
3155 }
3156 else if (ch == '\n') {
3157 *p++ = '\\';
3158 *p++ = 'n';
3159 }
3160 else if (ch == '\r') {
3161 *p++ = '\\';
3162 *p++ = 'r';
3163 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003164
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003165 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003166 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003167 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003168 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003169 *p++ = hexdigit[(ch >> 4) & 0x000F];
3170 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003171 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003172
Guido van Rossumd57fd912000-03-10 22:53:23 +00003173 /* Copy everything else as-is */
3174 else
3175 *p++ = (char) ch;
3176 }
3177 if (quotes)
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003178 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003179
3180 *p = '\0';
Benjamin Petersonbea424a2010-04-03 00:57:33 +00003181 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
3182 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003183 return repr;
3184}
3185
3186PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003187 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003188{
3189 return unicodeescape_string(s, size, 0);
3190}
3191
3192PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3193{
3194 if (!PyUnicode_Check(unicode)) {
3195 PyErr_BadArgument();
3196 return NULL;
3197 }
3198 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003199 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003200}
3201
3202/* --- Raw Unicode Escape Codec ------------------------------------------- */
3203
3204PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003205 Py_ssize_t size,
3206 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003207{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003208 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003209 Py_ssize_t startinpos;
3210 Py_ssize_t endinpos;
3211 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003212 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003213 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003214 const char *end;
3215 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003216 PyObject *errorHandler = NULL;
3217 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003218
Guido van Rossumd57fd912000-03-10 22:53:23 +00003219 /* Escaped strings will always be longer than the resulting
3220 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003221 length after conversion to the true value. (But decoding error
3222 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003223 v = _PyUnicode_New(size);
3224 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003225 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003226 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003227 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003228 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003229 end = s + size;
3230 while (s < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003231 unsigned char c;
3232 Py_UCS4 x;
3233 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003234 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003235
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003236 /* Non-escape characters are interpreted as Unicode ordinals */
3237 if (*s != '\\') {
3238 *p++ = (unsigned char)*s++;
3239 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003240 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003241 startinpos = s-starts;
3242
3243 /* \u-escapes are only interpreted iff the number of leading
3244 backslashes if odd */
3245 bs = s;
3246 for (;s < end;) {
3247 if (*s != '\\')
3248 break;
3249 *p++ = (unsigned char)*s++;
3250 }
3251 if (((s - bs) & 1) == 0 ||
3252 s >= end ||
3253 (*s != 'u' && *s != 'U')) {
3254 continue;
3255 }
3256 p--;
3257 count = *s=='u' ? 4 : 8;
3258 s++;
3259
3260 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3261 outpos = p-PyUnicode_AS_UNICODE(v);
3262 for (x = 0, i = 0; i < count; ++i, ++s) {
3263 c = (unsigned char)*s;
3264 if (!isxdigit(c)) {
3265 endinpos = s-starts;
3266 if (unicode_decode_call_errorhandler(
3267 errors, &errorHandler,
3268 "rawunicodeescape", "truncated \\uXXXX",
3269 starts, size, &startinpos, &endinpos, &exc, &s,
3270 &v, &outpos, &p))
3271 goto onError;
3272 goto nextByte;
3273 }
3274 x = (x<<4) & ~0xF;
3275 if (c >= '0' && c <= '9')
3276 x += c - '0';
3277 else if (c >= 'a' && c <= 'f')
3278 x += 10 + c - 'a';
3279 else
3280 x += 10 + c - 'A';
3281 }
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003282 if (x <= 0xffff)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003283 /* UCS-2 character */
3284 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003285 else if (x <= 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003286 /* UCS-4 character. Either store directly, or as
3287 surrogate pair. */
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003288#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003289 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003290#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003291 x -= 0x10000L;
3292 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3293 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003294#endif
3295 } else {
3296 endinpos = s-starts;
3297 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003298 if (unicode_decode_call_errorhandler(
3299 errors, &errorHandler,
3300 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003301 starts, size, &startinpos, &endinpos, &exc, &s,
3302 &v, &outpos, &p))
3303 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003304 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003305 nextByte:
3306 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003307 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00003308 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003309 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003310 Py_XDECREF(errorHandler);
3311 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003312 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003313
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003314 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003315 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003316 Py_XDECREF(errorHandler);
3317 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003318 return NULL;
3319}
3320
3321PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003322 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003323{
3324 PyObject *repr;
3325 char *p;
3326 char *q;
3327
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003328 static const char *hexdigit = "0123456789abcdef";
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003329#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00003330 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003331#else
Neal Norwitze7d8be82008-07-31 17:17:14 +00003332 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003333#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00003334
Neal Norwitze7d8be82008-07-31 17:17:14 +00003335 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003336 return PyErr_NoMemory();
Benjamin Peterson857ce152009-01-31 16:29:18 +00003337
Neal Norwitze7d8be82008-07-31 17:17:14 +00003338 repr = PyString_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003339 if (repr == NULL)
3340 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003341 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003342 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003343
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003344 p = q = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003345 while (size-- > 0) {
3346 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003347#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003348 /* Map 32-bit characters to '\Uxxxxxxxx' */
3349 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003350 *p++ = '\\';
3351 *p++ = 'U';
3352 *p++ = hexdigit[(ch >> 28) & 0xf];
3353 *p++ = hexdigit[(ch >> 24) & 0xf];
3354 *p++ = hexdigit[(ch >> 20) & 0xf];
3355 *p++ = hexdigit[(ch >> 16) & 0xf];
3356 *p++ = hexdigit[(ch >> 12) & 0xf];
3357 *p++ = hexdigit[(ch >> 8) & 0xf];
3358 *p++ = hexdigit[(ch >> 4) & 0xf];
3359 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003360 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003361 else
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003362#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003363 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3364 if (ch >= 0xD800 && ch < 0xDC00) {
3365 Py_UNICODE ch2;
3366 Py_UCS4 ucs;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003367
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003368 ch2 = *s++;
3369 size--;
Georg Brandle27d0442010-08-01 20:54:30 +00003370 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003371 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3372 *p++ = '\\';
3373 *p++ = 'U';
3374 *p++ = hexdigit[(ucs >> 28) & 0xf];
3375 *p++ = hexdigit[(ucs >> 24) & 0xf];
3376 *p++ = hexdigit[(ucs >> 20) & 0xf];
3377 *p++ = hexdigit[(ucs >> 16) & 0xf];
3378 *p++ = hexdigit[(ucs >> 12) & 0xf];
3379 *p++ = hexdigit[(ucs >> 8) & 0xf];
3380 *p++ = hexdigit[(ucs >> 4) & 0xf];
3381 *p++ = hexdigit[ucs & 0xf];
3382 continue;
3383 }
3384 /* Fall through: isolated surrogates are copied as-is */
3385 s--;
3386 size++;
3387 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003388#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003389 /* Map 16-bit characters to '\uxxxx' */
3390 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003391 *p++ = '\\';
3392 *p++ = 'u';
3393 *p++ = hexdigit[(ch >> 12) & 0xf];
3394 *p++ = hexdigit[(ch >> 8) & 0xf];
3395 *p++ = hexdigit[(ch >> 4) & 0xf];
3396 *p++ = hexdigit[ch & 15];
3397 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003398 /* Copy everything else as-is */
3399 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003400 *p++ = (char) ch;
3401 }
3402 *p = '\0';
Benjamin Petersonbea424a2010-04-03 00:57:33 +00003403 if (_PyString_Resize(&repr, p - q))
3404 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003405 return repr;
3406}
3407
3408PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3409{
3410 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003411 PyErr_BadArgument();
3412 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003413 }
3414 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003415 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003416}
3417
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003418/* --- Unicode Internal Codec ------------------------------------------- */
3419
3420PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003421 Py_ssize_t size,
3422 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003423{
3424 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003425 Py_ssize_t startinpos;
3426 Py_ssize_t endinpos;
3427 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003428 PyUnicodeObject *v;
3429 Py_UNICODE *p;
3430 const char *end;
3431 const char *reason;
3432 PyObject *errorHandler = NULL;
3433 PyObject *exc = NULL;
3434
Neal Norwitzd43069c2006-01-08 01:12:10 +00003435#ifdef Py_UNICODE_WIDE
3436 Py_UNICODE unimax = PyUnicode_GetMax();
3437#endif
3438
Armin Rigo7ccbca92006-10-04 12:17:45 +00003439 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003440 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3441 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003442 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003443 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003444 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003445 p = PyUnicode_AS_UNICODE(v);
3446 end = s + size;
3447
3448 while (s < end) {
Serhiy Storchakad5327d92013-02-07 16:23:11 +02003449 if (end-s < Py_UNICODE_SIZE) {
3450 endinpos = end-starts;
3451 reason = "truncated input";
3452 goto error;
3453 }
Neal Norwitz1004a532006-05-15 07:17:23 +00003454 memcpy(p, s, sizeof(Py_UNICODE));
Serhiy Storchakad5327d92013-02-07 16:23:11 +02003455#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003456 /* We have to sanity check the raw data, otherwise doom looms for
3457 some malformed UCS-4 data. */
Serhiy Storchakad5327d92013-02-07 16:23:11 +02003458 if (*p > unimax || *p < 0) {
3459 endinpos = s - starts + Py_UNICODE_SIZE;
3460 reason = "illegal code point (> 0x10FFFF)";
3461 goto error;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003462 }
Serhiy Storchakad5327d92013-02-07 16:23:11 +02003463#endif
3464 p++;
3465 s += Py_UNICODE_SIZE;
3466 continue;
3467
3468 error:
3469 startinpos = s - starts;
3470 outpos = p - PyUnicode_AS_UNICODE(v);
3471 if (unicode_decode_call_errorhandler(
3472 errors, &errorHandler,
3473 "unicode_internal", reason,
3474 starts, size, &startinpos, &endinpos, &exc, &s,
3475 &v, &outpos, &p)) {
3476 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003477 }
3478 }
3479
Martin v. Löwis412fb672006-04-13 06:34:32 +00003480 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003481 goto onError;
3482 Py_XDECREF(errorHandler);
3483 Py_XDECREF(exc);
3484 return (PyObject *)v;
3485
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003486 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003487 Py_XDECREF(v);
3488 Py_XDECREF(errorHandler);
3489 Py_XDECREF(exc);
3490 return NULL;
3491}
3492
Guido van Rossumd57fd912000-03-10 22:53:23 +00003493/* --- Latin-1 Codec ------------------------------------------------------ */
3494
3495PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003496 Py_ssize_t size,
3497 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003498{
3499 PyUnicodeObject *v;
3500 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003501
Guido van Rossumd57fd912000-03-10 22:53:23 +00003502 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003503 if (size == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003504 Py_UNICODE r = *(unsigned char*)s;
3505 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003506 }
3507
Guido van Rossumd57fd912000-03-10 22:53:23 +00003508 v = _PyUnicode_New(size);
3509 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003510 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003511 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003512 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003513 p = PyUnicode_AS_UNICODE(v);
3514 while (size-- > 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003515 *p++ = (unsigned char)*s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003516 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003517
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003518 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003519 Py_XDECREF(v);
3520 return NULL;
3521}
3522
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003523/* create or adjust a UnicodeEncodeError */
3524static void make_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003525 const char *encoding,
3526 const Py_UNICODE *unicode, Py_ssize_t size,
3527 Py_ssize_t startpos, Py_ssize_t endpos,
3528 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003529{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003530 if (*exceptionObject == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003531 *exceptionObject = PyUnicodeEncodeError_Create(
3532 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003533 }
3534 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003535 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3536 goto onError;
3537 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3538 goto onError;
3539 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3540 goto onError;
3541 return;
3542 onError:
Serhiy Storchaka98a97222014-02-09 13:14:04 +02003543 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003544 }
3545}
3546
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003547/* raises a UnicodeEncodeError */
3548static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003549 const char *encoding,
3550 const Py_UNICODE *unicode, Py_ssize_t size,
3551 Py_ssize_t startpos, Py_ssize_t endpos,
3552 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003553{
3554 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003555 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003556 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003557 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003558}
3559
3560/* error handling callback helper:
3561 build arguments, call the callback and check the arguments,
3562 put the result into newpos and return the replacement string, which
3563 has to be freed by the caller */
3564static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003565 PyObject **errorHandler,
3566 const char *encoding, const char *reason,
3567 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3568 Py_ssize_t startpos, Py_ssize_t endpos,
3569 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003570{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003571 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003572
3573 PyObject *restuple;
3574 PyObject *resunicode;
3575
3576 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003577 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003578 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003579 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003580 }
3581
3582 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003583 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003584 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003585 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003586
3587 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003588 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003589 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003590 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003591 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00003592 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003593 Py_DECREF(restuple);
3594 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003595 }
3596 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003597 &resunicode, newpos)) {
3598 Py_DECREF(restuple);
3599 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003600 }
3601 if (*newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003602 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003603 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003604 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3605 Py_DECREF(restuple);
3606 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003607 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003608 Py_INCREF(resunicode);
3609 Py_DECREF(restuple);
3610 return resunicode;
3611}
3612
3613static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003614 Py_ssize_t size,
3615 const char *errors,
3616 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003617{
3618 /* output object */
3619 PyObject *res;
3620 /* pointers to the beginning and end+1 of input */
3621 const Py_UNICODE *startp = p;
3622 const Py_UNICODE *endp = p + size;
3623 /* pointer to the beginning of the unencodable characters */
3624 /* const Py_UNICODE *badp = NULL; */
3625 /* pointer into the output */
3626 char *str;
3627 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003628 Py_ssize_t respos = 0;
3629 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00003630 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3631 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003632 PyObject *errorHandler = NULL;
3633 PyObject *exc = NULL;
3634 /* the following variable is used for caching string comparisons
3635 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3636 int known_errorHandler = -1;
3637
3638 /* allocate enough for a simple encoding without
3639 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003640 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003641 if (res == NULL)
3642 goto onError;
3643 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003644 return res;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003645 str = PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003646 ressize = size;
3647
3648 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003649 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003650
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003651 /* can we encode this? */
3652 if (c<limit) {
3653 /* no overflow check, because we know that the space is enough */
3654 *str++ = (char)c;
3655 ++p;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003656 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003657 else {
3658 Py_ssize_t unicodepos = p-startp;
3659 Py_ssize_t requiredsize;
3660 PyObject *repunicode;
3661 Py_ssize_t repsize;
3662 Py_ssize_t newpos;
3663 Py_ssize_t respos;
3664 Py_UNICODE *uni2;
3665 /* startpos for collecting unencodable chars */
3666 const Py_UNICODE *collstart = p;
3667 const Py_UNICODE *collend = p;
3668 /* find all unecodable characters */
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003669 while ((collend < endp) && ((*collend) >= limit))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003670 ++collend;
3671 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3672 if (known_errorHandler==-1) {
3673 if ((errors==NULL) || (!strcmp(errors, "strict")))
3674 known_errorHandler = 1;
3675 else if (!strcmp(errors, "replace"))
3676 known_errorHandler = 2;
3677 else if (!strcmp(errors, "ignore"))
3678 known_errorHandler = 3;
3679 else if (!strcmp(errors, "xmlcharrefreplace"))
3680 known_errorHandler = 4;
3681 else
3682 known_errorHandler = 0;
3683 }
3684 switch (known_errorHandler) {
3685 case 1: /* strict */
3686 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3687 goto onError;
3688 case 2: /* replace */
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003689 while (collstart++ < collend)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003690 *str++ = '?'; /* fall through */
3691 case 3: /* ignore */
3692 p = collend;
3693 break;
3694 case 4: /* xmlcharrefreplace */
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003695 respos = str - PyString_AS_STRING(res);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003696 /* determine replacement size (temporarily (mis)uses p) */
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003697 requiredsize = respos;
3698 for (p = collstart; p < collend;) {
Serhiy Storchakae822b032013-08-06 16:56:26 +03003699 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003700 Py_ssize_t incr;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003701 if (ch < 10)
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003702 incr = 2+1+1;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003703 else if (ch < 100)
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003704 incr = 2+2+1;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003705 else if (ch < 1000)
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003706 incr = 2+3+1;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003707 else if (ch < 10000)
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003708 incr = 2+4+1;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003709 else if (ch < 100000)
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003710 incr = 2+5+1;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003711 else if (ch < 1000000)
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003712 incr = 2+6+1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003713 else
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003714 incr = 2+7+1;
3715 if (requiredsize > PY_SSIZE_T_MAX - incr)
3716 goto overflow;
3717 requiredsize += incr;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003718 }
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003719 if (requiredsize > PY_SSIZE_T_MAX - (endp - collend))
3720 goto overflow;
3721 requiredsize += endp - collend;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003722 if (requiredsize > ressize) {
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003723 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003724 requiredsize = 2*ressize;
3725 if (_PyString_Resize(&res, requiredsize))
3726 goto onError;
3727 str = PyString_AS_STRING(res) + respos;
3728 ressize = requiredsize;
3729 }
3730 /* generate replacement (temporarily (mis)uses p) */
Serhiy Storchakae822b032013-08-06 16:56:26 +03003731 for (p = collstart; p < collend;) {
3732 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
3733 str += sprintf(str, "&#%d;", (int)ch);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003734 }
3735 p = collend;
3736 break;
3737 default:
3738 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3739 encoding, reason, startp, size, &exc,
3740 collstart-startp, collend-startp, &newpos);
3741 if (repunicode == NULL)
3742 goto onError;
3743 /* need more space? (at least enough for what we have+the
3744 replacement+the rest of the string, so we won't have to
3745 check space for encodable characters) */
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003746 respos = str - PyString_AS_STRING(res);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003747 repsize = PyUnicode_GET_SIZE(repunicode);
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003748 if (respos > PY_SSIZE_T_MAX - repsize)
3749 goto overflow;
3750 requiredsize = respos + repsize;
3751 if (requiredsize > PY_SSIZE_T_MAX - (endp - collend))
3752 goto overflow;
3753 requiredsize += endp - collend;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003754 if (requiredsize > ressize) {
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003755 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003756 requiredsize = 2*ressize;
3757 if (_PyString_Resize(&res, requiredsize)) {
3758 Py_DECREF(repunicode);
3759 goto onError;
3760 }
3761 str = PyString_AS_STRING(res) + respos;
3762 ressize = requiredsize;
3763 }
3764 /* check if there is anything unencodable in the replacement
3765 and copy it to the output */
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003766 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2, ++str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003767 c = *uni2;
3768 if (c >= limit) {
3769 raise_encode_exception(&exc, encoding, startp, size,
3770 unicodepos, unicodepos+1, reason);
3771 Py_DECREF(repunicode);
3772 goto onError;
3773 }
3774 *str = (char)c;
3775 }
3776 p = startp + newpos;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003777 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00003778 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00003779 }
3780 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003781 /* Resize if we allocated to much */
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003782 respos = str - PyString_AS_STRING(res);
3783 if (respos < ressize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003784 /* If this falls res will be NULL */
3785 _PyString_Resize(&res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003786 Py_XDECREF(errorHandler);
3787 Py_XDECREF(exc);
3788 return res;
3789
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003790 overflow:
3791 PyErr_SetString(PyExc_OverflowError,
3792 "encoded result is too long for a Python string");
3793
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003794 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003795 Py_XDECREF(res);
3796 Py_XDECREF(errorHandler);
3797 Py_XDECREF(exc);
3798 return NULL;
3799}
3800
Guido van Rossumd57fd912000-03-10 22:53:23 +00003801PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003802 Py_ssize_t size,
3803 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003804{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003805 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003806}
3807
3808PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3809{
3810 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003811 PyErr_BadArgument();
3812 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003813 }
3814 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003815 PyUnicode_GET_SIZE(unicode),
3816 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003817}
3818
3819/* --- 7-bit ASCII Codec -------------------------------------------------- */
3820
Guido van Rossumd57fd912000-03-10 22:53:23 +00003821PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003822 Py_ssize_t size,
3823 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003824{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003825 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003826 PyUnicodeObject *v;
3827 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003828 Py_ssize_t startinpos;
3829 Py_ssize_t endinpos;
3830 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003831 const char *e;
3832 PyObject *errorHandler = NULL;
3833 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003834
Guido van Rossumd57fd912000-03-10 22:53:23 +00003835 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003836 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003837 Py_UNICODE r = *(unsigned char*)s;
3838 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003839 }
Tim Petersced69f82003-09-16 20:30:58 +00003840
Guido van Rossumd57fd912000-03-10 22:53:23 +00003841 v = _PyUnicode_New(size);
3842 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003843 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003844 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003845 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003846 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003847 e = s + size;
3848 while (s < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003849 register unsigned char c = (unsigned char)*s;
3850 if (c < 128) {
3851 *p++ = c;
3852 ++s;
3853 }
3854 else {
3855 startinpos = s-starts;
3856 endinpos = startinpos + 1;
3857 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3858 if (unicode_decode_call_errorhandler(
3859 errors, &errorHandler,
3860 "ascii", "ordinal not in range(128)",
3861 starts, size, &startinpos, &endinpos, &exc, &s,
3862 &v, &outpos, &p))
3863 goto onError;
3864 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003865 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003866 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003867 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3868 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003869 Py_XDECREF(errorHandler);
3870 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003871 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003872
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003873 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003874 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003875 Py_XDECREF(errorHandler);
3876 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003877 return NULL;
3878}
3879
Guido van Rossumd57fd912000-03-10 22:53:23 +00003880PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003881 Py_ssize_t size,
3882 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003883{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003884 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003885}
3886
3887PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3888{
3889 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003890 PyErr_BadArgument();
3891 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003892 }
3893 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003894 PyUnicode_GET_SIZE(unicode),
3895 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003896}
3897
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003898#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003899
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003900/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003901
Hirokazu Yamamoto52a34922009-03-21 10:32:52 +00003902#if SIZEOF_INT < SIZEOF_SIZE_T
Martin v. Löwisd8251432006-06-14 05:21:04 +00003903#define NEED_RETRY
3904#endif
3905
/* XXX This code is limited to "true" double-byte encodings, as
   a) it assumes an incomplete character consists of a single byte, and
   b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
   encodings, see IsDBCSLeadByteEx documentation. */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003910
/* Return 1 if the byte at s[offset] is a DBCS lead byte that actually
   starts a character (rather than being the trail byte of the previous
   one), 0 otherwise. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *cur = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*cur))
        return 0;

    before = CharPrev(s, cur);
    if (before == cur)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return cur - before == 2;
}
3921
3922/*
3923 * Decode MBCS string into unicode object. If 'final' is set, converts
3924 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3925 */
3926static int decode_mbcs(PyUnicodeObject **v,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003927 const char *s, /* MBCS string */
3928 int size, /* sizeof MBCS string */
3929 int final)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003930{
3931 Py_UNICODE *p;
3932 Py_ssize_t n = 0;
3933 int usize = 0;
3934
3935 assert(size >= 0);
3936
3937 /* Skip trailing lead-byte unless 'final' is set */
3938 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003939 --size;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003940
3941 /* First get the size of the result */
3942 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003943 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3944 if (usize == 0) {
3945 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3946 return -1;
3947 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003948 }
3949
3950 if (*v == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003951 /* Create unicode object */
3952 *v = _PyUnicode_New(usize);
3953 if (*v == NULL)
3954 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003955 }
3956 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003957 /* Extend unicode object */
3958 n = PyUnicode_GET_SIZE(*v);
3959 if (_PyUnicode_Resize(v, n + usize) < 0)
3960 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003961 }
3962
3963 /* Do the conversion */
3964 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003965 p = PyUnicode_AS_UNICODE(*v) + n;
3966 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3967 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3968 return -1;
3969 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003970 }
3971
3972 return size;
3973}
3974
3975PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003976 Py_ssize_t size,
3977 const char *errors,
3978 Py_ssize_t *consumed)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003979{
3980 PyUnicodeObject *v = NULL;
3981 int done;
3982
3983 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003984 *consumed = 0;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003985
3986#ifdef NEED_RETRY
3987 retry:
3988 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003989 done = decode_mbcs(&v, s, INT_MAX, 0);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003990 else
3991#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003992 done = decode_mbcs(&v, s, (int)size, !consumed);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003993
3994 if (done < 0) {
3995 Py_XDECREF(v);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003996 return NULL;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003997 }
3998
3999 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004000 *consumed += done;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004001
4002#ifdef NEED_RETRY
4003 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004004 s += done;
4005 size -= done;
4006 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004007 }
4008#endif
4009
4010 return (PyObject *)v;
4011}
4012
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004013PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004014 Py_ssize_t size,
4015 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004016{
Martin v. Löwisd8251432006-06-14 05:21:04 +00004017 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4018}
4019
4020/*
4021 * Convert unicode into string object (MBCS).
4022 * Returns 0 if succeed, -1 otherwise.
4023 */
4024static int encode_mbcs(PyObject **repr,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004025 const Py_UNICODE *p, /* unicode */
4026 int size) /* size of unicode */
Martin v. Löwisd8251432006-06-14 05:21:04 +00004027{
4028 int mbcssize = 0;
4029 Py_ssize_t n = 0;
4030
4031 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004032
4033 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00004034 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004035 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4036 if (mbcssize == 0) {
4037 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4038 return -1;
4039 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004040 }
4041
Martin v. Löwisd8251432006-06-14 05:21:04 +00004042 if (*repr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004043 /* Create string object */
4044 *repr = PyString_FromStringAndSize(NULL, mbcssize);
4045 if (*repr == NULL)
4046 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004047 }
4048 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004049 /* Extend string object */
4050 n = PyString_Size(*repr);
4051 if (_PyString_Resize(repr, n + mbcssize) < 0)
4052 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004053 }
4054
4055 /* Do the conversion */
4056 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004057 char *s = PyString_AS_STRING(*repr) + n;
4058 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4059 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4060 return -1;
4061 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00004062 }
4063
4064 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004065}
4066
4067PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004068 Py_ssize_t size,
4069 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004070{
Martin v. Löwisd8251432006-06-14 05:21:04 +00004071 PyObject *repr = NULL;
4072 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004073
Martin v. Löwisd8251432006-06-14 05:21:04 +00004074#ifdef NEED_RETRY
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004075 retry:
Martin v. Löwisd8251432006-06-14 05:21:04 +00004076 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004077 ret = encode_mbcs(&repr, p, INT_MAX);
Martin v. Löwisd8251432006-06-14 05:21:04 +00004078 else
4079#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004080 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004081
Martin v. Löwisd8251432006-06-14 05:21:04 +00004082 if (ret < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004083 Py_XDECREF(repr);
4084 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004085 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00004086
4087#ifdef NEED_RETRY
4088 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004089 p += INT_MAX;
4090 size -= INT_MAX;
4091 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004092 }
4093#endif
4094
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004095 return repr;
4096}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004097
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004098PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4099{
4100 if (!PyUnicode_Check(unicode)) {
4101 PyErr_BadArgument();
4102 return NULL;
4103 }
4104 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004105 PyUnicode_GET_SIZE(unicode),
4106 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004107}
4108
Martin v. Löwisd8251432006-06-14 05:21:04 +00004109#undef NEED_RETRY
4110
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004111#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004112
Guido van Rossumd57fd912000-03-10 22:53:23 +00004113/* --- Character Mapping Codec -------------------------------------------- */
4114
Guido van Rossumd57fd912000-03-10 22:53:23 +00004115PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004116 Py_ssize_t size,
4117 PyObject *mapping,
4118 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004119{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004120 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004121 Py_ssize_t startinpos;
4122 Py_ssize_t endinpos;
4123 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004124 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004125 PyUnicodeObject *v;
4126 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004127 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004128 PyObject *errorHandler = NULL;
4129 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004130 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004131 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004132
Guido van Rossumd57fd912000-03-10 22:53:23 +00004133 /* Default to Latin-1 */
4134 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004135 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004136
4137 v = _PyUnicode_New(size);
4138 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004139 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004140 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004141 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004142 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004143 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004144 if (PyUnicode_CheckExact(mapping)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004145 mapstring = PyUnicode_AS_UNICODE(mapping);
4146 maplen = PyUnicode_GET_SIZE(mapping);
4147 while (s < e) {
4148 unsigned char ch = *s;
4149 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004150
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004151 if (ch < maplen)
4152 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004153
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004154 if (x == 0xfffe) {
4155 /* undefined mapping */
4156 outpos = p-PyUnicode_AS_UNICODE(v);
4157 startinpos = s-starts;
4158 endinpos = startinpos+1;
4159 if (unicode_decode_call_errorhandler(
4160 errors, &errorHandler,
4161 "charmap", "character maps to <undefined>",
4162 starts, size, &startinpos, &endinpos, &exc, &s,
4163 &v, &outpos, &p)) {
4164 goto onError;
4165 }
4166 continue;
4167 }
4168 *p++ = x;
4169 ++s;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004170 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004171 }
4172 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004173 while (s < e) {
4174 unsigned char ch = *s;
4175 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004176
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004177 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4178 w = PyInt_FromLong((long)ch);
4179 if (w == NULL)
4180 goto onError;
4181 x = PyObject_GetItem(mapping, w);
4182 Py_DECREF(w);
4183 if (x == NULL) {
4184 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4185 /* No mapping found means: mapping is undefined. */
4186 PyErr_Clear();
Serhiy Storchaka95997452013-01-15 14:42:59 +02004187 goto Undefined;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004188 } else
4189 goto onError;
4190 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004191
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004192 /* Apply mapping */
Serhiy Storchaka95997452013-01-15 14:42:59 +02004193 if (x == Py_None)
4194 goto Undefined;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004195 if (PyInt_Check(x)) {
4196 long value = PyInt_AS_LONG(x);
Serhiy Storchaka95997452013-01-15 14:42:59 +02004197 if (value == 0xFFFE)
4198 goto Undefined;
Antoine Pitroue3ae3212012-11-17 21:14:58 +01004199 if (value < 0 || value > 0x10FFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004200 PyErr_SetString(PyExc_TypeError,
Antoine Pitroue3ae3212012-11-17 21:14:58 +01004201 "character mapping must be in range(0x110000)");
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004202 Py_DECREF(x);
4203 goto onError;
4204 }
Antoine Pitroue3ae3212012-11-17 21:14:58 +01004205
4206#ifndef Py_UNICODE_WIDE
4207 if (value > 0xFFFF) {
4208 /* see the code for 1-n mapping below */
4209 if (extrachars < 2) {
4210 /* resize first */
4211 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4212 Py_ssize_t needed = 10 - extrachars;
4213 extrachars += needed;
4214 /* XXX overflow detection missing */
4215 if (_PyUnicode_Resize(&v,
4216 PyUnicode_GET_SIZE(v) + needed) < 0) {
4217 Py_DECREF(x);
4218 goto onError;
4219 }
4220 p = PyUnicode_AS_UNICODE(v) + oldpos;
4221 }
4222 value -= 0x10000;
4223 *p++ = 0xD800 | (value >> 10);
4224 *p++ = 0xDC00 | (value & 0x3FF);
4225 extrachars -= 2;
4226 }
4227 else
4228#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004229 *p++ = (Py_UNICODE)value;
4230 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004231 else if (PyUnicode_Check(x)) {
4232 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004233
Serhiy Storchaka95997452013-01-15 14:42:59 +02004234 if (targetsize == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004235 /* 1-1 mapping */
Serhiy Storchaka95997452013-01-15 14:42:59 +02004236 Py_UNICODE value = *PyUnicode_AS_UNICODE(x);
4237 if (value == 0xFFFE)
4238 goto Undefined;
4239 *p++ = value;
4240 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004241 else if (targetsize > 1) {
4242 /* 1-n mapping */
4243 if (targetsize > extrachars) {
4244 /* resize first */
4245 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4246 Py_ssize_t needed = (targetsize - extrachars) + \
4247 (targetsize << 2);
4248 extrachars += needed;
4249 /* XXX overflow detection missing */
4250 if (_PyUnicode_Resize(&v,
4251 PyUnicode_GET_SIZE(v) + needed) < 0) {
4252 Py_DECREF(x);
4253 goto onError;
4254 }
4255 p = PyUnicode_AS_UNICODE(v) + oldpos;
4256 }
4257 Py_UNICODE_COPY(p,
4258 PyUnicode_AS_UNICODE(x),
4259 targetsize);
4260 p += targetsize;
4261 extrachars -= targetsize;
4262 }
4263 /* 1-0 mapping: skip the character */
4264 }
4265 else {
4266 /* wrong return value */
4267 PyErr_SetString(PyExc_TypeError,
4268 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004269 Py_DECREF(x);
4270 goto onError;
4271 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004272 Py_DECREF(x);
4273 ++s;
Serhiy Storchaka95997452013-01-15 14:42:59 +02004274 continue;
4275Undefined:
4276 /* undefined mapping */
4277 Py_XDECREF(x);
4278 outpos = p-PyUnicode_AS_UNICODE(v);
4279 startinpos = s-starts;
4280 endinpos = startinpos+1;
4281 if (unicode_decode_call_errorhandler(
4282 errors, &errorHandler,
4283 "charmap", "character maps to <undefined>",
4284 starts, size, &startinpos, &endinpos, &exc, &s,
4285 &v, &outpos, &p)) {
4286 goto onError;
4287 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004288 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004289 }
4290 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004291 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4292 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004293 Py_XDECREF(errorHandler);
4294 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004295 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004296
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004297 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004298 Py_XDECREF(errorHandler);
4299 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004300 Py_XDECREF(v);
4301 return NULL;
4302}
4303
Martin v. Löwis3f767792006-06-04 19:36:28 +00004304/* Charmap encoding: the lookup table */
4305
4306struct encoding_map{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004307 PyObject_HEAD
4308 unsigned char level1[32];
4309 int count2, count3;
4310 unsigned char level23[1];
Martin v. Löwis3f767792006-06-04 19:36:28 +00004311};
4312
4313static PyObject*
4314encoding_map_size(PyObject *obj, PyObject* args)
4315{
4316 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004317 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Martin v. Löwis3f767792006-06-04 19:36:28 +00004318 128*map->count3);
4319}
4320
4321static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004322 {"size", encoding_map_size, METH_NOARGS,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004323 PyDoc_STR("Return the size (in bytes) of this object") },
4324 { 0 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004325};
4326
4327static void
4328encoding_map_dealloc(PyObject* o)
4329{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004330 PyObject_FREE(o);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004331}
4332
4333static PyTypeObject EncodingMapType = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004334 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004335 "EncodingMap", /*tp_name*/
4336 sizeof(struct encoding_map), /*tp_basicsize*/
4337 0, /*tp_itemsize*/
4338 /* methods */
4339 encoding_map_dealloc, /*tp_dealloc*/
4340 0, /*tp_print*/
4341 0, /*tp_getattr*/
4342 0, /*tp_setattr*/
4343 0, /*tp_compare*/
4344 0, /*tp_repr*/
4345 0, /*tp_as_number*/
4346 0, /*tp_as_sequence*/
4347 0, /*tp_as_mapping*/
4348 0, /*tp_hash*/
4349 0, /*tp_call*/
4350 0, /*tp_str*/
4351 0, /*tp_getattro*/
4352 0, /*tp_setattro*/
4353 0, /*tp_as_buffer*/
4354 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4355 0, /*tp_doc*/
4356 0, /*tp_traverse*/
4357 0, /*tp_clear*/
4358 0, /*tp_richcompare*/
4359 0, /*tp_weaklistoffset*/
4360 0, /*tp_iter*/
4361 0, /*tp_iternext*/
4362 encoding_map_methods, /*tp_methods*/
4363 0, /*tp_members*/
4364 0, /*tp_getset*/
4365 0, /*tp_base*/
4366 0, /*tp_dict*/
4367 0, /*tp_descr_get*/
4368 0, /*tp_descr_set*/
4369 0, /*tp_dictoffset*/
4370 0, /*tp_init*/
4371 0, /*tp_alloc*/
4372 0, /*tp_new*/
4373 0, /*tp_free*/
4374 0, /*tp_is_gc*/
Martin v. Löwis3f767792006-06-04 19:36:28 +00004375};
4376
4377PyObject*
4378PyUnicode_BuildEncodingMap(PyObject* string)
4379{
4380 Py_UNICODE *decode;
4381 PyObject *result;
4382 struct encoding_map *mresult;
4383 int i;
4384 int need_dict = 0;
4385 unsigned char level1[32];
4386 unsigned char level2[512];
4387 unsigned char *mlevel1, *mlevel2, *mlevel3;
4388 int count2 = 0, count3 = 0;
4389
4390 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4391 PyErr_BadArgument();
4392 return NULL;
4393 }
4394 decode = PyUnicode_AS_UNICODE(string);
4395 memset(level1, 0xFF, sizeof level1);
4396 memset(level2, 0xFF, sizeof level2);
4397
4398 /* If there isn't a one-to-one mapping of NULL to \0,
4399 or if there are non-BMP characters, we need to use
4400 a mapping dictionary. */
4401 if (decode[0] != 0)
4402 need_dict = 1;
4403 for (i = 1; i < 256; i++) {
4404 int l1, l2;
4405 if (decode[i] == 0
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004406#ifdef Py_UNICODE_WIDE
Martin v. Löwis3f767792006-06-04 19:36:28 +00004407 || decode[i] > 0xFFFF
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004408#endif
4409 ) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004410 need_dict = 1;
4411 break;
4412 }
4413 if (decode[i] == 0xFFFE)
4414 /* unmapped character */
4415 continue;
4416 l1 = decode[i] >> 11;
4417 l2 = decode[i] >> 7;
4418 if (level1[l1] == 0xFF)
4419 level1[l1] = count2++;
4420 if (level2[l2] == 0xFF)
Benjamin Peterson857ce152009-01-31 16:29:18 +00004421 level2[l2] = count3++;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004422 }
4423
4424 if (count2 >= 0xFF || count3 >= 0xFF)
4425 need_dict = 1;
4426
4427 if (need_dict) {
4428 PyObject *result = PyDict_New();
4429 PyObject *key, *value;
4430 if (!result)
4431 return NULL;
4432 for (i = 0; i < 256; i++) {
Brett Cannona7f13ee2010-05-04 01:16:51 +00004433 value = NULL;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004434 key = PyInt_FromLong(decode[i]);
4435 value = PyInt_FromLong(i);
4436 if (!key || !value)
4437 goto failed1;
4438 if (PyDict_SetItem(result, key, value) == -1)
4439 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00004440 Py_DECREF(key);
4441 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004442 }
4443 return result;
4444 failed1:
4445 Py_XDECREF(key);
4446 Py_XDECREF(value);
4447 Py_DECREF(result);
4448 return NULL;
4449 }
4450
4451 /* Create a three-level trie */
4452 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4453 16*count2 + 128*count3 - 1);
4454 if (!result)
4455 return PyErr_NoMemory();
4456 PyObject_Init(result, &EncodingMapType);
4457 mresult = (struct encoding_map*)result;
4458 mresult->count2 = count2;
4459 mresult->count3 = count3;
4460 mlevel1 = mresult->level1;
4461 mlevel2 = mresult->level23;
4462 mlevel3 = mresult->level23 + 16*count2;
4463 memcpy(mlevel1, level1, 32);
4464 memset(mlevel2, 0xFF, 16*count2);
4465 memset(mlevel3, 0, 128*count3);
4466 count3 = 0;
4467 for (i = 1; i < 256; i++) {
4468 int o1, o2, o3, i2, i3;
4469 if (decode[i] == 0xFFFE)
4470 /* unmapped character */
4471 continue;
4472 o1 = decode[i]>>11;
4473 o2 = (decode[i]>>7) & 0xF;
4474 i2 = 16*mlevel1[o1] + o2;
4475 if (mlevel2[i2] == 0xFF)
4476 mlevel2[i2] = count3++;
4477 o3 = decode[i] & 0x7F;
4478 i3 = 128*mlevel2[i2] + o3;
4479 mlevel3[i3] = i;
4480 }
4481 return result;
4482}
4483
4484static int
4485encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4486{
4487 struct encoding_map *map = (struct encoding_map*)mapping;
4488 int l1 = c>>11;
4489 int l2 = (c>>7) & 0xF;
4490 int l3 = c & 0x7F;
4491 int i;
4492
4493#ifdef Py_UNICODE_WIDE
4494 if (c > 0xFFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004495 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004496 }
4497#endif
4498 if (c == 0)
4499 return 0;
4500 /* level 1*/
4501 i = map->level1[l1];
4502 if (i == 0xFF) {
4503 return -1;
4504 }
4505 /* level 2*/
4506 i = map->level23[16*i+l2];
4507 if (i == 0xFF) {
4508 return -1;
4509 }
4510 /* level 3 */
4511 i = map->level23[16*map->count2 + 128*i + l3];
4512 if (i == 0) {
4513 return -1;
4514 }
4515 return i;
4516}
4517
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004518/* Lookup the character ch in the mapping. If the character
4519 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004520 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004521static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004522{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004523 PyObject *w = PyInt_FromLong((long)c);
4524 PyObject *x;
4525
4526 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004527 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004528 x = PyObject_GetItem(mapping, w);
4529 Py_DECREF(w);
4530 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004531 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4532 /* No mapping found means: mapping is undefined. */
4533 PyErr_Clear();
4534 x = Py_None;
4535 Py_INCREF(x);
4536 return x;
4537 } else
4538 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004539 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004540 else if (x == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004541 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004542 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004543 long value = PyInt_AS_LONG(x);
4544 if (value < 0 || value > 255) {
4545 PyErr_SetString(PyExc_TypeError,
4546 "character mapping must be in range(256)");
4547 Py_DECREF(x);
4548 return NULL;
4549 }
4550 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004551 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004552 else if (PyString_Check(x))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004553 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004554 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004555 /* wrong return value */
4556 PyErr_SetString(PyExc_TypeError,
4557 "character mapping must return integer, None or str");
4558 Py_DECREF(x);
4559 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004560 }
4561}
4562
Martin v. Löwis3f767792006-06-04 19:36:28 +00004563static int
4564charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4565{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004566 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4567 /* exponentially overallocate to minimize reallocations */
4568 if (requiredsize < 2*outsize)
4569 requiredsize = 2*outsize;
4570 if (_PyString_Resize(outobj, requiredsize)) {
4571 return 0;
4572 }
4573 return 1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004574}
4575
/* Outcome of encoding a single character with a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no encoding for the character */
    enc_EXCEPTION   /* a lookup or resize error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004579/* lookup the character, put the result in the output string and adjust
4580 various state variables. Reallocate the output string if not enough
4581 space is available. Return a new reference to the object that
4582 was put in the output buffer, or Py_None, if the mapping was undefined
4583 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004584 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004585static
Martin v. Löwis3f767792006-06-04 19:36:28 +00004586charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004587 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004588{
Martin v. Löwis3f767792006-06-04 19:36:28 +00004589 PyObject *rep;
4590 char *outstart;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004591 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004592
Christian Heimese93237d2007-12-19 02:37:44 +00004593 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004594 int res = encoding_map_lookup(c, mapping);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004595 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004596 if (res == -1)
4597 return enc_FAILED;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004598 if (outsize<requiredsize)
4599 if (!charmapencode_resize(outobj, outpos, requiredsize))
4600 return enc_EXCEPTION;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004601 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004602 outstart[(*outpos)++] = (char)res;
4603 return enc_SUCCESS;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004604 }
4605
4606 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004607 if (rep==NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004608 return enc_EXCEPTION;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004609 else if (rep==Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004610 Py_DECREF(rep);
4611 return enc_FAILED;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004612 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004613 if (PyInt_Check(rep)) {
4614 Py_ssize_t requiredsize = *outpos+1;
4615 if (outsize<requiredsize)
4616 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4617 Py_DECREF(rep);
4618 return enc_EXCEPTION;
4619 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004620 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004621 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004622 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004623 else {
4624 const char *repchars = PyString_AS_STRING(rep);
4625 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4626 Py_ssize_t requiredsize = *outpos+repsize;
4627 if (outsize<requiredsize)
4628 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4629 Py_DECREF(rep);
4630 return enc_EXCEPTION;
4631 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004632 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004633 memcpy(outstart + *outpos, repchars, repsize);
4634 *outpos += repsize;
4635 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004636 }
Georg Brandl9f167602006-06-04 21:46:16 +00004637 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004638 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004639}
4640
4641/* handle an error in PyUnicode_EncodeCharmap
4642 Return 0 on success, -1 on error */
4643static
4644int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004645 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004646 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004647 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004648 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004649{
4650 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004651 Py_ssize_t repsize;
4652 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004653 Py_UNICODE *uni2;
4654 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004655 Py_ssize_t collstartpos = *inpos;
4656 Py_ssize_t collendpos = *inpos+1;
4657 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004658 char *encoding = "charmap";
4659 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00004660 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004661
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004662 /* find all unencodable characters */
4663 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004664 PyObject *rep;
Christian Heimese93237d2007-12-19 02:37:44 +00004665 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004666 int res = encoding_map_lookup(p[collendpos], mapping);
4667 if (res != -1)
4668 break;
4669 ++collendpos;
4670 continue;
4671 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004672
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004673 rep = charmapencode_lookup(p[collendpos], mapping);
4674 if (rep==NULL)
4675 return -1;
4676 else if (rep!=Py_None) {
4677 Py_DECREF(rep);
4678 break;
4679 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004680 Py_DECREF(rep);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004681 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004682 }
4683 /* cache callback name lookup
4684 * (if not done yet, i.e. it's the first error) */
4685 if (*known_errorHandler==-1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004686 if ((errors==NULL) || (!strcmp(errors, "strict")))
4687 *known_errorHandler = 1;
4688 else if (!strcmp(errors, "replace"))
4689 *known_errorHandler = 2;
4690 else if (!strcmp(errors, "ignore"))
4691 *known_errorHandler = 3;
4692 else if (!strcmp(errors, "xmlcharrefreplace"))
4693 *known_errorHandler = 4;
4694 else
4695 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004696 }
4697 switch (*known_errorHandler) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004698 case 1: /* strict */
4699 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4700 return -1;
4701 case 2: /* replace */
4702 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004703 x = charmapencode_output('?', mapping, res, respos);
4704 if (x==enc_EXCEPTION) {
4705 return -1;
4706 }
4707 else if (x==enc_FAILED) {
4708 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4709 return -1;
4710 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004711 }
4712 /* fall through */
4713 case 3: /* ignore */
4714 *inpos = collendpos;
4715 break;
4716 case 4: /* xmlcharrefreplace */
Serhiy Storchakae822b032013-08-06 16:56:26 +03004717 /* generate replacement */
4718 for (collpos = collstartpos; collpos < collendpos;) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004719 char buffer[2+29+1+1];
4720 char *cp;
Serhiy Storchakae822b032013-08-06 16:56:26 +03004721 Py_UCS4 ch = p[collpos++];
4722#ifndef Py_UNICODE_WIDE
4723 if ((0xD800 <= ch && ch <= 0xDBFF) &&
4724 (collpos < collendpos) &&
4725 (0xDC00 <= p[collpos] && p[collpos] <= 0xDFFF)) {
4726 ch = ((((ch & 0x03FF) << 10) |
4727 ((Py_UCS4)p[collpos++] & 0x03FF)) + 0x10000);
4728 }
4729#endif
4730 sprintf(buffer, "&#%d;", (int)ch);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004731 for (cp = buffer; *cp; ++cp) {
4732 x = charmapencode_output(*cp, mapping, res, respos);
4733 if (x==enc_EXCEPTION)
4734 return -1;
4735 else if (x==enc_FAILED) {
4736 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4737 return -1;
4738 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004739 }
4740 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004741 *inpos = collendpos;
4742 break;
4743 default:
4744 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004745 encoding, reason, p, size, exceptionObject,
4746 collstartpos, collendpos, &newpos);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004747 if (repunicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004748 return -1;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004749 /* generate replacement */
4750 repsize = PyUnicode_GET_SIZE(repunicode);
4751 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004752 x = charmapencode_output(*uni2, mapping, res, respos);
4753 if (x==enc_EXCEPTION) {
4754 return -1;
4755 }
4756 else if (x==enc_FAILED) {
4757 Py_DECREF(repunicode);
4758 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4759 return -1;
4760 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004761 }
4762 *inpos = newpos;
4763 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004764 }
4765 return 0;
4766}
4767
Guido van Rossumd57fd912000-03-10 22:53:23 +00004768PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004769 Py_ssize_t size,
4770 PyObject *mapping,
4771 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004772{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004773 /* output object */
4774 PyObject *res = NULL;
4775 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004776 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004777 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004778 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004779 PyObject *errorHandler = NULL;
4780 PyObject *exc = NULL;
4781 /* the following variable is used for caching string comparisons
4782 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4783 * 3=ignore, 4=xmlcharrefreplace */
4784 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004785
4786 /* Default to Latin-1 */
4787 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004788 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004789
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004790 /* allocate enough for a simple encoding without
4791 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004792 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004793 if (res == NULL)
4794 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004795 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004796 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004797
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004798 while (inpos<size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004799 /* try to encode it */
4800 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4801 if (x==enc_EXCEPTION) /* error */
4802 goto onError;
4803 if (x==enc_FAILED) { /* unencodable character */
4804 if (charmap_encoding_error(p, size, &inpos, mapping,
4805 &exc,
4806 &known_errorHandler, &errorHandler, errors,
4807 &res, &respos)) {
4808 goto onError;
4809 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004810 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004811 else
4812 /* done with this character => adjust input position */
4813 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004814 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004815
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004816 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004817 if (respos<PyString_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004818 if (_PyString_Resize(&res, respos))
4819 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004820 }
4821 Py_XDECREF(exc);
4822 Py_XDECREF(errorHandler);
4823 return res;
4824
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004825 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004826 Py_XDECREF(res);
4827 Py_XDECREF(exc);
4828 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004829 return NULL;
4830}
4831
4832PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004833 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004834{
4835 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004836 PyErr_BadArgument();
4837 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004838 }
4839 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004840 PyUnicode_GET_SIZE(unicode),
4841 mapping,
4842 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004843}
4844
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004845/* create or adjust a UnicodeTranslateError */
4846static void make_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004847 const Py_UNICODE *unicode, Py_ssize_t size,
4848 Py_ssize_t startpos, Py_ssize_t endpos,
4849 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004850{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004851 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004852 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004853 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004854 }
4855 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004856 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4857 goto onError;
4858 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4859 goto onError;
4860 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4861 goto onError;
4862 return;
4863 onError:
Serhiy Storchaka98a97222014-02-09 13:14:04 +02004864 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004865 }
4866}
4867
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004868/* raises a UnicodeTranslateError */
4869static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004870 const Py_UNICODE *unicode, Py_ssize_t size,
4871 Py_ssize_t startpos, Py_ssize_t endpos,
4872 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004873{
4874 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004875 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004876 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004877 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004878}
4879
4880/* error handling callback helper:
4881 build arguments, call the callback and check the arguments,
4882 put the result into newpos and return the replacement string, which
4883 has to be freed by the caller */
4884static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004885 PyObject **errorHandler,
4886 const char *reason,
4887 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4888 Py_ssize_t startpos, Py_ssize_t endpos,
4889 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004890{
Martin v. Löwis412fb672006-04-13 06:34:32 +00004891 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004892
Martin v. Löwis412fb672006-04-13 06:34:32 +00004893 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004894 PyObject *restuple;
4895 PyObject *resunicode;
4896
4897 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004898 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004899 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004900 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004901 }
4902
4903 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004904 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004905 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004906 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004907
4908 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004909 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004910 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004911 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004912 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00004913 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004914 Py_DECREF(restuple);
4915 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004916 }
4917 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004918 &resunicode, &i_newpos)) {
4919 Py_DECREF(restuple);
4920 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004921 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004922 if (i_newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004923 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004924 else
4925 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004926 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004927 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4928 Py_DECREF(restuple);
4929 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004930 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004931 Py_INCREF(resunicode);
4932 Py_DECREF(restuple);
4933 return resunicode;
4934}
4935
4936/* Lookup the character ch in the mapping and put the result in result,
4937 which must be decrefed by the caller.
4938 Return 0 on success, -1 on error */
4939static
4940int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4941{
4942 PyObject *w = PyInt_FromLong((long)c);
4943 PyObject *x;
4944
4945 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004946 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004947 x = PyObject_GetItem(mapping, w);
4948 Py_DECREF(w);
4949 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004950 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4951 /* No mapping found means: use 1:1 mapping. */
4952 PyErr_Clear();
4953 *result = NULL;
4954 return 0;
4955 } else
4956 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004957 }
4958 else if (x == Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004959 *result = x;
4960 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004961 }
4962 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004963 long value = PyInt_AS_LONG(x);
4964 long max = PyUnicode_GetMax();
4965 if (value < 0 || value > max) {
4966 PyErr_Format(PyExc_TypeError,
4967 "character mapping must be in range(0x%lx)", max+1);
4968 Py_DECREF(x);
4969 return -1;
4970 }
4971 *result = x;
4972 return 0;
4973 }
4974 else if (PyUnicode_Check(x)) {
4975 *result = x;
4976 return 0;
4977 }
4978 else {
4979 /* wrong return value */
4980 PyErr_SetString(PyExc_TypeError,
4981 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004982 Py_DECREF(x);
4983 return -1;
4984 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004985}
4986/* ensure that *outobj is at least requiredsize characters long,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004987 if not reallocate and adjust various state variables.
4988 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004989static
Walter Dörwald4894c302003-10-24 14:25:28 +00004990int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004991 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004992{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004993 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004994 if (requiredsize > oldsize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004995 /* remember old output position */
4996 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4997 /* exponentially overallocate to minimize reallocations */
4998 if (requiredsize < 2 * oldsize)
4999 requiredsize = 2 * oldsize;
5000 if (PyUnicode_Resize(outobj, requiredsize) < 0)
5001 return -1;
5002 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005003 }
5004 return 0;
5005}
5006/* lookup the character, put the result in the output string and adjust
5007 various state variables. Return a new reference to the object that
5008 was put in the output buffer in *result, or Py_None, if the mapping was
5009 undefined (in which case no character was written).
5010 The called must decref result.
5011 Return 0 on success, -1 on error. */
5012static
Walter Dörwald4894c302003-10-24 14:25:28 +00005013int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005014 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5015 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005016{
Walter Dörwald4894c302003-10-24 14:25:28 +00005017 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005018 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005019 if (*res==NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005020 /* not found => default to 1:1 mapping */
5021 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005022 }
5023 else if (*res==Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005024 ;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005025 else if (PyInt_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005026 /* no overflow check, because we know that the space is enough */
5027 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005028 }
5029 else if (PyUnicode_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005030 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5031 if (repsize==1) {
5032 /* no overflow check, because we know that the space is enough */
5033 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5034 }
5035 else if (repsize!=0) {
5036 /* more than one character */
5037 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5038 (insize - (curinp-startinp)) +
5039 repsize - 1;
5040 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5041 return -1;
5042 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5043 *outp += repsize;
5044 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005045 }
5046 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005047 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005048 return 0;
5049}
5050
5051PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005052 Py_ssize_t size,
5053 PyObject *mapping,
5054 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005055{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005056 /* output object */
5057 PyObject *res = NULL;
5058 /* pointers to the beginning and end+1 of input */
5059 const Py_UNICODE *startp = p;
5060 const Py_UNICODE *endp = p + size;
5061 /* pointer into the output */
5062 Py_UNICODE *str;
5063 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005064 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005065 char *reason = "character maps to <undefined>";
5066 PyObject *errorHandler = NULL;
5067 PyObject *exc = NULL;
5068 /* the following variable is used for caching string comparisons
5069 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5070 * 3=ignore, 4=xmlcharrefreplace */
5071 int known_errorHandler = -1;
5072
Guido van Rossumd57fd912000-03-10 22:53:23 +00005073 if (mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005074 PyErr_BadArgument();
5075 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005076 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005077
5078 /* allocate enough for a simple 1:1 translation without
5079 replacements, if we need more, we'll resize */
5080 res = PyUnicode_FromUnicode(NULL, size);
5081 if (res == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005082 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005083 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005084 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005085 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005086
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005087 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005088 /* try to encode it */
5089 PyObject *x = NULL;
5090 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5091 Py_XDECREF(x);
5092 goto onError;
5093 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005094 Py_XDECREF(x);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005095 if (x!=Py_None) /* it worked => adjust input pointer */
5096 ++p;
5097 else { /* untranslatable character */
5098 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5099 Py_ssize_t repsize;
5100 Py_ssize_t newpos;
5101 Py_UNICODE *uni2;
5102 /* startpos for collecting untranslatable chars */
5103 const Py_UNICODE *collstart = p;
5104 const Py_UNICODE *collend = p+1;
5105 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005106
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005107 /* find all untranslatable characters */
5108 while (collend < endp) {
5109 if (charmaptranslate_lookup(*collend, mapping, &x))
5110 goto onError;
5111 Py_XDECREF(x);
5112 if (x!=Py_None)
5113 break;
5114 ++collend;
5115 }
5116 /* cache callback name lookup
5117 * (if not done yet, i.e. it's the first error) */
5118 if (known_errorHandler==-1) {
5119 if ((errors==NULL) || (!strcmp(errors, "strict")))
5120 known_errorHandler = 1;
5121 else if (!strcmp(errors, "replace"))
5122 known_errorHandler = 2;
5123 else if (!strcmp(errors, "ignore"))
5124 known_errorHandler = 3;
5125 else if (!strcmp(errors, "xmlcharrefreplace"))
5126 known_errorHandler = 4;
5127 else
5128 known_errorHandler = 0;
5129 }
5130 switch (known_errorHandler) {
5131 case 1: /* strict */
5132 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson857ce152009-01-31 16:29:18 +00005133 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005134 case 2: /* replace */
5135 /* No need to check for space, this is a 1:1 replacement */
5136 for (coll = collstart; coll<collend; ++coll)
5137 *str++ = '?';
5138 /* fall through */
5139 case 3: /* ignore */
5140 p = collend;
5141 break;
5142 case 4: /* xmlcharrefreplace */
5143 /* generate replacement (temporarily (mis)uses p) */
Serhiy Storchakae822b032013-08-06 16:56:26 +03005144 for (p = collstart; p < collend;) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005145 char buffer[2+29+1+1];
5146 char *cp;
Serhiy Storchakae822b032013-08-06 16:56:26 +03005147 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
5148 sprintf(buffer, "&#%d;", (int)ch);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005149 if (charmaptranslate_makespace(&res, &str,
5150 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5151 goto onError;
5152 for (cp = buffer; *cp; ++cp)
5153 *str++ = *cp;
5154 }
5155 p = collend;
5156 break;
5157 default:
5158 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5159 reason, startp, size, &exc,
5160 collstart-startp, collend-startp, &newpos);
5161 if (repunicode == NULL)
5162 goto onError;
5163 /* generate replacement */
5164 repsize = PyUnicode_GET_SIZE(repunicode);
5165 if (charmaptranslate_makespace(&res, &str,
5166 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5167 Py_DECREF(repunicode);
5168 goto onError;
5169 }
5170 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5171 *str++ = *uni2;
5172 p = startp + newpos;
5173 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00005174 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005175 }
5176 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005177 /* Resize if we allocated to much */
5178 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005179 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005180 if (PyUnicode_Resize(&res, respos) < 0)
5181 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005182 }
5183 Py_XDECREF(exc);
5184 Py_XDECREF(errorHandler);
5185 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005186
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005187 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005188 Py_XDECREF(res);
5189 Py_XDECREF(exc);
5190 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005191 return NULL;
5192}
5193
5194PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005195 PyObject *mapping,
5196 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005197{
5198 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005199
Guido van Rossumd57fd912000-03-10 22:53:23 +00005200 str = PyUnicode_FromObject(str);
5201 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005202 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005203 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005204 PyUnicode_GET_SIZE(str),
5205 mapping,
5206 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005207 Py_DECREF(str);
5208 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005209
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005210 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005211 Py_XDECREF(str);
5212 return NULL;
5213}
Tim Petersced69f82003-09-16 20:30:58 +00005214
Guido van Rossum9e896b32000-04-05 20:11:21 +00005215/* --- Decimal Encoder ---------------------------------------------------- */
5216
5217int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005218 Py_ssize_t length,
5219 char *output,
5220 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005221{
5222 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005223 PyObject *errorHandler = NULL;
5224 PyObject *exc = NULL;
5225 const char *encoding = "decimal";
5226 const char *reason = "invalid decimal Unicode string";
5227 /* the following variable is used for caching string comparisons
5228 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5229 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005230
5231 if (output == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005232 PyErr_BadArgument();
5233 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005234 }
5235
5236 p = s;
5237 end = s + length;
5238 while (p < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005239 register Py_UNICODE ch = *p;
5240 int decimal;
5241 PyObject *repunicode;
5242 Py_ssize_t repsize;
5243 Py_ssize_t newpos;
5244 Py_UNICODE *uni2;
5245 Py_UNICODE *collstart;
5246 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005247
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005248 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005249 *output++ = ' ';
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005250 ++p;
5251 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005252 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005253 decimal = Py_UNICODE_TODECIMAL(ch);
5254 if (decimal >= 0) {
5255 *output++ = '0' + decimal;
5256 ++p;
5257 continue;
5258 }
5259 if (0 < ch && ch < 256) {
5260 *output++ = (char)ch;
5261 ++p;
5262 continue;
5263 }
5264 /* All other characters are considered unencodable */
5265 collstart = p;
Victor Stinner975134e2011-11-22 01:54:19 +01005266 for (collend = p+1; collend < end; collend++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005267 if ((0 < *collend && *collend < 256) ||
Victor Stinner975134e2011-11-22 01:54:19 +01005268 Py_UNICODE_ISSPACE(*collend) ||
5269 0 <= Py_UNICODE_TODECIMAL(*collend))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005270 break;
5271 }
5272 /* cache callback name lookup
5273 * (if not done yet, i.e. it's the first error) */
5274 if (known_errorHandler==-1) {
5275 if ((errors==NULL) || (!strcmp(errors, "strict")))
5276 known_errorHandler = 1;
5277 else if (!strcmp(errors, "replace"))
5278 known_errorHandler = 2;
5279 else if (!strcmp(errors, "ignore"))
5280 known_errorHandler = 3;
5281 else if (!strcmp(errors, "xmlcharrefreplace"))
5282 known_errorHandler = 4;
5283 else
5284 known_errorHandler = 0;
5285 }
5286 switch (known_errorHandler) {
5287 case 1: /* strict */
5288 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5289 goto onError;
5290 case 2: /* replace */
5291 for (p = collstart; p < collend; ++p)
5292 *output++ = '?';
5293 /* fall through */
5294 case 3: /* ignore */
5295 p = collend;
5296 break;
5297 case 4: /* xmlcharrefreplace */
5298 /* generate replacement (temporarily (mis)uses p) */
Serhiy Storchakae822b032013-08-06 16:56:26 +03005299 for (p = collstart; p < collend;) {
5300 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
5301 output += sprintf(output, "&#%d;", ch);
5302 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005303 p = collend;
5304 break;
5305 default:
5306 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5307 encoding, reason, s, length, &exc,
5308 collstart-s, collend-s, &newpos);
5309 if (repunicode == NULL)
5310 goto onError;
5311 /* generate replacement */
5312 repsize = PyUnicode_GET_SIZE(repunicode);
5313 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5314 Py_UNICODE ch = *uni2;
5315 if (Py_UNICODE_ISSPACE(ch))
5316 *output++ = ' ';
5317 else {
5318 decimal = Py_UNICODE_TODECIMAL(ch);
5319 if (decimal >= 0)
5320 *output++ = '0' + decimal;
5321 else if (0 < ch && ch < 256)
5322 *output++ = (char)ch;
5323 else {
5324 Py_DECREF(repunicode);
5325 raise_encode_exception(&exc, encoding,
5326 s, length, collstart-s, collend-s, reason);
5327 goto onError;
5328 }
5329 }
5330 }
5331 p = s + newpos;
5332 Py_DECREF(repunicode);
5333 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005334 }
5335 /* 0-terminate the output string */
5336 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005337 Py_XDECREF(exc);
5338 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005339 return 0;
5340
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005341 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005342 Py_XDECREF(exc);
5343 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005344 return -1;
5345}
5346
Guido van Rossumd57fd912000-03-10 22:53:23 +00005347/* --- Helpers ------------------------------------------------------------ */
5348
Eric Smitha9f7d622008-02-17 19:46:49 +00005349#include "stringlib/unicodedefs.h"
Fredrik Lundha50d2012006-05-26 17:04:58 +00005350#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005351
5352#include "stringlib/count.h"
5353#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005354#include "stringlib/partition.h"
Antoine Pitrou64672132010-01-13 07:55:48 +00005355#include "stringlib/split.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005356
/* helper macro to fixup start/end slice values: clamp end to len, add len
   to negative start/end once and clamp what is still negative to 0.
   start and end must be modifiable lvalues; each argument may be evaluated
   more than once.  Wrapped in do { } while (0) so that an invocation
   followed by ';' is exactly one statement (safe under an unbraced
   if/else). */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Fredrik Lundhc8162812006-05-26 19:33:03 +00005371
Martin v. Löwis18e16552006-02-15 17:27:45 +00005372Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005373 PyObject *substr,
5374 Py_ssize_t start,
5375 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005376{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005377 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005378 PyUnicodeObject* str_obj;
5379 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005380
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005381 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5382 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005383 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005384 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5385 if (!sub_obj) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005386 Py_DECREF(str_obj);
5387 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005388 }
Tim Petersced69f82003-09-16 20:30:58 +00005389
Antoine Pitrou64672132010-01-13 07:55:48 +00005390 ADJUST_INDICES(start, end, str_obj->length);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005391 result = stringlib_count(
Antoine Pitrou64672132010-01-13 07:55:48 +00005392 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
5393 PY_SSIZE_T_MAX
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005394 );
5395
5396 Py_DECREF(sub_obj);
5397 Py_DECREF(str_obj);
5398
Guido van Rossumd57fd912000-03-10 22:53:23 +00005399 return result;
5400}
5401
Martin v. Löwis18e16552006-02-15 17:27:45 +00005402Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005403 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005404 Py_ssize_t start,
5405 Py_ssize_t end,
5406 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005407{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005408 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005409
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005410 str = PyUnicode_FromObject(str);
5411 if (!str)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005412 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005413 sub = PyUnicode_FromObject(sub);
5414 if (!sub) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005415 Py_DECREF(str);
5416 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005417 }
Tim Petersced69f82003-09-16 20:30:58 +00005418
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005419 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005420 result = stringlib_find_slice(
5421 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5422 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5423 start, end
5424 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005425 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005426 result = stringlib_rfind_slice(
5427 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5428 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5429 start, end
5430 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005431
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005432 Py_DECREF(str);
5433 Py_DECREF(sub);
5434
Guido van Rossumd57fd912000-03-10 22:53:23 +00005435 return result;
5436}
5437
Tim Petersced69f82003-09-16 20:30:58 +00005438static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005439int tailmatch(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005440 PyUnicodeObject *substring,
5441 Py_ssize_t start,
5442 Py_ssize_t end,
5443 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005444{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005445 if (substring->length == 0)
5446 return 1;
5447
Antoine Pitrou64672132010-01-13 07:55:48 +00005448 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005449 end -= substring->length;
5450 if (end < start)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005451 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005452
5453 if (direction > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005454 if (Py_UNICODE_MATCH(self, end, substring))
5455 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005456 } else {
5457 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005458 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005459 }
5460
5461 return 0;
5462}
5463
Martin v. Löwis18e16552006-02-15 17:27:45 +00005464Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005465 PyObject *substr,
5466 Py_ssize_t start,
5467 Py_ssize_t end,
5468 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005469{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005470 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005471
Guido van Rossumd57fd912000-03-10 22:53:23 +00005472 str = PyUnicode_FromObject(str);
5473 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005474 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005475 substr = PyUnicode_FromObject(substr);
5476 if (substr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005477 Py_DECREF(str);
5478 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005479 }
Tim Petersced69f82003-09-16 20:30:58 +00005480
Guido van Rossumd57fd912000-03-10 22:53:23 +00005481 result = tailmatch((PyUnicodeObject *)str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005482 (PyUnicodeObject *)substr,
5483 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005484 Py_DECREF(str);
5485 Py_DECREF(substr);
5486 return result;
5487}
5488
Guido van Rossumd57fd912000-03-10 22:53:23 +00005489/* Apply fixfct filter to the Unicode object self and return a
5490 reference to the modified object */
5491
Tim Petersced69f82003-09-16 20:30:58 +00005492static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005493PyObject *fixup(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005494 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005495{
5496
5497 PyUnicodeObject *u;
5498
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005499 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005500 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005501 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005502
5503 Py_UNICODE_COPY(u->str, self->str, self->length);
5504
Tim Peters7a29bd52001-09-12 03:03:31 +00005505 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005506 /* fixfct should return TRUE if it modified the buffer. If
5507 FALSE, return a reference to the original buffer instead
5508 (to save space, not time) */
5509 Py_INCREF(self);
5510 Py_DECREF(u);
5511 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005512 }
5513 return (PyObject*) u;
5514}
5515
Tim Petersced69f82003-09-16 20:30:58 +00005516static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005517int fixupper(PyUnicodeObject *self)
5518{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005519 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005520 Py_UNICODE *s = self->str;
5521 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005522
Guido van Rossumd57fd912000-03-10 22:53:23 +00005523 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005524 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005525
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005526 ch = Py_UNICODE_TOUPPER(*s);
5527 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005528 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005529 *s = ch;
5530 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005531 s++;
5532 }
5533
5534 return status;
5535}
5536
Tim Petersced69f82003-09-16 20:30:58 +00005537static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005538int fixlower(PyUnicodeObject *self)
5539{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005540 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005541 Py_UNICODE *s = self->str;
5542 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005543
Guido van Rossumd57fd912000-03-10 22:53:23 +00005544 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005545 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005546
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005547 ch = Py_UNICODE_TOLOWER(*s);
5548 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005549 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005550 *s = ch;
5551 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005552 s++;
5553 }
5554
5555 return status;
5556}
5557
Tim Petersced69f82003-09-16 20:30:58 +00005558static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005559int fixswapcase(PyUnicodeObject *self)
5560{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005561 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005562 Py_UNICODE *s = self->str;
5563 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005564
Guido van Rossumd57fd912000-03-10 22:53:23 +00005565 while (len-- > 0) {
5566 if (Py_UNICODE_ISUPPER(*s)) {
5567 *s = Py_UNICODE_TOLOWER(*s);
5568 status = 1;
5569 } else if (Py_UNICODE_ISLOWER(*s)) {
5570 *s = Py_UNICODE_TOUPPER(*s);
5571 status = 1;
5572 }
5573 s++;
5574 }
5575
5576 return status;
5577}
5578
Tim Petersced69f82003-09-16 20:30:58 +00005579static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005580int fixcapitalize(PyUnicodeObject *self)
5581{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005582 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005583 Py_UNICODE *s = self->str;
5584 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005585
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005586 if (len == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005587 return 0;
Ezio Melotti15d6b652011-08-15 09:22:24 +03005588 if (!Py_UNICODE_ISUPPER(*s)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005589 *s = Py_UNICODE_TOUPPER(*s);
5590 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005591 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005592 s++;
5593 while (--len > 0) {
Ezio Melotti15d6b652011-08-15 09:22:24 +03005594 if (!Py_UNICODE_ISLOWER(*s)) {
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005595 *s = Py_UNICODE_TOLOWER(*s);
5596 status = 1;
5597 }
5598 s++;
5599 }
5600 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005601}
5602
5603static
5604int fixtitle(PyUnicodeObject *self)
5605{
5606 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5607 register Py_UNICODE *e;
5608 int previous_is_cased;
5609
5610 /* Shortcut for single character strings */
5611 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005612 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5613 if (*p != ch) {
5614 *p = ch;
5615 return 1;
5616 }
5617 else
5618 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005619 }
Tim Petersced69f82003-09-16 20:30:58 +00005620
Guido van Rossumd57fd912000-03-10 22:53:23 +00005621 e = p + PyUnicode_GET_SIZE(self);
5622 previous_is_cased = 0;
5623 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005624 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005625
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005626 if (previous_is_cased)
5627 *p = Py_UNICODE_TOLOWER(ch);
5628 else
5629 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005630
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005631 if (Py_UNICODE_ISLOWER(ch) ||
5632 Py_UNICODE_ISUPPER(ch) ||
5633 Py_UNICODE_ISTITLE(ch))
5634 previous_is_cased = 1;
5635 else
5636 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005637 }
5638 return 1;
5639}
5640
Tim Peters8ce9f162004-08-27 01:49:32 +00005641PyObject *
5642PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005643{
Tim Peters8ce9f162004-08-27 01:49:32 +00005644 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005645 const Py_UNICODE blank = ' ';
5646 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00005647 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005648 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00005649 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5650 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005651 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5652 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005653 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005654 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005655 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005656
Benjamin Peterson7a91bf82014-02-15 13:02:52 -05005657 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00005658 if (fseq == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005659 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005660 }
5661
Tim Peters91879ab2004-08-27 22:35:44 +00005662 /* Grrrr. A codec may be invoked to convert str objects to
5663 * Unicode, and so it's possible to call back into Python code
5664 * during PyUnicode_FromObject(), and so it's possible for a sick
5665 * codec to change the size of fseq (if seq is a list). Therefore
5666 * we have to keep refetching the size -- can't assume seqlen
5667 * is invariant.
5668 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005669 seqlen = PySequence_Fast_GET_SIZE(fseq);
5670 /* If empty sequence, return u"". */
5671 if (seqlen == 0) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005672 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5673 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00005674 }
5675 /* If singleton sequence with an exact Unicode, return that. */
5676 if (seqlen == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005677 item = PySequence_Fast_GET_ITEM(fseq, 0);
5678 if (PyUnicode_CheckExact(item)) {
5679 Py_INCREF(item);
5680 res = (PyUnicodeObject *)item;
5681 goto Done;
5682 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005683 }
5684
Tim Peters05eba1f2004-08-27 21:32:02 +00005685 /* At least two items to join, or one that isn't exact Unicode. */
5686 if (seqlen > 1) {
5687 /* Set up sep and seplen -- they're needed. */
Benjamin Peterson857ce152009-01-31 16:29:18 +00005688 if (separator == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005689 sep = &blank;
5690 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005691 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005692 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005693 internal_separator = PyUnicode_FromObject(separator);
5694 if (internal_separator == NULL)
5695 goto onError;
5696 sep = PyUnicode_AS_UNICODE(internal_separator);
5697 seplen = PyUnicode_GET_SIZE(internal_separator);
5698 /* In case PyUnicode_FromObject() mutated seq. */
5699 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005700 }
5701 }
5702
5703 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005704 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005705 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005706 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005707 res_p = PyUnicode_AS_UNICODE(res);
5708 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005709
Tim Peters05eba1f2004-08-27 21:32:02 +00005710 for (i = 0; i < seqlen; ++i) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005711 Py_ssize_t itemlen;
5712 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005713
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005714 item = PySequence_Fast_GET_ITEM(fseq, i);
5715 /* Convert item to Unicode. */
5716 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5717 PyErr_Format(PyExc_TypeError,
5718 "sequence item %zd: expected string or Unicode,"
5719 " %.80s found",
5720 i, Py_TYPE(item)->tp_name);
5721 goto onError;
5722 }
5723 item = PyUnicode_FromObject(item);
5724 if (item == NULL)
5725 goto onError;
5726 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005727
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005728 /* In case PyUnicode_FromObject() mutated seq. */
5729 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters91879ab2004-08-27 22:35:44 +00005730
Tim Peters8ce9f162004-08-27 01:49:32 +00005731 /* Make sure we have enough space for the separator and the item. */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005732 itemlen = PyUnicode_GET_SIZE(item);
5733 new_res_used = res_used + itemlen;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005734 if (new_res_used < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005735 goto Overflow;
5736 if (i < seqlen - 1) {
5737 new_res_used += seplen;
5738 if (new_res_used < 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00005739 goto Overflow;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005740 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005741 if (new_res_used > res_alloc) {
5742 /* double allocated size until it's big enough */
5743 do {
5744 res_alloc += res_alloc;
5745 if (res_alloc <= 0)
5746 goto Overflow;
5747 } while (new_res_used > res_alloc);
5748 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5749 Py_DECREF(item);
5750 goto onError;
5751 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005752 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005753 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005754
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005755 /* Copy item, and maybe the separator. */
5756 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5757 res_p += itemlen;
5758 if (i < seqlen - 1) {
5759 Py_UNICODE_COPY(res_p, sep, seplen);
5760 res_p += seplen;
5761 }
5762 Py_DECREF(item);
5763 res_used = new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005764 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005765
Tim Peters05eba1f2004-08-27 21:32:02 +00005766 /* Shrink res to match the used area; this probably can't fail,
5767 * but it's cheap to check.
5768 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005769 if (_PyUnicode_Resize(&res, res_used) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005770 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005771
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005772 Done:
Tim Peters8ce9f162004-08-27 01:49:32 +00005773 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005774 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005775 return (PyObject *)res;
5776
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005777 Overflow:
Tim Peters8ce9f162004-08-27 01:49:32 +00005778 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00005779 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005780 Py_DECREF(item);
5781 /* fall through */
5782
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005783 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005784 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005785 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005786 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005787 return NULL;
5788}
5789
Tim Petersced69f82003-09-16 20:30:58 +00005790static
5791PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005792 Py_ssize_t left,
5793 Py_ssize_t right,
5794 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005795{
5796 PyUnicodeObject *u;
5797
5798 if (left < 0)
5799 left = 0;
5800 if (right < 0)
5801 right = 0;
5802
Tim Peters7a29bd52001-09-12 03:03:31 +00005803 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005804 Py_INCREF(self);
5805 return self;
5806 }
5807
Neal Norwitze7d8be82008-07-31 17:17:14 +00005808 if (left > PY_SSIZE_T_MAX - self->length ||
5809 right > PY_SSIZE_T_MAX - (left + self->length)) {
5810 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5811 return NULL;
5812 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005813 u = _PyUnicode_New(left + self->length + right);
5814 if (u) {
5815 if (left)
5816 Py_UNICODE_FILL(u->str, fill, left);
5817 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5818 if (right)
5819 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5820 }
5821
5822 return u;
5823}
5824
Antoine Pitrou64672132010-01-13 07:55:48 +00005825PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005826{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005827 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005828
5829 string = PyUnicode_FromObject(string);
5830 if (string == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005831 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005832
Antoine Pitrou64672132010-01-13 07:55:48 +00005833 list = stringlib_splitlines(
5834 (PyObject*) string, PyUnicode_AS_UNICODE(string),
5835 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005836
5837 Py_DECREF(string);
5838 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005839}
5840
Tim Petersced69f82003-09-16 20:30:58 +00005841static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005842PyObject *split(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005843 PyUnicodeObject *substring,
5844 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005845{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005846 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005847 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005848
Guido van Rossumd57fd912000-03-10 22:53:23 +00005849 if (substring == NULL)
Antoine Pitrou64672132010-01-13 07:55:48 +00005850 return stringlib_split_whitespace(
5851 (PyObject*) self, self->str, self->length, maxcount
5852 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005853
Antoine Pitrou64672132010-01-13 07:55:48 +00005854 return stringlib_split(
5855 (PyObject*) self, self->str, self->length,
5856 substring->str, substring->length,
5857 maxcount
5858 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005859}
5860
Tim Petersced69f82003-09-16 20:30:58 +00005861static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005862PyObject *rsplit(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005863 PyUnicodeObject *substring,
5864 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005865{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005866 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005867 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005868
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005869 if (substring == NULL)
Antoine Pitrou64672132010-01-13 07:55:48 +00005870 return stringlib_rsplit_whitespace(
5871 (PyObject*) self, self->str, self->length, maxcount
5872 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005873
Antoine Pitrou64672132010-01-13 07:55:48 +00005874 return stringlib_rsplit(
5875 (PyObject*) self, self->str, self->length,
5876 substring->str, substring->length,
5877 maxcount
5878 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005879}
5880
5881static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005882PyObject *replace(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005883 PyUnicodeObject *str1,
5884 PyUnicodeObject *str2,
5885 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005886{
5887 PyUnicodeObject *u;
5888
5889 if (maxcount < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005890 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrou64672132010-01-13 07:55:48 +00005891 else if (maxcount == 0 || self->length == 0)
5892 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005893
Fredrik Lundh347ee272006-05-24 16:35:18 +00005894 if (str1->length == str2->length) {
Antoine Pitrou5c767c22010-01-13 08:55:20 +00005895 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005896 /* same length */
Antoine Pitrou64672132010-01-13 07:55:48 +00005897 if (str1->length == 0)
5898 goto nothing;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005899 if (str1->length == 1) {
5900 /* replace characters */
5901 Py_UNICODE u1, u2;
5902 if (!findchar(self->str, self->length, str1->str[0]))
5903 goto nothing;
5904 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5905 if (!u)
5906 return NULL;
5907 Py_UNICODE_COPY(u->str, self->str, self->length);
5908 u1 = str1->str[0];
5909 u2 = str2->str[0];
5910 for (i = 0; i < u->length; i++)
5911 if (u->str[i] == u1) {
5912 if (--maxcount < 0)
5913 break;
5914 u->str[i] = u2;
5915 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916 } else {
Antoine Pitrou64672132010-01-13 07:55:48 +00005917 i = stringlib_find(
5918 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00005919 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005920 if (i < 0)
5921 goto nothing;
5922 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5923 if (!u)
5924 return NULL;
5925 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrou64672132010-01-13 07:55:48 +00005926
5927 /* change everything in-place, starting with this one */
5928 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5929 i += str1->length;
5930
5931 while ( --maxcount > 0) {
5932 i = stringlib_find(self->str+i, self->length-i,
5933 str1->str, str1->length,
5934 i);
5935 if (i == -1)
5936 break;
5937 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5938 i += str1->length;
5939 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005940 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005941 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005942
Brett Cannona7f13ee2010-05-04 01:16:51 +00005943 Py_ssize_t n, i, j;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005944 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945 Py_UNICODE *p;
5946
5947 /* replace strings */
Antoine Pitrou64672132010-01-13 07:55:48 +00005948 n = stringlib_count(self->str, self->length, str1->str, str1->length,
5949 maxcount);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005950 if (n == 0)
5951 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005952 /* new_size = self->length + n * (str2->length - str1->length)); */
5953 delta = (str2->length - str1->length);
5954 if (delta == 0) {
5955 new_size = self->length;
5956 } else {
5957 product = n * (str2->length - str1->length);
5958 if ((product / (str2->length - str1->length)) != n) {
5959 PyErr_SetString(PyExc_OverflowError,
5960 "replace string is too long");
5961 return NULL;
5962 }
5963 new_size = self->length + product;
5964 if (new_size < 0) {
5965 PyErr_SetString(PyExc_OverflowError,
5966 "replace string is too long");
5967 return NULL;
5968 }
5969 }
5970 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005971 if (!u)
5972 return NULL;
5973 i = 0;
5974 p = u->str;
5975 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005976 while (n-- > 0) {
5977 /* look for next match */
Antoine Pitrou64672132010-01-13 07:55:48 +00005978 j = stringlib_find(self->str+i, self->length-i,
5979 str1->str, str1->length,
5980 i);
5981 if (j == -1)
5982 break;
5983 else if (j > i) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005984 /* copy unchanged part [i:j] */
5985 Py_UNICODE_COPY(p, self->str+i, j-i);
5986 p += j - i;
5987 }
5988 /* copy substitution string */
5989 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005990 Py_UNICODE_COPY(p, str2->str, str2->length);
5991 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005992 }
5993 i = j + str1->length;
5994 }
5995 if (i < self->length)
5996 /* copy tail [i:] */
5997 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005998 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005999 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00006000 while (n > 0) {
6001 Py_UNICODE_COPY(p, str2->str, str2->length);
6002 p += str2->length;
6003 if (--n <= 0)
6004 break;
6005 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006006 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00006007 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008 }
6009 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006010 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00006011
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006012 nothing:
Fredrik Lundh347ee272006-05-24 16:35:18 +00006013 /* nothing to replace; return original string (when possible) */
6014 if (PyUnicode_CheckExact(self)) {
6015 Py_INCREF(self);
6016 return (PyObject *) self;
6017 }
6018 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006019}
6020
6021/* --- Unicode Object Methods --------------------------------------------- */
6022
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006023PyDoc_STRVAR(title__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006024 "S.title() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006025\n\
6026Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006027characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028
6029static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006030unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006031{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006032 return fixup(self, fixtitle);
6033}
6034
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006035PyDoc_STRVAR(capitalize__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006036 "S.capitalize() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006037\n\
6038Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumaran5261b102010-07-05 12:04:07 +00006039have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006040
6041static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006042unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006043{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006044 return fixup(self, fixcapitalize);
6045}
6046
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out) implementation of S.capwords(): split self on
   whitespace, replace each list item by fixup(item, fixcapitalize), then
   join the list with PyUnicode_Join(NULL, list).  Returns NULL if any
   step returns NULL. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list slot */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized =
            fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                  fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
6084
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006085/* Argument converter. Coerces to a single unicode character */
6086
6087static int
6088convert_uc(PyObject *obj, void *addr)
6089{
Benjamin Peterson857ce152009-01-31 16:29:18 +00006090 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6091 PyObject *uniobj;
6092 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006093
Benjamin Peterson857ce152009-01-31 16:29:18 +00006094 uniobj = PyUnicode_FromObject(obj);
6095 if (uniobj == NULL) {
6096 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006097 "The fill character cannot be converted to Unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00006098 return 0;
6099 }
6100 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6101 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006102 "The fill character must be exactly one character long");
Benjamin Peterson857ce152009-01-31 16:29:18 +00006103 Py_DECREF(uniobj);
6104 return 0;
6105 }
6106 unistr = PyUnicode_AS_UNICODE(uniobj);
6107 *fillcharloc = unistr[0];
6108 Py_DECREF(uniobj);
6109 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006110}
6111
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006112PyDoc_STRVAR(center__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006113 "S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006114\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006115Return S centered in a Unicode string of length width. Padding is\n\
6116done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006117
6118static PyObject *
6119unicode_center(PyUnicodeObject *self, PyObject *args)
6120{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006121 Py_ssize_t marg, left;
6122 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006123 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006124
Thomas Woutersde017742006-02-16 19:34:37 +00006125 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006126 return NULL;
6127
Tim Peters7a29bd52001-09-12 03:03:31 +00006128 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006129 Py_INCREF(self);
6130 return (PyObject*) self;
6131 }
6132
6133 marg = width - self->length;
6134 left = marg / 2 + (marg & width & 1);
6135
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006136 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006137}
6138
Marc-André Lemburge5034372000-08-08 08:04:29 +00006139#if 0
6140
6141/* This code should go into some future Unicode collation support
6142 module. The basic comparison should compare ordinals on a naive
Georg Brandl18187e22009-06-06 18:21:58 +00006143 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006144
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006145/* speedy UTF-16 code point order comparison */
6146/* gleaned from: */
6147/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6148
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006149static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006150{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006151 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006152 0, 0, 0, 0, 0, 0, 0, 0,
6153 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006154 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006155};
6156
Guido van Rossumd57fd912000-03-10 22:53:23 +00006157static int
6158unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6159{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006160 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006161
Guido van Rossumd57fd912000-03-10 22:53:23 +00006162 Py_UNICODE *s1 = str1->str;
6163 Py_UNICODE *s2 = str2->str;
6164
6165 len1 = str1->length;
6166 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006167
Guido van Rossumd57fd912000-03-10 22:53:23 +00006168 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006169 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006170
6171 c1 = *s1++;
6172 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006173
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006174 if (c1 > (1<<11) * 26)
6175 c1 += utf16Fixup[c1>>11];
6176 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006177 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006178 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006179
6180 if (c1 != c2)
6181 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006182
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006183 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184 }
6185
6186 return (len1 < len2) ? -1 : (len1 != len2);
6187}
6188
Marc-André Lemburge5034372000-08-08 08:04:29 +00006189#else
6190
6191static int
6192unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6193{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006194 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006195
6196 Py_UNICODE *s1 = str1->str;
6197 Py_UNICODE *s2 = str2->str;
6198
6199 len1 = str1->length;
6200 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006201
Marc-André Lemburge5034372000-08-08 08:04:29 +00006202 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006203 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006204
Fredrik Lundh45714e92001-06-26 16:39:36 +00006205 c1 = *s1++;
6206 c2 = *s2++;
6207
6208 if (c1 != c2)
6209 return (c1 < c2) ? -1 : 1;
6210
Marc-André Lemburge5034372000-08-08 08:04:29 +00006211 len1--; len2--;
6212 }
6213
6214 return (len1 < len2) ? -1 : (len1 != len2);
6215}
6216
6217#endif
6218
Guido van Rossumd57fd912000-03-10 22:53:23 +00006219int PyUnicode_Compare(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006220 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006221{
6222 PyUnicodeObject *u = NULL, *v = NULL;
6223 int result;
6224
6225 /* Coerce the two arguments */
6226 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6227 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006228 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006229 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6230 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006231 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006232
Thomas Wouters7e474022000-07-16 12:04:32 +00006233 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006234 if (v == u) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006235 Py_DECREF(u);
6236 Py_DECREF(v);
6237 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006238 }
6239
6240 result = unicode_compare(u, v);
6241
6242 Py_DECREF(u);
6243 Py_DECREF(v);
6244 return result;
6245
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006246 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006247 Py_XDECREF(u);
6248 Py_XDECREF(v);
6249 return -1;
6250}
6251
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006252PyObject *PyUnicode_RichCompare(PyObject *left,
6253 PyObject *right,
6254 int op)
6255{
6256 int result;
6257
6258 result = PyUnicode_Compare(left, right);
6259 if (result == -1 && PyErr_Occurred())
6260 goto onError;
6261
6262 /* Convert the return value to a Boolean */
6263 switch (op) {
6264 case Py_EQ:
6265 result = (result == 0);
6266 break;
6267 case Py_NE:
6268 result = (result != 0);
6269 break;
6270 case Py_LE:
6271 result = (result <= 0);
6272 break;
6273 case Py_GE:
6274 result = (result >= 0);
6275 break;
6276 case Py_LT:
6277 result = (result == -1);
6278 break;
6279 case Py_GT:
6280 result = (result == 1);
6281 break;
6282 }
6283 return PyBool_FromLong(result);
6284
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006285 onError:
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006286
6287 /* Standard case
6288
6289 Type errors mean that PyUnicode_FromObject() could not convert
6290 one of the arguments (usually the right hand side) to Unicode,
6291 ie. we can't handle the comparison request. However, it is
6292 possible that the other object knows a comparison method, which
6293 is why we return Py_NotImplemented to give the other object a
6294 chance.
6295
6296 */
6297 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6298 PyErr_Clear();
6299 Py_INCREF(Py_NotImplemented);
6300 return Py_NotImplemented;
6301 }
6302 if (op != Py_EQ && op != Py_NE)
6303 return NULL;
6304
6305 /* Equality comparison.
6306
6307 This is a special case: we silence any PyExc_UnicodeDecodeError
6308 and instead turn it into a PyErr_UnicodeWarning.
6309
6310 */
6311 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6312 return NULL;
6313 PyErr_Clear();
Benjamin Peterson857ce152009-01-31 16:29:18 +00006314 if (PyErr_Warn(PyExc_UnicodeWarning,
6315 (op == Py_EQ) ?
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006316 "Unicode equal comparison "
6317 "failed to convert both arguments to Unicode - "
6318 "interpreting them as being unequal" :
6319 "Unicode unequal comparison "
6320 "failed to convert both arguments to Unicode - "
6321 "interpreting them as being unequal"
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006322 ) < 0)
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006323 return NULL;
6324 result = (op == Py_NE);
6325 return PyBool_FromLong(result);
6326}
6327
Guido van Rossum403d68b2000-03-13 15:55:09 +00006328int PyUnicode_Contains(PyObject *container,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006329 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006330{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006331 PyObject *str, *sub;
6332 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006333
6334 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006335 sub = PyUnicode_FromObject(element);
6336 if (!sub) {
Fredrik Lundh833bf942006-05-23 10:12:21 +00006337 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006338 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00006339
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006340 str = PyUnicode_FromObject(container);
6341 if (!str) {
6342 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00006343 return -1;
6344 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00006345
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006346 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00006347
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006348 Py_DECREF(str);
6349 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00006350
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006351 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006352}
6353
Guido van Rossumd57fd912000-03-10 22:53:23 +00006354/* Concat to string or Unicode object giving a new Unicode object. */
6355
6356PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006357 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006358{
6359 PyUnicodeObject *u = NULL, *v = NULL, *w;
6360
6361 /* Coerce the two arguments */
6362 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6363 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006364 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006365 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6366 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006367 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006368
6369 /* Shortcuts */
6370 if (v == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006371 Py_DECREF(v);
6372 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006373 }
6374 if (u == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006375 Py_DECREF(u);
6376 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006377 }
6378
6379 /* Concat the two Unicode strings */
6380 w = _PyUnicode_New(u->length + v->length);
6381 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006382 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006383 Py_UNICODE_COPY(w->str, u->str, u->length);
6384 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6385
6386 Py_DECREF(u);
6387 Py_DECREF(v);
6388 return (PyObject *)w;
6389
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006390 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006391 Py_XDECREF(u);
6392 Py_XDECREF(v);
6393 return NULL;
6394}
6395
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006396PyDoc_STRVAR(count__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006397 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006398\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00006399Return the number of non-overlapping occurrences of substring sub in\n\
6400Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006401interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006402
6403static PyObject *
6404unicode_count(PyUnicodeObject *self, PyObject *args)
6405{
6406 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006407 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006408 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006409 PyObject *result;
6410
Jesus Cea44e81682011-04-20 16:39:15 +02006411 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
6412 &start, &end))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006413 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006414
Antoine Pitrou64672132010-01-13 07:55:48 +00006415 ADJUST_INDICES(start, end, self->length);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006416 result = PyInt_FromSsize_t(
6417 stringlib_count(self->str + start, end - start,
Antoine Pitrou64672132010-01-13 07:55:48 +00006418 substring->str, substring->length,
6419 PY_SSIZE_T_MAX)
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006420 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006421
6422 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006423
Guido van Rossumd57fd912000-03-10 22:53:23 +00006424 return result;
6425}
6426
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006427PyDoc_STRVAR(encode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006428 "S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006429\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006430Encodes S using the codec registered for encoding. encoding defaults\n\
6431to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006432handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006433a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6434'xmlcharrefreplace' as well as any other name registered with\n\
6435codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006436
6437static PyObject *
Benjamin Peterson332d7212009-09-18 21:14:55 +00006438unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006439{
Benjamin Peterson332d7212009-09-18 21:14:55 +00006440 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00006441 char *encoding = NULL;
6442 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006443 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006444
Benjamin Peterson332d7212009-09-18 21:14:55 +00006445 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
6446 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006447 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006448 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006449 if (v == NULL)
6450 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006451 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006452 PyErr_Format(PyExc_TypeError,
6453 "encoder did not return a string/unicode object "
6454 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006455 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006456 Py_DECREF(v);
6457 return NULL;
6458 }
6459 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006460
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006461 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006462 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006463}
6464
6465PyDoc_STRVAR(decode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006466 "S.decode([encoding[,errors]]) -> string or unicode\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006467\n\
6468Decodes S using the codec registered for encoding. encoding defaults\n\
6469to the default encoding. errors may be given to set a different error\n\
6470handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6471a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
Éric Araujoa4c81b02012-02-20 02:07:31 +01006472as well as any other name registered with codecs.register_error that is\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006473able to handle UnicodeDecodeErrors.");
6474
6475static PyObject *
Benjamin Peterson332d7212009-09-18 21:14:55 +00006476unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006477{
Benjamin Peterson332d7212009-09-18 21:14:55 +00006478 static char *kwlist[] = {"encoding", "errors", 0};
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006479 char *encoding = NULL;
6480 char *errors = NULL;
6481 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006482
Benjamin Peterson332d7212009-09-18 21:14:55 +00006483 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",
6484 kwlist, &encoding, &errors))
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006485 return NULL;
6486 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006487 if (v == NULL)
6488 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006489 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006490 PyErr_Format(PyExc_TypeError,
6491 "decoder did not return a string/unicode object "
6492 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006493 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006494 Py_DECREF(v);
6495 return NULL;
6496 }
6497 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006498
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006499 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006500 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006501}
6502
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006503PyDoc_STRVAR(expandtabs__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006504 "S.expandtabs([tabsize]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006505\n\
6506Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006507If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006508
6509static PyObject*
6510unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6511{
6512 Py_UNICODE *e;
6513 Py_UNICODE *p;
6514 Py_UNICODE *q;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006515 Py_UNICODE *qe;
6516 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006517 PyUnicodeObject *u;
6518 int tabsize = 8;
6519
6520 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006521 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006522
Thomas Wouters7e474022000-07-16 12:04:32 +00006523 /* First pass: determine size of output string */
Guido van Rossum5bdff602008-03-11 21:18:06 +00006524 i = 0; /* chars up to and including most recent \n or \r */
6525 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6526 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006527 for (p = self->str; p < e; p++)
6528 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006529 if (tabsize > 0) {
6530 incr = tabsize - (j % tabsize); /* cannot overflow */
6531 if (j > PY_SSIZE_T_MAX - incr)
6532 goto overflow1;
6533 j += incr;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006534 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006535 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006536 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006537 if (j > PY_SSIZE_T_MAX - 1)
6538 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006539 j++;
6540 if (*p == '\n' || *p == '\r') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006541 if (i > PY_SSIZE_T_MAX - j)
6542 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006543 i += j;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006544 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006545 }
6546 }
6547
Guido van Rossum5bdff602008-03-11 21:18:06 +00006548 if (i > PY_SSIZE_T_MAX - j)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006549 goto overflow1;
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006550
Guido van Rossumd57fd912000-03-10 22:53:23 +00006551 /* Second pass: create output string and fill it */
6552 u = _PyUnicode_New(i + j);
6553 if (!u)
6554 return NULL;
6555
Guido van Rossum5bdff602008-03-11 21:18:06 +00006556 j = 0; /* same as in first pass */
6557 q = u->str; /* next output char */
6558 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006559
6560 for (p = self->str; p < e; p++)
6561 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006562 if (tabsize > 0) {
6563 i = tabsize - (j % tabsize);
6564 j += i;
6565 while (i--) {
6566 if (q >= qe)
6567 goto overflow2;
6568 *q++ = ' ';
Guido van Rossum5bdff602008-03-11 21:18:06 +00006569 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006570 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00006571 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006572 else {
6573 if (q >= qe)
6574 goto overflow2;
6575 *q++ = *p;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006576 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006577 if (*p == '\n' || *p == '\r')
6578 j = 0;
6579 }
6580
6581 return (PyObject*) u;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006582
6583 overflow2:
6584 Py_DECREF(u);
6585 overflow1:
6586 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6587 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006588}
6589
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006590PyDoc_STRVAR(find__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006591 "S.find(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006592\n\
6593Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran5e3a19d2011-07-27 23:36:51 +08006594such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006595arguments start and end are interpreted as in slice notation.\n\
6596\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006597Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006598
6599static PyObject *
6600unicode_find(PyUnicodeObject *self, PyObject *args)
6601{
Jesus Cea44e81682011-04-20 16:39:15 +02006602 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006603 Py_ssize_t start;
6604 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006605 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006606
Jesus Cea44e81682011-04-20 16:39:15 +02006607 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
6608 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006609 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006610
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006611 result = stringlib_find_slice(
6612 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6613 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6614 start, end
6615 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006616
6617 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006618
6619 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006620}
6621
6622static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006623unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006624{
6625 if (index < 0 || index >= self->length) {
6626 PyErr_SetString(PyExc_IndexError, "string index out of range");
6627 return NULL;
6628 }
6629
6630 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6631}
6632
6633static long
6634unicode_hash(PyUnicodeObject *self)
6635{
Fredrik Lundhdde61642000-07-10 18:27:47 +00006636 /* Since Unicode objects compare equal to their ASCII string
6637 counterparts, they should use the individual character values
6638 as basis for their hash value. This is needed to assure that
6639 strings and Unicode objects behave in the same way as
6640 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006641
Martin v. Löwis18e16552006-02-15 17:27:45 +00006642 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006643 register Py_UNICODE *p;
6644 register long x;
6645
Benjamin Petersonf51c3842012-04-09 14:53:07 -04006646#ifdef Py_DEBUG
Benjamin Peterson26da9202012-02-21 11:08:50 -05006647 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf51c3842012-04-09 14:53:07 -04006648#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006649 if (self->hash != -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006650 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006651 len = PyUnicode_GET_SIZE(self);
Barry Warsaw1e13eb02012-02-20 20:42:21 -05006652 /*
6653 We make the hash of the empty string be 0, rather than using
6654 (prefix ^ suffix), since this slightly obfuscates the hash secret
6655 */
6656 if (len == 0) {
6657 self->hash = 0;
6658 return 0;
6659 }
Fredrik Lundhdde61642000-07-10 18:27:47 +00006660 p = PyUnicode_AS_UNICODE(self);
Barry Warsaw1e13eb02012-02-20 20:42:21 -05006661 x = _Py_HashSecret.prefix;
6662 x ^= *p << 7;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006663 while (--len >= 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006664 x = (1000003*x) ^ *p++;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006665 x ^= PyUnicode_GET_SIZE(self);
Barry Warsaw1e13eb02012-02-20 20:42:21 -05006666 x ^= _Py_HashSecret.suffix;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006667 if (x == -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006668 x = -2;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006669 self->hash = x;
6670 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006671}
6672
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006673PyDoc_STRVAR(index__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006674 "S.index(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006675\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006676Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006677
6678static PyObject *
6679unicode_index(PyUnicodeObject *self, PyObject *args)
6680{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006681 Py_ssize_t result;
Jesus Cea44e81682011-04-20 16:39:15 +02006682 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006683 Py_ssize_t start;
6684 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006685
Jesus Cea44e81682011-04-20 16:39:15 +02006686 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
6687 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006688 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006689
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006690 result = stringlib_find_slice(
6691 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6692 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6693 start, end
6694 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006695
6696 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006697
Guido van Rossumd57fd912000-03-10 22:53:23 +00006698 if (result < 0) {
6699 PyErr_SetString(PyExc_ValueError, "substring not found");
6700 return NULL;
6701 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006702
Martin v. Löwis18e16552006-02-15 17:27:45 +00006703 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006704}
6705
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006706PyDoc_STRVAR(islower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006707 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006708\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006709Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006710at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006711
6712static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006713unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006714{
6715 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6716 register const Py_UNICODE *e;
6717 int cased;
6718
Guido van Rossumd57fd912000-03-10 22:53:23 +00006719 /* Shortcut for single character strings */
6720 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006721 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006722
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006723 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006724 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006725 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006726
Guido van Rossumd57fd912000-03-10 22:53:23 +00006727 e = p + PyUnicode_GET_SIZE(self);
6728 cased = 0;
6729 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006730 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006731
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006732 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6733 return PyBool_FromLong(0);
6734 else if (!cased && Py_UNICODE_ISLOWER(ch))
6735 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006736 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006737 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006738}
6739
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006740PyDoc_STRVAR(isupper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006741 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006742\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006743Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006744at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006745
6746static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006747unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006748{
6749 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6750 register const Py_UNICODE *e;
6751 int cased;
6752
Guido van Rossumd57fd912000-03-10 22:53:23 +00006753 /* Shortcut for single character strings */
6754 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006755 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006756
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006757 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006758 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006759 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006760
Guido van Rossumd57fd912000-03-10 22:53:23 +00006761 e = p + PyUnicode_GET_SIZE(self);
6762 cased = 0;
6763 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006764 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006765
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006766 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6767 return PyBool_FromLong(0);
6768 else if (!cased && Py_UNICODE_ISUPPER(ch))
6769 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006770 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006771 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006772}
6773
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006774PyDoc_STRVAR(istitle__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006775 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006776\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006777Return True if S is a titlecased string and there is at least one\n\
6778character in S, i.e. upper- and titlecase characters may only\n\
6779follow uncased characters and lowercase characters only cased ones.\n\
6780Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006781
6782static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006783unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006784{
6785 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6786 register const Py_UNICODE *e;
6787 int cased, previous_is_cased;
6788
Guido van Rossumd57fd912000-03-10 22:53:23 +00006789 /* Shortcut for single character strings */
6790 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006791 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6792 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006793
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006794 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006795 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006796 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006797
Guido van Rossumd57fd912000-03-10 22:53:23 +00006798 e = p + PyUnicode_GET_SIZE(self);
6799 cased = 0;
6800 previous_is_cased = 0;
6801 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006802 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006803
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006804 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6805 if (previous_is_cased)
6806 return PyBool_FromLong(0);
6807 previous_is_cased = 1;
6808 cased = 1;
6809 }
6810 else if (Py_UNICODE_ISLOWER(ch)) {
6811 if (!previous_is_cased)
6812 return PyBool_FromLong(0);
6813 previous_is_cased = 1;
6814 cased = 1;
6815 }
6816 else
6817 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006818 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006819 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006820}
6821
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006822PyDoc_STRVAR(isspace__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006823 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006824\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006825Return True if all characters in S are whitespace\n\
6826and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006827
6828static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006829unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006830{
6831 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6832 register const Py_UNICODE *e;
6833
Guido van Rossumd57fd912000-03-10 22:53:23 +00006834 /* Shortcut for single character strings */
6835 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006836 Py_UNICODE_ISSPACE(*p))
6837 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006838
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006839 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006840 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006841 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006842
Guido van Rossumd57fd912000-03-10 22:53:23 +00006843 e = p + PyUnicode_GET_SIZE(self);
6844 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006845 if (!Py_UNICODE_ISSPACE(*p))
6846 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006847 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006848 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006849}
6850
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006851PyDoc_STRVAR(isalpha__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006852 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006853\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006854Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006855and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006856
6857static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006858unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006859{
6860 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6861 register const Py_UNICODE *e;
6862
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006863 /* Shortcut for single character strings */
6864 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006865 Py_UNICODE_ISALPHA(*p))
6866 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006867
6868 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006869 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006870 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006871
6872 e = p + PyUnicode_GET_SIZE(self);
6873 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006874 if (!Py_UNICODE_ISALPHA(*p))
6875 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006876 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006877 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006878}
6879
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006880PyDoc_STRVAR(isalnum__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006881 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006882\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006883Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006884and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006885
6886static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006887unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006888{
6889 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6890 register const Py_UNICODE *e;
6891
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006892 /* Shortcut for single character strings */
6893 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006894 Py_UNICODE_ISALNUM(*p))
6895 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006896
6897 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006898 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006899 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006900
6901 e = p + PyUnicode_GET_SIZE(self);
6902 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006903 if (!Py_UNICODE_ISALNUM(*p))
6904 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006905 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006906 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006907}
6908
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006909PyDoc_STRVAR(isdecimal__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006910 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006911\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006912Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006913False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006914
6915static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006916unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006917{
6918 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6919 register const Py_UNICODE *e;
6920
Guido van Rossumd57fd912000-03-10 22:53:23 +00006921 /* Shortcut for single character strings */
6922 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006923 Py_UNICODE_ISDECIMAL(*p))
6924 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006925
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006926 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006927 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006928 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006929
Guido van Rossumd57fd912000-03-10 22:53:23 +00006930 e = p + PyUnicode_GET_SIZE(self);
6931 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006932 if (!Py_UNICODE_ISDECIMAL(*p))
6933 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006934 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006935 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006936}
6937
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006938PyDoc_STRVAR(isdigit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006939 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006940\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006941Return True if all characters in S are digits\n\
6942and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006943
6944static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006945unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006946{
6947 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6948 register const Py_UNICODE *e;
6949
Guido van Rossumd57fd912000-03-10 22:53:23 +00006950 /* Shortcut for single character strings */
6951 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006952 Py_UNICODE_ISDIGIT(*p))
6953 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006954
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006955 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006956 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006957 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006958
Guido van Rossumd57fd912000-03-10 22:53:23 +00006959 e = p + PyUnicode_GET_SIZE(self);
6960 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006961 if (!Py_UNICODE_ISDIGIT(*p))
6962 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006963 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006964 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006965}
6966
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006967PyDoc_STRVAR(isnumeric__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006968 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006969\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006970Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006971False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006972
6973static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006974unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006975{
6976 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6977 register const Py_UNICODE *e;
6978
Guido van Rossumd57fd912000-03-10 22:53:23 +00006979 /* Shortcut for single character strings */
6980 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006981 Py_UNICODE_ISNUMERIC(*p))
6982 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006983
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006984 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006985 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006986 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006987
Guido van Rossumd57fd912000-03-10 22:53:23 +00006988 e = p + PyUnicode_GET_SIZE(self);
6989 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006990 if (!Py_UNICODE_ISNUMERIC(*p))
6991 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006992 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006993 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006994}
6995
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006996PyDoc_STRVAR(join__doc__,
Georg Brandl9b4e5822009-10-14 18:48:32 +00006997 "S.join(iterable) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006998\n\
6999Return a string which is the concatenation of the strings in the\n\
Georg Brandl9b4e5822009-10-14 18:48:32 +00007000iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007001
7002static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007003unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007004{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007005 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007006}
7007
Martin v. Löwis18e16552006-02-15 17:27:45 +00007008static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007009unicode_length(PyUnicodeObject *self)
7010{
7011 return self->length;
7012}
7013
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007014PyDoc_STRVAR(ljust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007015 "S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007016\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007017Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007018done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007019
7020static PyObject *
7021unicode_ljust(PyUnicodeObject *self, PyObject *args)
7022{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007023 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007024 Py_UNICODE fillchar = ' ';
7025
Martin v. Löwis412fb672006-04-13 06:34:32 +00007026 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007027 return NULL;
7028
Tim Peters7a29bd52001-09-12 03:03:31 +00007029 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007030 Py_INCREF(self);
7031 return (PyObject*) self;
7032 }
7033
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007034 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007035}
7036
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007037PyDoc_STRVAR(lower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007038 "S.lower() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007039\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007040Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007041
7042static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007043unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007044{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007045 return fixup(self, fixlower);
7046}
7047
/* Selectors telling the strip helpers which end(s) of the string to trim. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by the selectors above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for selector i: the format string without its "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i] + 3)
7056
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007057/* externally visible for str.strip(unicode) */
7058PyObject *
7059_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7060{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007061 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7062 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7063 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7064 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7065 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007066
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007067 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007068
Benjamin Peterson857ce152009-01-31 16:29:18 +00007069 i = 0;
7070 if (striptype != RIGHTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007071 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7072 i++;
7073 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00007074 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007075
Benjamin Peterson857ce152009-01-31 16:29:18 +00007076 j = len;
7077 if (striptype != LEFTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007078 do {
7079 j--;
7080 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7081 j++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007082 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007083
Benjamin Peterson857ce152009-01-31 16:29:18 +00007084 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007085 Py_INCREF(self);
7086 return (PyObject*)self;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007087 }
7088 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007089 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007090}
7091
Guido van Rossumd57fd912000-03-10 22:53:23 +00007092
7093static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007094do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007095{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007096 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7097 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007098
Benjamin Peterson857ce152009-01-31 16:29:18 +00007099 i = 0;
7100 if (striptype != RIGHTSTRIP) {
7101 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7102 i++;
7103 }
7104 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007105
Benjamin Peterson857ce152009-01-31 16:29:18 +00007106 j = len;
7107 if (striptype != LEFTSTRIP) {
7108 do {
7109 j--;
7110 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7111 j++;
7112 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007113
Benjamin Peterson857ce152009-01-31 16:29:18 +00007114 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7115 Py_INCREF(self);
7116 return (PyObject*)self;
7117 }
7118 else
7119 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007120}
7121
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007122
7123static PyObject *
7124do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7125{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007126 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007127
Benjamin Peterson857ce152009-01-31 16:29:18 +00007128 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7129 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007130
Benjamin Peterson857ce152009-01-31 16:29:18 +00007131 if (sep != NULL && sep != Py_None) {
7132 if (PyUnicode_Check(sep))
7133 return _PyUnicode_XStrip(self, striptype, sep);
7134 else if (PyString_Check(sep)) {
7135 PyObject *res;
7136 sep = PyUnicode_FromObject(sep);
7137 if (sep==NULL)
7138 return NULL;
7139 res = _PyUnicode_XStrip(self, striptype, sep);
7140 Py_DECREF(sep);
7141 return res;
7142 }
7143 else {
7144 PyErr_Format(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007145 "%s arg must be None, unicode or str",
7146 STRIPNAME(striptype));
Benjamin Peterson857ce152009-01-31 16:29:18 +00007147 return NULL;
7148 }
7149 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007150
Benjamin Peterson857ce152009-01-31 16:29:18 +00007151 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007152}
7153
7154
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007155PyDoc_STRVAR(strip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007156 "S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007157\n\
7158Return a copy of the string S with leading and trailing\n\
7159whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007160If chars is given and not None, remove characters in chars instead.\n\
7161If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007162
7163static PyObject *
7164unicode_strip(PyUnicodeObject *self, PyObject *args)
7165{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007166 if (PyTuple_GET_SIZE(args) == 0)
7167 return do_strip(self, BOTHSTRIP); /* Common case */
7168 else
7169 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007170}
7171
7172
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007173PyDoc_STRVAR(lstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007174 "S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007175\n\
7176Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007177If chars is given and not None, remove characters in chars instead.\n\
7178If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007179
7180static PyObject *
7181unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7182{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007183 if (PyTuple_GET_SIZE(args) == 0)
7184 return do_strip(self, LEFTSTRIP); /* Common case */
7185 else
7186 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007187}
7188
7189
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007190PyDoc_STRVAR(rstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007191 "S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007192\n\
7193Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007194If chars is given and not None, remove characters in chars instead.\n\
7195If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007196
7197static PyObject *
7198unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7199{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007200 if (PyTuple_GET_SIZE(args) == 0)
7201 return do_strip(self, RIGHTSTRIP); /* Common case */
7202 else
7203 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007204}
7205
7206
Guido van Rossumd57fd912000-03-10 22:53:23 +00007207static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007208unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007209{
7210 PyUnicodeObject *u;
7211 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007212 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007213 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007214
7215 if (len < 0)
7216 len = 0;
7217
Tim Peters7a29bd52001-09-12 03:03:31 +00007218 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007219 /* no repeat, return original string */
7220 Py_INCREF(str);
7221 return (PyObject*) str;
7222 }
Tim Peters8f422462000-09-09 06:13:41 +00007223
7224 /* ensure # of chars needed doesn't overflow int and # of bytes
7225 * needed doesn't overflow size_t
7226 */
7227 nchars = len * str->length;
7228 if (len && nchars / len != str->length) {
7229 PyErr_SetString(PyExc_OverflowError,
7230 "repeated string is too long");
7231 return NULL;
7232 }
7233 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7234 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7235 PyErr_SetString(PyExc_OverflowError,
7236 "repeated string is too long");
7237 return NULL;
7238 }
7239 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007240 if (!u)
7241 return NULL;
7242
7243 p = u->str;
7244
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007245 if (str->length == 1 && len > 0) {
7246 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007247 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007248 Py_ssize_t done = 0; /* number of characters copied this far */
7249 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007250 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007251 done = str->length;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007252 }
7253 while (done < nchars) {
Neal Norwitz4677fbf72008-03-25 04:18:18 +00007254 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007255 Py_UNICODE_COPY(p+done, p, n);
7256 done += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007257 }
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007258 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007259
7260 return (PyObject*) u;
7261}
7262
7263PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007264 PyObject *subobj,
7265 PyObject *replobj,
7266 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007267{
7268 PyObject *self;
7269 PyObject *str1;
7270 PyObject *str2;
7271 PyObject *result;
7272
7273 self = PyUnicode_FromObject(obj);
7274 if (self == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007275 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007276 str1 = PyUnicode_FromObject(subobj);
7277 if (str1 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007278 Py_DECREF(self);
7279 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007280 }
7281 str2 = PyUnicode_FromObject(replobj);
7282 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007283 Py_DECREF(self);
7284 Py_DECREF(str1);
7285 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007286 }
Tim Petersced69f82003-09-16 20:30:58 +00007287 result = replace((PyUnicodeObject *)self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007288 (PyUnicodeObject *)str1,
7289 (PyUnicodeObject *)str2,
7290 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007291 Py_DECREF(self);
7292 Py_DECREF(str1);
7293 Py_DECREF(str2);
7294 return result;
7295}
7296
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007297PyDoc_STRVAR(replace__doc__,
Ezio Melotti2f06b782010-06-26 18:44:42 +00007298 "S.replace(old, new[, count]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007299\n\
7300Return a copy of S with all occurrences of substring\n\
Georg Brandl30fadc12008-05-30 07:54:16 +00007301old replaced by new. If the optional argument count is\n\
7302given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007303
7304static PyObject*
7305unicode_replace(PyUnicodeObject *self, PyObject *args)
7306{
7307 PyUnicodeObject *str1;
7308 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007309 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007310 PyObject *result;
7311
Martin v. Löwis18e16552006-02-15 17:27:45 +00007312 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007313 return NULL;
7314 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7315 if (str1 == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007316 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007317 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007318 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007319 Py_DECREF(str1);
7320 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007321 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007322
7323 result = replace(self, str1, str2, maxcount);
7324
7325 Py_DECREF(str1);
7326 Py_DECREF(str2);
7327 return result;
7328}
7329
7330static
7331PyObject *unicode_repr(PyObject *unicode)
7332{
7333 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007334 PyUnicode_GET_SIZE(unicode),
7335 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007336}
7337
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007338PyDoc_STRVAR(rfind__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007339 "S.rfind(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007340\n\
7341Return the highest index in S where substring sub is found,\n\
Senthil Kumaran5e3a19d2011-07-27 23:36:51 +08007342such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007343arguments start and end are interpreted as in slice notation.\n\
7344\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007345Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007346
7347static PyObject *
7348unicode_rfind(PyUnicodeObject *self, PyObject *args)
7349{
Jesus Cea44e81682011-04-20 16:39:15 +02007350 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007351 Py_ssize_t start;
7352 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007353 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007354
Jesus Cea44e81682011-04-20 16:39:15 +02007355 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
7356 &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007357 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007358
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007359 result = stringlib_rfind_slice(
7360 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7361 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7362 start, end
7363 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007364
7365 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007366
7367 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007368}
7369
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007370PyDoc_STRVAR(rindex__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007371 "S.rindex(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007372\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007373Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007374
7375static PyObject *
7376unicode_rindex(PyUnicodeObject *self, PyObject *args)
7377{
Jesus Cea44e81682011-04-20 16:39:15 +02007378 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007379 Py_ssize_t start;
7380 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007381 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007382
Jesus Cea44e81682011-04-20 16:39:15 +02007383 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
7384 &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007385 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007386
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007387 result = stringlib_rfind_slice(
7388 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7389 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7390 start, end
7391 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007392
7393 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007394
Guido van Rossumd57fd912000-03-10 22:53:23 +00007395 if (result < 0) {
7396 PyErr_SetString(PyExc_ValueError, "substring not found");
7397 return NULL;
7398 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007399 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007400}
7401
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007402PyDoc_STRVAR(rjust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007403 "S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007404\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007405Return S right-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007406done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007407
7408static PyObject *
7409unicode_rjust(PyUnicodeObject *self, PyObject *args)
7410{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007411 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007412 Py_UNICODE fillchar = ' ';
7413
Martin v. Löwis412fb672006-04-13 06:34:32 +00007414 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007415 return NULL;
7416
Tim Peters7a29bd52001-09-12 03:03:31 +00007417 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007418 Py_INCREF(self);
7419 return (PyObject*) self;
7420 }
7421
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007422 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007423}
7424
Guido van Rossumd57fd912000-03-10 22:53:23 +00007425static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007426unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007427{
7428 /* standard clamping */
7429 if (start < 0)
7430 start = 0;
7431 if (end < 0)
7432 end = 0;
7433 if (end > self->length)
7434 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007435 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007436 /* full slice, return original string */
7437 Py_INCREF(self);
7438 return (PyObject*) self;
7439 }
7440 if (start > end)
7441 start = end;
7442 /* copy slice */
7443 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007444 end - start);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007445}
7446
7447PyObject *PyUnicode_Split(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007448 PyObject *sep,
7449 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007450{
7451 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007452
Guido van Rossumd57fd912000-03-10 22:53:23 +00007453 s = PyUnicode_FromObject(s);
7454 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007455 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007456 if (sep != NULL) {
7457 sep = PyUnicode_FromObject(sep);
7458 if (sep == NULL) {
7459 Py_DECREF(s);
7460 return NULL;
7461 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007462 }
7463
7464 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7465
7466 Py_DECREF(s);
7467 Py_XDECREF(sep);
7468 return result;
7469}
7470
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007471PyDoc_STRVAR(split__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007472 "S.split([sep [,maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007473\n\
7474Return a list of the words in S, using sep as the\n\
7475delimiter string. If maxsplit is given, at most maxsplit\n\
Georg Brandldfb77db2008-05-11 09:11:40 +00007476splits are done. If sep is not specified or is None, any\n\
Georg Brandlecbbd942008-05-11 20:53:55 +00007477whitespace string is a separator and empty strings are\n\
7478removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007479
7480static PyObject*
7481unicode_split(PyUnicodeObject *self, PyObject *args)
7482{
7483 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007484 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007485
Martin v. Löwis18e16552006-02-15 17:27:45 +00007486 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007487 return NULL;
7488
7489 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007490 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007491 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007492 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007493 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007494 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007495}
7496
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007497PyObject *
7498PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7499{
7500 PyObject* str_obj;
7501 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007502 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00007503
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007504 str_obj = PyUnicode_FromObject(str_in);
7505 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007506 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007507 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00007508 if (!sep_obj) {
7509 Py_DECREF(str_obj);
7510 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007511 }
7512
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007513 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00007514 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7515 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7516 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007517
Fredrik Lundhb9479482006-05-26 17:22:38 +00007518 Py_DECREF(sep_obj);
7519 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007520
7521 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007522}
7523
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007524
7525PyObject *
7526PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7527{
7528 PyObject* str_obj;
7529 PyObject* sep_obj;
7530 PyObject* out;
7531
7532 str_obj = PyUnicode_FromObject(str_in);
7533 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007534 return NULL;
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007535 sep_obj = PyUnicode_FromObject(sep_in);
7536 if (!sep_obj) {
7537 Py_DECREF(str_obj);
7538 return NULL;
7539 }
7540
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007541 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007542 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7543 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7544 );
7545
7546 Py_DECREF(sep_obj);
7547 Py_DECREF(str_obj);
7548
7549 return out;
7550}
7551
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007552PyDoc_STRVAR(partition__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007553 "S.partition(sep) -> (head, sep, tail)\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007554\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007555Search for the separator sep in S, and return the part before it,\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007556the separator itself, and the part after it. If the separator is not\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007557found, return S and two empty strings.");
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007558
7559static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00007560unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007561{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007562 return PyUnicode_Partition((PyObject *)self, separator);
7563}
7564
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007565PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti1fafaab2010-01-25 11:24:37 +00007566 "S.rpartition(sep) -> (head, sep, tail)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007567\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007568Search for the separator sep in S, starting at the end of S, and return\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007569the part before it, the separator itself, and the part after it. If the\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007570separator is not found, return two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007571
7572static PyObject*
7573unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7574{
7575 return PyUnicode_RPartition((PyObject *)self, separator);
7576}
7577
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007578PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007579 PyObject *sep,
7580 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007581{
7582 PyObject *result;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007583
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007584 s = PyUnicode_FromObject(s);
7585 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007586 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007587 if (sep != NULL) {
7588 sep = PyUnicode_FromObject(sep);
7589 if (sep == NULL) {
7590 Py_DECREF(s);
7591 return NULL;
7592 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007593 }
7594
7595 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7596
7597 Py_DECREF(s);
7598 Py_XDECREF(sep);
7599 return result;
7600}
7601
7602PyDoc_STRVAR(rsplit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007603 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007604\n\
7605Return a list of the words in S, using sep as the\n\
7606delimiter string, starting at the end of the string and\n\
7607working to the front. If maxsplit is given, at most maxsplit\n\
7608splits are done. If sep is not specified, any whitespace string\n\
7609is a separator.");
7610
7611static PyObject*
7612unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7613{
7614 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007615 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007616
Martin v. Löwis18e16552006-02-15 17:27:45 +00007617 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007618 return NULL;
7619
7620 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007621 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007622 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007623 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007624 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007625 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007626}
7627
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007628PyDoc_STRVAR(splitlines__doc__,
Raymond Hettingeraad5b022012-06-02 01:42:58 -04007629 "S.splitlines(keepends=False) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007630\n\
7631Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007632Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007633is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007634
7635static PyObject*
7636unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7637{
Guido van Rossum86662912000-04-11 15:38:46 +00007638 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007639
Guido van Rossum86662912000-04-11 15:38:46 +00007640 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007641 return NULL;
7642
Guido van Rossum86662912000-04-11 15:38:46 +00007643 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007644}
7645
7646static
7647PyObject *unicode_str(PyUnicodeObject *self)
7648{
Fred Drakee4315f52000-05-09 19:53:39 +00007649 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007650}
7651
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007652PyDoc_STRVAR(swapcase__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007653 "S.swapcase() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007654\n\
7655Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007656and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007657
7658static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007659unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007660{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007661 return fixup(self, fixswapcase);
7662}
7663
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007664PyDoc_STRVAR(translate__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007665 "S.translate(table) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007666\n\
7667Return a copy of the string S, where all characters have been mapped\n\
7668through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007669Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7670Unmapped characters are left untouched. Characters mapped to None\n\
7671are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007672
7673static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007674unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007675{
Tim Petersced69f82003-09-16 20:30:58 +00007676 return PyUnicode_TranslateCharmap(self->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007677 self->length,
7678 table,
7679 "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007680}
7681
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007682PyDoc_STRVAR(upper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007683 "S.upper() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007684\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007685Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007686
7687static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007688unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007689{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007690 return fixup(self, fixupper);
7691}
7692
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007693PyDoc_STRVAR(zfill__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007694 "S.zfill(width) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007695\n\
Georg Brandl98064072008-09-09 19:26:00 +00007696Pad a numeric string S with zeros on the left, to fill a field\n\
7697of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007698
7699static PyObject *
7700unicode_zfill(PyUnicodeObject *self, PyObject *args)
7701{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007702 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007703 PyUnicodeObject *u;
7704
Martin v. Löwis18e16552006-02-15 17:27:45 +00007705 Py_ssize_t width;
7706 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007707 return NULL;
7708
7709 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007710 if (PyUnicode_CheckExact(self)) {
7711 Py_INCREF(self);
7712 return (PyObject*) self;
7713 }
7714 else
7715 return PyUnicode_FromUnicode(
7716 PyUnicode_AS_UNICODE(self),
7717 PyUnicode_GET_SIZE(self)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007718 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007719 }
7720
7721 fill = width - self->length;
7722
7723 u = pad(self, fill, 0, '0');
7724
Walter Dörwald068325e2002-04-15 13:36:47 +00007725 if (u == NULL)
7726 return NULL;
7727
Guido van Rossumd57fd912000-03-10 22:53:23 +00007728 if (u->str[fill] == '+' || u->str[fill] == '-') {
7729 /* move sign to beginning of string */
7730 u->str[0] = u->str[fill];
7731 u->str[fill] = '0';
7732 }
7733
7734 return (PyObject*) u;
7735}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007736
#if 0
/* Compiled out: returns the current value of numfree as a Python int. */
static PyObject *
free_listsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyInt_FromLong(count);
}
#endif
7744
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007745PyDoc_STRVAR(startswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007746 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007747\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007748Return True if S starts with the specified prefix, False otherwise.\n\
7749With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007750With optional end, stop comparing S at that position.\n\
7751prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007752
7753static PyObject *
7754unicode_startswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007755 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007756{
Georg Brandl24250812006-06-09 18:45:48 +00007757 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007758 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007759 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007760 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007761 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007762
Jesus Cea44e81682011-04-20 16:39:15 +02007763 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007764 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007765 if (PyTuple_Check(subobj)) {
7766 Py_ssize_t i;
7767 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7768 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007769 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007770 if (substring == NULL)
7771 return NULL;
7772 result = tailmatch(self, substring, start, end, -1);
7773 Py_DECREF(substring);
7774 if (result) {
7775 Py_RETURN_TRUE;
7776 }
7777 }
7778 /* nothing matched */
7779 Py_RETURN_FALSE;
7780 }
7781 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottie3685f62011-04-26 05:12:51 +03007782 if (substring == NULL) {
7783 if (PyErr_ExceptionMatches(PyExc_TypeError))
7784 PyErr_Format(PyExc_TypeError, "startswith first arg must be str, "
7785 "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007786 return NULL;
Ezio Melottie3685f62011-04-26 05:12:51 +03007787 }
Georg Brandl24250812006-06-09 18:45:48 +00007788 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007789 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007790 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007791}
7792
7793
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007794PyDoc_STRVAR(endswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007795 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007796\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007797Return True if S ends with the specified suffix, False otherwise.\n\
7798With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007799With optional end, stop comparing S at that position.\n\
7800suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007801
7802static PyObject *
7803unicode_endswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007804 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007805{
Georg Brandl24250812006-06-09 18:45:48 +00007806 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007807 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007808 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007809 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007810 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007811
Jesus Cea44e81682011-04-20 16:39:15 +02007812 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007813 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007814 if (PyTuple_Check(subobj)) {
7815 Py_ssize_t i;
7816 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7817 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007818 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007819 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007820 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007821 result = tailmatch(self, substring, start, end, +1);
7822 Py_DECREF(substring);
7823 if (result) {
7824 Py_RETURN_TRUE;
7825 }
7826 }
7827 Py_RETURN_FALSE;
7828 }
7829 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottie3685f62011-04-26 05:12:51 +03007830 if (substring == NULL) {
7831 if (PyErr_ExceptionMatches(PyExc_TypeError))
7832 PyErr_Format(PyExc_TypeError, "endswith first arg must be str, "
7833 "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007834 return NULL;
Ezio Melottie3685f62011-04-26 05:12:51 +03007835 }
Georg Brandl24250812006-06-09 18:45:48 +00007836 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007837 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007838 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007839}
7840
7841
Eric Smitha9f7d622008-02-17 19:46:49 +00007842/* Implements do_string_format, which is unicode because of stringlib */
7843#include "stringlib/string_format.h"
7844
7845PyDoc_STRVAR(format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007846 "S.format(*args, **kwargs) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007847\n\
Eric Smith6c840852010-11-06 19:43:44 +00007848Return a formatted version of S, using substitutions from args and kwargs.\n\
7849The substitutions are identified by braces ('{' and '}').");
Eric Smitha9f7d622008-02-17 19:46:49 +00007850
Eric Smithdc13b792008-05-30 18:10:04 +00007851static PyObject *
7852unicode__format__(PyObject *self, PyObject *args)
7853{
7854 PyObject *format_spec;
7855 PyObject *result = NULL;
7856 PyObject *tmp = NULL;
7857
7858 /* If 2.x, convert format_spec to the same type as value */
7859 /* This is to allow things like u''.format('') */
7860 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7861 goto done;
7862 if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7863 PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007864 "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
Eric Smithdc13b792008-05-30 18:10:04 +00007865 goto done;
7866 }
7867 tmp = PyObject_Unicode(format_spec);
7868 if (tmp == NULL)
7869 goto done;
7870 format_spec = tmp;
7871
7872 result = _PyUnicode_FormatAdvanced(self,
7873 PyUnicode_AS_UNICODE(format_spec),
7874 PyUnicode_GET_SIZE(format_spec));
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007875 done:
Eric Smithdc13b792008-05-30 18:10:04 +00007876 Py_XDECREF(tmp);
7877 return result;
7878}
7879
Eric Smitha9f7d622008-02-17 19:46:49 +00007880PyDoc_STRVAR(p_format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007881 "S.__format__(format_spec) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007882\n\
Eric Smith6c840852010-11-06 19:43:44 +00007883Return a formatted version of S as described by format_spec.");
Eric Smitha9f7d622008-02-17 19:46:49 +00007884
Robert Schuppenies901c9972008-06-10 10:10:31 +00007885static PyObject *
7886unicode__sizeof__(PyUnicodeObject *v)
7887{
Robert Schuppenies9be2ec12008-07-10 15:24:04 +00007888 return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7889 sizeof(Py_UNICODE) * (v->length + 1));
Robert Schuppenies901c9972008-06-10 10:10:31 +00007890}
7891
7892PyDoc_STRVAR(sizeof__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007893 "S.__sizeof__() -> size of S in memory, in bytes\n\
Robert Schuppenies901c9972008-06-10 10:10:31 +00007894\n\
7895");
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007896
7897static PyObject *
7898unicode_getnewargs(PyUnicodeObject *v)
7899{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007900 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007901}
7902
7903
Guido van Rossumd57fd912000-03-10 22:53:23 +00007904static PyMethodDef unicode_methods[] = {
Benjamin Peterson332d7212009-09-18 21:14:55 +00007905 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007906 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7907 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007908 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007909 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7910 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7911 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7912 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7913 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7914 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7915 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00007916 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007917 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7918 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7919 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007920 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Benjamin Peterson332d7212009-09-18 21:14:55 +00007921 {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007922/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7923 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7924 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7925 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007926 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007927 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007928 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007929 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007930 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7931 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7932 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7933 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7934 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7935 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7936 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7937 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7938 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7939 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7940 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7941 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7942 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7943 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007944 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smitha9f7d622008-02-17 19:46:49 +00007945 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
7946 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
7947 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
7948 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Robert Schuppenies901c9972008-06-10 10:10:31 +00007949 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007950#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007951 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007952#endif
7953
7954#if 0
7955 /* This one is just used for debugging the implementation. */
Christian Heimes5b970ad2008-02-06 13:33:44 +00007956 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007957#endif
7958
Benjamin Peterson857ce152009-01-31 16:29:18 +00007959 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007960 {NULL, NULL}
7961};
7962
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007963static PyObject *
7964unicode_mod(PyObject *v, PyObject *w)
7965{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007966 if (!PyUnicode_Check(v)) {
7967 Py_INCREF(Py_NotImplemented);
7968 return Py_NotImplemented;
7969 }
7970 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007971}
7972
7973static PyNumberMethods unicode_as_number = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007974 0, /*nb_add*/
7975 0, /*nb_subtract*/
7976 0, /*nb_multiply*/
7977 0, /*nb_divide*/
7978 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007979};
7980
Guido van Rossumd57fd912000-03-10 22:53:23 +00007981static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007982 (lenfunc) unicode_length, /* sq_length */
7983 PyUnicode_Concat, /* sq_concat */
7984 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7985 (ssizeargfunc) unicode_getitem, /* sq_item */
7986 (ssizessizeargfunc) unicode_slice, /* sq_slice */
7987 0, /* sq_ass_item */
7988 0, /* sq_ass_slice */
7989 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007990};
7991
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007992static PyObject*
7993unicode_subscript(PyUnicodeObject* self, PyObject* item)
7994{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00007995 if (PyIndex_Check(item)) {
7996 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007997 if (i == -1 && PyErr_Occurred())
7998 return NULL;
7999 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008000 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008001 return unicode_getitem(self, i);
8002 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008003 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008004 Py_UNICODE* source_buf;
8005 Py_UNICODE* result_buf;
8006 PyObject* result;
8007
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008008 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008009 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008010 return NULL;
8011 }
8012
8013 if (slicelength <= 0) {
8014 return PyUnicode_FromUnicode(NULL, 0);
Thomas Wouters3ccec682007-08-28 15:28:19 +00008015 } else if (start == 0 && step == 1 && slicelength == self->length &&
8016 PyUnicode_CheckExact(self)) {
8017 Py_INCREF(self);
8018 return (PyObject *)self;
8019 } else if (step == 1) {
8020 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008021 } else {
8022 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Neal Norwitz419fd492008-03-17 20:22:43 +00008023 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8024 sizeof(Py_UNICODE));
Benjamin Peterson857ce152009-01-31 16:29:18 +00008025
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008026 if (result_buf == NULL)
8027 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008028
8029 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8030 result_buf[i] = source_buf[cur];
8031 }
Tim Petersced69f82003-09-16 20:30:58 +00008032
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008033 result = PyUnicode_FromUnicode(result_buf, slicelength);
Neal Norwitz419fd492008-03-17 20:22:43 +00008034 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008035 return result;
8036 }
8037 } else {
8038 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8039 return NULL;
8040 }
8041}
8042
8043static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00008044 (lenfunc)unicode_length, /* mp_length */
8045 (binaryfunc)unicode_subscript, /* mp_subscript */
8046 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008047};
8048
Martin v. Löwis18e16552006-02-15 17:27:45 +00008049static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008050unicode_buffer_getreadbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008051 Py_ssize_t index,
8052 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008053{
8054 if (index != 0) {
8055 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008056 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008057 return -1;
8058 }
8059 *ptr = (void *) self->str;
8060 return PyUnicode_GET_DATA_SIZE(self);
8061}
8062
Martin v. Löwis18e16552006-02-15 17:27:45 +00008063static Py_ssize_t
8064unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008065 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008066{
8067 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008068 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008069 return -1;
8070}
8071
8072static int
8073unicode_buffer_getsegcount(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008074 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008075{
8076 if (lenp)
8077 *lenp = PyUnicode_GET_DATA_SIZE(self);
8078 return 1;
8079}
8080
Martin v. Löwiseb079f12006-02-16 14:32:27 +00008081static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008082unicode_buffer_getcharbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008083 Py_ssize_t index,
8084 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008085{
8086 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00008087
Guido van Rossumd57fd912000-03-10 22:53:23 +00008088 if (index != 0) {
8089 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008090 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008091 return -1;
8092 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008093 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008094 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008095 return -1;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008096 *ptr = (void *) PyString_AS_STRING(str);
8097 return PyString_GET_SIZE(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008098}
8099
8100/* Helpers for PyUnicode_Format() */
8101
8102static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008103getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008104{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008105 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008106 if (argidx < arglen) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008107 (*p_argidx)++;
8108 if (arglen < 0)
8109 return args;
8110 else
8111 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008112 }
8113 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008114 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008115 return NULL;
8116}
8117
/* Bit flags used by the formatting helpers; one distinct bit each, so they
   can be OR-ed together and tested with &. */
#define F_LJUST (1 << 0)
#define F_SIGN  (1 << 1)
#define F_BLANK (1 << 2)
#define F_ALT   (1 << 3)
#define F_ZERO  (1 << 4)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008123
Martin v. Löwis18e16552006-02-15 17:27:45 +00008124static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008125strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008126{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008127 register Py_ssize_t i;
8128 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008129 for (i = len - 1; i >= 0; i--)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008130 buffer[i] = (Py_UNICODE) charbuffer[i];
Guido van Rossumd57fd912000-03-10 22:53:23 +00008131
Guido van Rossumd57fd912000-03-10 22:53:23 +00008132 return len;
8133}
8134
Neal Norwitzfc76d632006-01-10 06:03:13 +00008135static int
Neal Norwitzfc76d632006-01-10 06:03:13 +00008136longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8137{
Tim Peters15231542006-02-16 01:08:01 +00008138 Py_ssize_t result;
8139
Neal Norwitzfc76d632006-01-10 06:03:13 +00008140 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008141 result = strtounicode(buffer, (char *)buffer);
8142 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008143}
8144
Guido van Rossum078151d2002-08-11 04:24:12 +00008145/* XXX To save some code duplication, formatfloat/long/int could have been
8146 shared with stringobject.c, converting from 8-bit to Unicode after the
8147 formatting is done. */
8148
Mark Dickinson18cfada2009-11-23 18:46:41 +00008149/* Returns a new reference to a PyUnicode object, or NULL on failure. */
8150
8151static PyObject *
8152formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008153{
Mark Dickinson18cfada2009-11-23 18:46:41 +00008154 char *p;
8155 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008156 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008157
Guido van Rossumd57fd912000-03-10 22:53:23 +00008158 x = PyFloat_AsDouble(v);
8159 if (x == -1.0 && PyErr_Occurred())
Mark Dickinson18cfada2009-11-23 18:46:41 +00008160 return NULL;
8161
Guido van Rossumd57fd912000-03-10 22:53:23 +00008162 if (prec < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008163 prec = 6;
Mark Dickinsond4814bf2009-03-29 16:24:29 +00008164
Mark Dickinson18cfada2009-11-23 18:46:41 +00008165 p = PyOS_double_to_string(x, type, prec,
8166 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
8167 if (p == NULL)
8168 return NULL;
8169 result = PyUnicode_FromStringAndSize(p, strlen(p));
8170 PyMem_Free(p);
8171 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008172}
8173
Tim Peters38fd5b62000-09-21 05:43:11 +00008174static PyObject*
8175formatlong(PyObject *val, int flags, int prec, int type)
8176{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008177 char *buf;
8178 int i, len;
8179 PyObject *str; /* temporary string object. */
8180 PyUnicodeObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008181
Benjamin Peterson857ce152009-01-31 16:29:18 +00008182 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8183 if (!str)
8184 return NULL;
8185 result = _PyUnicode_New(len);
8186 if (!result) {
8187 Py_DECREF(str);
8188 return NULL;
8189 }
8190 for (i = 0; i < len; i++)
8191 result->str[i] = buf[i];
8192 result->str[len] = 0;
8193 Py_DECREF(str);
8194 return (PyObject*)result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008195}
8196
Guido van Rossumd57fd912000-03-10 22:53:23 +00008197static int
8198formatint(Py_UNICODE *buf,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008199 size_t buflen,
8200 int flags,
8201 int prec,
8202 int type,
8203 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008204{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008205 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008206 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8207 * + 1 + 1
8208 * = 24
8209 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008210 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008211 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008212 long x;
8213
8214 x = PyInt_AsLong(v);
8215 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008216 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008217 if (x < 0 && type == 'u') {
8218 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008219 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008220 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8221 sign = "-";
8222 else
8223 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008224 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008225 prec = 1;
8226
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008227 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8228 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008229 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008230 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008231 PyErr_SetString(PyExc_OverflowError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008232 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008233 return -1;
8234 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008235
8236 if ((flags & F_ALT) &&
8237 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00008238 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008239 * of issues that cause pain:
8240 * - when 0 is being converted, the C standard leaves off
8241 * the '0x' or '0X', which is inconsistent with other
8242 * %#x/%#X conversions and inconsistent with Python's
8243 * hex() function
8244 * - there are platforms that violate the standard and
8245 * convert 0 with the '0x' or '0X'
8246 * (Metrowerks, Compaq Tru64)
8247 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008248 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008249 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008250 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008251 * We can achieve the desired consistency by inserting our
8252 * own '0x' or '0X' prefix, and substituting %x/%X in place
8253 * of %#x/%#X.
8254 *
8255 * Note that this is the same approach as used in
8256 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008257 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008258 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8259 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008260 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008261 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008262 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8263 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008264 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008265 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008266 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008267 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008268 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008269 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008270}
8271
8272static int
8273formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008274 size_t buflen,
8275 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008276{
Ezio Melotti32125152010-02-25 17:36:04 +00008277 PyObject *unistr;
8278 char *str;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008279 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008280 if (PyUnicode_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008281 if (PyUnicode_GET_SIZE(v) != 1)
8282 goto onError;
8283 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008284 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008285
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008286 else if (PyString_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008287 if (PyString_GET_SIZE(v) != 1)
8288 goto onError;
Ezio Melotti32125152010-02-25 17:36:04 +00008289 /* #7649: "u'%c' % char" should behave like "u'%s' % char" and fail
8290 with a UnicodeDecodeError if 'char' is not decodable with the
8291 default encoding (usually ASCII, but it might be something else) */
8292 str = PyString_AS_STRING(v);
8293 if ((unsigned char)str[0] > 0x7F) {
8294 /* the char is not ASCII; try to decode the string using the
8295 default encoding and return -1 to let the UnicodeDecodeError
8296 be raised if the string can't be decoded */
8297 unistr = PyUnicode_Decode(str, 1, NULL, "strict");
8298 if (unistr == NULL)
8299 return -1;
8300 buf[0] = PyUnicode_AS_UNICODE(unistr)[0];
8301 Py_DECREF(unistr);
8302 }
8303 else
8304 buf[0] = (Py_UNICODE)str[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008305 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008306
8307 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008308 /* Integer input truncated to a character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008309 long x;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008310 x = PyInt_AsLong(v);
8311 if (x == -1 && PyErr_Occurred())
8312 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008313#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008314 if (x < 0 || x > 0x10ffff) {
8315 PyErr_SetString(PyExc_OverflowError,
8316 "%c arg not in range(0x110000) "
8317 "(wide Python build)");
8318 return -1;
8319 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008320#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008321 if (x < 0 || x > 0xffff) {
8322 PyErr_SetString(PyExc_OverflowError,
8323 "%c arg not in range(0x10000) "
8324 "(narrow Python build)");
8325 return -1;
8326 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008327#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008328 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008329 }
8330 buf[1] = '\0';
8331 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008332
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008333 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008334 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008335 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008336 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008337}
8338
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008339/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8340
Mark Dickinson18cfada2009-11-23 18:46:41 +00008341 FORMATBUFLEN is the length of the buffer in which the ints &
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008342 chars are formatted. XXX This is a magic number. Each formatting
8343 routine does bounds checking to ensure no overflow, but a better
8344 solution may be to malloc a buffer of appropriate size for each
8345 format. For now, the current solution is sufficient.
8346*/
8347#define FORMATBUFLEN (size_t)120
8348
Guido van Rossumd57fd912000-03-10 22:53:23 +00008349PyObject *PyUnicode_Format(PyObject *format,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008350 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008351{
8352 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008353 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008354 int args_owned = 0;
8355 PyUnicodeObject *result = NULL;
8356 PyObject *dict = NULL;
8357 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008358
Guido van Rossumd57fd912000-03-10 22:53:23 +00008359 if (format == NULL || args == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008360 PyErr_BadInternalCall();
8361 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008362 }
8363 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008364 if (uformat == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008365 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008366 fmt = PyUnicode_AS_UNICODE(uformat);
8367 fmtcnt = PyUnicode_GET_SIZE(uformat);
8368
8369 reslen = rescnt = fmtcnt + 100;
8370 result = _PyUnicode_New(reslen);
8371 if (result == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008372 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008373 res = PyUnicode_AS_UNICODE(result);
8374
8375 if (PyTuple_Check(args)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008376 arglen = PyTuple_Size(args);
8377 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008378 }
8379 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008380 arglen = -1;
8381 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008382 }
Benjamin Petersonda2c7eb2013-03-23 22:32:00 -05008383 if (Py_TYPE(args)->tp_as_mapping && Py_TYPE(args)->tp_as_mapping->mp_subscript &&
8384 !PyTuple_Check(args) && !PyObject_TypeCheck(args, &PyBaseString_Type))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008385 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008386
8387 while (--fmtcnt >= 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008388 if (*fmt != '%') {
8389 if (--rescnt < 0) {
8390 rescnt = fmtcnt + 100;
8391 reslen += rescnt;
8392 if (_PyUnicode_Resize(&result, reslen) < 0)
8393 goto onError;
8394 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8395 --rescnt;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008396 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008397 *res++ = *fmt++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008398 }
8399 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008400 /* Got a format specifier */
8401 int flags = 0;
8402 Py_ssize_t width = -1;
8403 int prec = -1;
8404 Py_UNICODE c = '\0';
8405 Py_UNICODE fill;
8406 int isnumok;
8407 PyObject *v = NULL;
8408 PyObject *temp = NULL;
8409 Py_UNICODE *pbuf;
8410 Py_UNICODE sign;
8411 Py_ssize_t len;
Mark Dickinson18cfada2009-11-23 18:46:41 +00008412 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008413
8414 fmt++;
8415 if (*fmt == '(') {
8416 Py_UNICODE *keystart;
8417 Py_ssize_t keylen;
8418 PyObject *key;
8419 int pcount = 1;
8420
8421 if (dict == NULL) {
8422 PyErr_SetString(PyExc_TypeError,
8423 "format requires a mapping");
8424 goto onError;
8425 }
8426 ++fmt;
8427 --fmtcnt;
8428 keystart = fmt;
8429 /* Skip over balanced parentheses */
8430 while (pcount > 0 && --fmtcnt >= 0) {
8431 if (*fmt == ')')
8432 --pcount;
8433 else if (*fmt == '(')
8434 ++pcount;
8435 fmt++;
8436 }
8437 keylen = fmt - keystart - 1;
8438 if (fmtcnt < 0 || pcount > 0) {
8439 PyErr_SetString(PyExc_ValueError,
8440 "incomplete format key");
8441 goto onError;
8442 }
8443#if 0
8444 /* keys are converted to strings using UTF-8 and
8445 then looked up since Python uses strings to hold
8446 variables names etc. in its namespaces and we
8447 wouldn't want to break common idioms. */
8448 key = PyUnicode_EncodeUTF8(keystart,
8449 keylen,
8450 NULL);
8451#else
8452 key = PyUnicode_FromUnicode(keystart, keylen);
8453#endif
8454 if (key == NULL)
8455 goto onError;
8456 if (args_owned) {
8457 Py_DECREF(args);
8458 args_owned = 0;
8459 }
8460 args = PyObject_GetItem(dict, key);
8461 Py_DECREF(key);
8462 if (args == NULL) {
8463 goto onError;
8464 }
8465 args_owned = 1;
8466 arglen = -1;
8467 argidx = -2;
8468 }
8469 while (--fmtcnt >= 0) {
8470 switch (c = *fmt++) {
8471 case '-': flags |= F_LJUST; continue;
8472 case '+': flags |= F_SIGN; continue;
8473 case ' ': flags |= F_BLANK; continue;
8474 case '#': flags |= F_ALT; continue;
8475 case '0': flags |= F_ZERO; continue;
8476 }
8477 break;
8478 }
8479 if (c == '*') {
8480 v = getnextarg(args, arglen, &argidx);
8481 if (v == NULL)
8482 goto onError;
8483 if (!PyInt_Check(v)) {
8484 PyErr_SetString(PyExc_TypeError,
8485 "* wants int");
8486 goto onError;
8487 }
Serhiy Storchaka74f49ab2013-01-19 12:55:39 +02008488 width = PyInt_AsSsize_t(v);
8489 if (width == -1 && PyErr_Occurred())
8490 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008491 if (width < 0) {
8492 flags |= F_LJUST;
8493 width = -width;
8494 }
8495 if (--fmtcnt >= 0)
8496 c = *fmt++;
8497 }
8498 else if (c >= '0' && c <= '9') {
8499 width = c - '0';
8500 while (--fmtcnt >= 0) {
8501 c = *fmt++;
8502 if (c < '0' || c > '9')
8503 break;
Mark Dickinson75d36002012-10-28 10:00:46 +00008504 if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008505 PyErr_SetString(PyExc_ValueError,
8506 "width too big");
8507 goto onError;
8508 }
8509 width = width*10 + (c - '0');
8510 }
8511 }
8512 if (c == '.') {
8513 prec = 0;
8514 if (--fmtcnt >= 0)
8515 c = *fmt++;
8516 if (c == '*') {
8517 v = getnextarg(args, arglen, &argidx);
8518 if (v == NULL)
8519 goto onError;
8520 if (!PyInt_Check(v)) {
8521 PyErr_SetString(PyExc_TypeError,
8522 "* wants int");
8523 goto onError;
8524 }
Serhiy Storchaka74f49ab2013-01-19 12:55:39 +02008525 prec = _PyInt_AsInt(v);
8526 if (prec == -1 && PyErr_Occurred())
8527 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008528 if (prec < 0)
8529 prec = 0;
8530 if (--fmtcnt >= 0)
8531 c = *fmt++;
8532 }
8533 else if (c >= '0' && c <= '9') {
8534 prec = c - '0';
8535 while (--fmtcnt >= 0) {
Stefan Krah0b9201f2010-07-19 18:06:46 +00008536 c = *fmt++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008537 if (c < '0' || c > '9')
8538 break;
Mark Dickinson75d36002012-10-28 10:00:46 +00008539 if (prec > (INT_MAX - ((int)c - '0')) / 10) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008540 PyErr_SetString(PyExc_ValueError,
8541 "prec too big");
8542 goto onError;
8543 }
8544 prec = prec*10 + (c - '0');
8545 }
8546 }
8547 } /* prec */
8548 if (fmtcnt >= 0) {
8549 if (c == 'h' || c == 'l' || c == 'L') {
8550 if (--fmtcnt >= 0)
8551 c = *fmt++;
8552 }
8553 }
8554 if (fmtcnt < 0) {
8555 PyErr_SetString(PyExc_ValueError,
8556 "incomplete format");
8557 goto onError;
8558 }
8559 if (c != '%') {
8560 v = getnextarg(args, arglen, &argidx);
8561 if (v == NULL)
8562 goto onError;
8563 }
8564 sign = 0;
8565 fill = ' ';
8566 switch (c) {
8567
8568 case '%':
8569 pbuf = formatbuf;
8570 /* presume that buffer length is at least 1 */
8571 pbuf[0] = '%';
8572 len = 1;
8573 break;
8574
8575 case 's':
8576 case 'r':
Victor Stinner95affc42010-03-22 12:24:37 +00008577 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008578 temp = v;
8579 Py_INCREF(temp);
8580 }
8581 else {
8582 PyObject *unicode;
8583 if (c == 's')
8584 temp = PyObject_Unicode(v);
8585 else
8586 temp = PyObject_Repr(v);
8587 if (temp == NULL)
8588 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008589 if (PyUnicode_Check(temp))
8590 /* nothing to do */;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008591 else if (PyString_Check(temp)) {
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008592 /* convert to string to Unicode */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008593 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8594 PyString_GET_SIZE(temp),
8595 NULL,
8596 "strict");
8597 Py_DECREF(temp);
8598 temp = unicode;
8599 if (temp == NULL)
8600 goto onError;
8601 }
8602 else {
8603 Py_DECREF(temp);
8604 PyErr_SetString(PyExc_TypeError,
8605 "%s argument has non-string str()");
8606 goto onError;
8607 }
8608 }
8609 pbuf = PyUnicode_AS_UNICODE(temp);
8610 len = PyUnicode_GET_SIZE(temp);
8611 if (prec >= 0 && len > prec)
8612 len = prec;
8613 break;
8614
8615 case 'i':
8616 case 'd':
8617 case 'u':
8618 case 'o':
8619 case 'x':
8620 case 'X':
8621 if (c == 'i')
8622 c = 'd';
8623 isnumok = 0;
8624 if (PyNumber_Check(v)) {
8625 PyObject *iobj=NULL;
8626
8627 if (PyInt_Check(v) || (PyLong_Check(v))) {
8628 iobj = v;
8629 Py_INCREF(iobj);
8630 }
8631 else {
8632 iobj = PyNumber_Int(v);
8633 if (iobj==NULL) iobj = PyNumber_Long(v);
8634 }
8635 if (iobj!=NULL) {
8636 if (PyInt_Check(iobj)) {
8637 isnumok = 1;
8638 pbuf = formatbuf;
8639 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8640 flags, prec, c, iobj);
8641 Py_DECREF(iobj);
8642 if (len < 0)
8643 goto onError;
8644 sign = 1;
8645 }
8646 else if (PyLong_Check(iobj)) {
8647 isnumok = 1;
8648 temp = formatlong(iobj, flags, prec, c);
8649 Py_DECREF(iobj);
8650 if (!temp)
8651 goto onError;
8652 pbuf = PyUnicode_AS_UNICODE(temp);
8653 len = PyUnicode_GET_SIZE(temp);
8654 sign = 1;
8655 }
8656 else {
8657 Py_DECREF(iobj);
8658 }
8659 }
8660 }
8661 if (!isnumok) {
8662 PyErr_Format(PyExc_TypeError,
8663 "%%%c format: a number is required, "
8664 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8665 goto onError;
8666 }
8667 if (flags & F_ZERO)
8668 fill = '0';
8669 break;
8670
8671 case 'e':
8672 case 'E':
8673 case 'f':
8674 case 'F':
8675 case 'g':
8676 case 'G':
Mark Dickinson18cfada2009-11-23 18:46:41 +00008677 temp = formatfloat(v, flags, prec, c);
8678 if (temp == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008679 goto onError;
Mark Dickinson18cfada2009-11-23 18:46:41 +00008680 pbuf = PyUnicode_AS_UNICODE(temp);
8681 len = PyUnicode_GET_SIZE(temp);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008682 sign = 1;
8683 if (flags & F_ZERO)
8684 fill = '0';
8685 break;
8686
8687 case 'c':
8688 pbuf = formatbuf;
8689 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8690 if (len < 0)
8691 goto onError;
8692 break;
8693
8694 default:
8695 PyErr_Format(PyExc_ValueError,
8696 "unsupported format character '%c' (0x%x) "
8697 "at index %zd",
8698 (31<=c && c<=126) ? (char)c : '?',
8699 (int)c,
8700 (Py_ssize_t)(fmt - 1 -
8701 PyUnicode_AS_UNICODE(uformat)));
Benjamin Peterson857ce152009-01-31 16:29:18 +00008702 goto onError;
8703 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008704 if (sign) {
8705 if (*pbuf == '-' || *pbuf == '+') {
8706 sign = *pbuf++;
8707 len--;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008708 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008709 else if (flags & F_SIGN)
8710 sign = '+';
8711 else if (flags & F_BLANK)
8712 sign = ' ';
8713 else
8714 sign = 0;
8715 }
8716 if (width < len)
8717 width = len;
8718 if (rescnt - (sign != 0) < width) {
8719 reslen -= rescnt;
8720 rescnt = width + fmtcnt + 100;
8721 reslen += rescnt;
8722 if (reslen < 0) {
8723 Py_XDECREF(temp);
8724 PyErr_NoMemory();
8725 goto onError;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008726 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008727 if (_PyUnicode_Resize(&result, reslen) < 0) {
8728 Py_XDECREF(temp);
8729 goto onError;
8730 }
8731 res = PyUnicode_AS_UNICODE(result)
8732 + reslen - rescnt;
8733 }
8734 if (sign) {
8735 if (fill != ' ')
8736 *res++ = sign;
8737 rescnt--;
8738 if (width > len)
8739 width--;
8740 }
8741 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8742 assert(pbuf[0] == '0');
8743 assert(pbuf[1] == c);
8744 if (fill != ' ') {
8745 *res++ = *pbuf++;
8746 *res++ = *pbuf++;
8747 }
8748 rescnt -= 2;
8749 width -= 2;
8750 if (width < 0)
8751 width = 0;
8752 len -= 2;
8753 }
8754 if (width > len && !(flags & F_LJUST)) {
8755 do {
8756 --rescnt;
8757 *res++ = fill;
8758 } while (--width > len);
8759 }
8760 if (fill == ' ') {
8761 if (sign)
8762 *res++ = sign;
8763 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8764 assert(pbuf[0] == '0');
8765 assert(pbuf[1] == c);
8766 *res++ = *pbuf++;
8767 *res++ = *pbuf++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008768 }
8769 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008770 Py_UNICODE_COPY(res, pbuf, len);
8771 res += len;
8772 rescnt -= len;
8773 while (--width >= len) {
8774 --rescnt;
8775 *res++ = ' ';
8776 }
8777 if (dict && (argidx < arglen) && c != '%') {
8778 PyErr_SetString(PyExc_TypeError,
8779 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008780 Py_XDECREF(temp);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008781 goto onError;
8782 }
8783 Py_XDECREF(temp);
8784 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008785 } /* until end */
8786 if (argidx < arglen && !dict) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008787 PyErr_SetString(PyExc_TypeError,
8788 "not all arguments converted during string formatting");
8789 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008790 }
8791
Thomas Woutersa96affe2006-03-12 00:29:36 +00008792 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008793 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008794 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008795 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008796 }
8797 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008798 return (PyObject *)result;
8799
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008800 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008801 Py_XDECREF(result);
8802 Py_DECREF(uformat);
8803 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008804 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008805 }
8806 return NULL;
8807}
8808
/* Buffer-protocol slot table for unicode objects; referenced from the
   tp_as_buffer slot of PyUnicode_Type.  The four entries are the read,
   write, segment-count and char-buffer handlers, in that order. */
8809static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008810 (readbufferproc) unicode_buffer_getreadbuf,
8811 (writebufferproc) unicode_buffer_getwritebuf,
8812 (segcountproc) unicode_buffer_getsegcount,
8813 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008814};
8815
Jeremy Hylton938ace62002-07-17 16:30:39 +00008816static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008817unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8818
Tim Peters6d6c1a32001-08-02 04:15:00 +00008819static PyObject *
8820unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8821{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008822 PyObject *x = NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008823 static char *kwlist[] = {"string", "encoding", "errors", 0};
8824 char *encoding = NULL;
8825 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00008826
Benjamin Peterson857ce152009-01-31 16:29:18 +00008827 if (type != &PyUnicode_Type)
8828 return unicode_subtype_new(type, args, kwds);
8829 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008830 kwlist, &x, &encoding, &errors))
Benjamin Peterson857ce152009-01-31 16:29:18 +00008831 return NULL;
8832 if (x == NULL)
8833 return (PyObject *)_PyUnicode_New(0);
8834 if (encoding == NULL && errors == NULL)
8835 return PyObject_Unicode(x);
8836 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008837 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008838}
8839
Guido van Rossume023fe02001-08-30 03:12:59 +00008840static PyObject *
8841unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8842{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008843 PyUnicodeObject *tmp, *pnew;
8844 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008845
Benjamin Peterson857ce152009-01-31 16:29:18 +00008846 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8847 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8848 if (tmp == NULL)
8849 return NULL;
8850 assert(PyUnicode_Check(tmp));
8851 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8852 if (pnew == NULL) {
8853 Py_DECREF(tmp);
8854 return NULL;
8855 }
8856 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8857 if (pnew->str == NULL) {
8858 _Py_ForgetReference((PyObject *)pnew);
8859 PyObject_Del(pnew);
8860 Py_DECREF(tmp);
8861 return PyErr_NoMemory();
8862 }
8863 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8864 pnew->length = n;
8865 pnew->hash = tmp->hash;
8866 Py_DECREF(tmp);
8867 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008868}
8869
/* Docstring for the unicode type; referenced from the tp_doc slot of
   PyUnicode_Type. */
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008870PyDoc_STRVAR(unicode_doc,
Chris Jerdonekad4b0002012-10-07 20:37:54 -07008871 "unicode(object='') -> unicode object\n\
8872unicode(string[, encoding[, errors]]) -> unicode object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00008873\n\
8874Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008875encoding defaults to the current default string encoding.\n\
8876errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008877
/* Type object for unicode.  Slots left as 0 are not provided by this
   table; the base type is PyBaseString_Type, construction goes through
   unicode_new and instances are released with PyObject_Del. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008878PyTypeObject PyUnicode_Type = {
Martin v. Löwis68192102007-07-21 06:55:02 +00008879 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00008880 "unicode", /* tp_name */
8881 sizeof(PyUnicodeObject), /* tp_basicsize */
8882 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008883 /* Slots */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008884 (destructor)unicode_dealloc, /* tp_dealloc */
8885 0, /* tp_print */
8886 0, /* tp_getattr */
8887 0, /* tp_setattr */
8888 0, /* tp_compare */
8889 unicode_repr, /* tp_repr */
8890 &unicode_as_number, /* tp_as_number */
8891 &unicode_as_sequence, /* tp_as_sequence */
8892 &unicode_as_mapping, /* tp_as_mapping */
8893 (hashfunc) unicode_hash, /* tp_hash*/
8894 0, /* tp_call*/
8895 (reprfunc) unicode_str, /* tp_str */
8896 PyObject_GenericGetAttr, /* tp_getattro */
8897 0, /* tp_setattro */
8898 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008899 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008900 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008901 unicode_doc, /* tp_doc */
8902 0, /* tp_traverse */
8903 0, /* tp_clear */
8904 PyUnicode_RichCompare, /* tp_richcompare */
8905 0, /* tp_weaklistoffset */
8906 0, /* tp_iter */
8907 0, /* tp_iternext */
8908 unicode_methods, /* tp_methods */
8909 0, /* tp_members */
8910 0, /* tp_getset */
8911 &PyBaseString_Type, /* tp_base */
8912 0, /* tp_dict */
8913 0, /* tp_descr_get */
8914 0, /* tp_descr_set */
8915 0, /* tp_dictoffset */
8916 0, /* tp_init */
8917 0, /* tp_alloc */
8918 unicode_new, /* tp_new */
8919 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008920};
8921
8922/* Initialize the Unicode implementation */
8923
Thomas Wouters78890102000-07-22 19:25:51 +00008924void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008925{
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008926 /* XXX - move this array to unicodectype.c ? */
8927 Py_UNICODE linebreak[] = {
8928 0x000A, /* LINE FEED */
8929 0x000D, /* CARRIAGE RETURN */
8930 0x001C, /* FILE SEPARATOR */
8931 0x001D, /* GROUP SEPARATOR */
8932 0x001E, /* RECORD SEPARATOR */
8933 0x0085, /* NEXT LINE */
8934 0x2028, /* LINE SEPARATOR */
8935 0x2029, /* PARAGRAPH SEPARATOR */
8936 };
8937
Fred Drakee4315f52000-05-09 19:53:39 +00008938 /* Init the implementation */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02008939 if (!unicode_empty) {
8940 unicode_empty = _PyUnicode_New(0);
8941 if (!unicode_empty)
8942 return;
8943 }
Neal Norwitze1fdb322006-07-21 05:32:28 +00008944
Guido van Rossumcacfc072002-05-24 19:01:59 +00008945 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008946 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008947
8948 /* initialize the linebreak bloom filter */
8949 bloom_linebreak = make_bloom_mask(
8950 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8951 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00008952
8953 PyType_Ready(&EncodingMapType);
Benjamin Peterson6da3ed62012-10-30 23:21:10 -04008954
8955 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
8956 Py_FatalError("Can't initialize field name iterator type");
8957
8958 if (PyType_Ready(&PyFormatterIter_Type) < 0)
8959 Py_FatalError("Can't initialize formatter iter type");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008960}
8961
8962/* Finalize the Unicode implementation */
8963
Christian Heimes3b718a72008-02-14 12:47:33 +00008964int
8965PyUnicode_ClearFreeList(void)
8966{
8967 int freelist_size = numfree;
8968 PyUnicodeObject *u;
8969
8970 for (u = free_list; u != NULL;) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008971 PyUnicodeObject *v = u;
8972 u = *(PyUnicodeObject **)u;
8973 if (v->str)
8974 PyObject_DEL(v->str);
8975 Py_XDECREF(v->defenc);
8976 PyObject_Del(v);
8977 numfree--;
Christian Heimes3b718a72008-02-14 12:47:33 +00008978 }
8979 free_list = NULL;
8980 assert(numfree == 0);
8981 return freelist_size;
8982}
8983
Guido van Rossumd57fd912000-03-10 22:53:23 +00008984void
Thomas Wouters78890102000-07-22 19:25:51 +00008985_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008986{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008987 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008988
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02008989 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008990
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02008991 for (i = 0; i < 256; i++)
8992 Py_CLEAR(unicode_latin1[i]);
8993
Christian Heimes3b718a72008-02-14 12:47:33 +00008994 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00008995}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008996
Anthony Baxterac6bd462006-04-13 02:06:09 +00008997#ifdef __cplusplus
8998}
8999#endif