blob: d011f7d50c2b13e18e74f5862373496c61324a6e [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson1c5d21d2009-01-31 22:33:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
/* Upper bound on the number of Unicode objects parked on the free list */

#define PyUnicode_MAXFREELIST       1024

/* Size threshold for the free list "keep-alive" optimization.

   A Unicode object that goes onto the free list with a length below
   this limit keeps its character buffer allocated, so that reusing the
   object for another small string does not have to go through malloc().

   In the worst case this leaves PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage around.

   A limit of 0 effectively switches the feature off.

   Note: this is an experimental feature!  If you get core dumps when
   using Unicode objects, turn it off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Byte order selection; little endian unless WORDS_BIGENDIAN is defined */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
Serhiy Storchakac59c85c2013-01-26 12:13:40 +020085NOTE: In the interpreter's initialization phase, some globals are currently
86 initialized dynamically as needed. In the process Unicode objects may
87 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000088
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Anthony Baxterac6bd462006-04-13 02:06:09 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Guido van Rossumd57fd912000-03-10 22:53:23 +000096/* Free list for Unicode objects */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +020097static PyUnicodeObject *free_list = NULL;
98static int numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000099
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000100/* The empty Unicode object is shared to improve performance. */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200101static PyUnicodeObject *unicode_empty = NULL;
102
103#define _Py_RETURN_UNICODE_EMPTY() \
104 do { \
105 if (unicode_empty != NULL) \
106 Py_INCREF(unicode_empty); \
107 else { \
108 unicode_empty = _PyUnicode_New(0); \
109 if (unicode_empty != NULL) \
110 Py_INCREF(unicode_empty); \
111 } \
112 return (PyObject *)unicode_empty; \
113 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000114
115/* Single character Unicode strings in the Latin-1 range are being
116 shared as well. */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200117static PyUnicodeObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000118
Fred Drakee4315f52000-05-09 19:53:39 +0000119/* Default encoding to use and assume when NULL is passed as encoding
120 parameter; it is initialized by _PyUnicode_Init().
121
122 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000123 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000124
125*/
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200126static char unicode_default_encoding[100 + 1] = "ascii";
Fred Drakee4315f52000-05-09 19:53:39 +0000127
/* Fast detection of the most frequent whitespace characters.

   Lookup table indexed by code point (0x00 - 0x7F).  An entry is 1 for:
     0x09 CHARACTER TABULATION   0x0A LINE FEED
     0x0B LINE TABULATION        0x0C FORM FEED
     0x0D CARRIAGE RETURN        0x1C FILE SEPARATOR
     0x1D GROUP SEPARATOR        0x1E RECORD SEPARATOR
     0x1F UNIT SEPARATOR         0x20 SPACE
   and 0 for everything else. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
158
/* Same kind of lookup table for line breaks.

   Indexed by code point (0x00 - 0x7F).  An entry is 1 for:
     0x0A LINE FEED          0x0B LINE TABULATION
     0x0C FORM FEED          0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR     0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR
   and 0 for everything else.  The table is read-only, hence const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
186
187
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000188Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000189PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000190{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000191#ifdef Py_UNICODE_WIDE
Benjamin Peterson857ce152009-01-31 16:29:18 +0000192 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000193#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000194 /* This is actually an illegal character, so it should
195 not be passed to unichr. */
196 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000197#endif
198}
199
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000200/* --- Bloom Filters ----------------------------------------------------- */
201
202/* stuff to implement simple "bloom filters" for Unicode characters.
203 to keep things simple, we use a single bitmask, using the least 5
204 bits from each unicode characters as the bit index. */
205
206/* the linebreak mask is set up by Unicode_Init below */
207
Antoine Pitrou10042922010-01-13 14:01:26 +0000208#if LONG_BIT >= 128
209#define BLOOM_WIDTH 128
210#elif LONG_BIT >= 64
211#define BLOOM_WIDTH 64
212#elif LONG_BIT >= 32
213#define BLOOM_WIDTH 32
214#else
215#error "LONG_BIT is smaller than 32"
216#endif
217
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000218#define BLOOM_MASK unsigned long
219
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200220static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000221
Antoine Pitrou10042922010-01-13 14:01:26 +0000222#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
223#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000224
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000225#define BLOOM_LINEBREAK(ch) \
226 ((ch) < 128U ? ascii_linebreak[(ch)] : \
227 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000228
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000229Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000230{
231 /* calculate simple bloom-style bitmask for a given unicode string */
232
Antoine Pitrou10042922010-01-13 14:01:26 +0000233 BLOOM_MASK mask;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000234 Py_ssize_t i;
235
236 mask = 0;
237 for (i = 0; i < len; i++)
Antoine Pitrou64672132010-01-13 07:55:48 +0000238 BLOOM_ADD(mask, ptr[i]);
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000239
240 return mask;
241}
242
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000243Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000244{
245 Py_ssize_t i;
246
247 for (i = 0; i < setlen; i++)
248 if (set[i] == chr)
249 return 1;
250
Fredrik Lundh77633512006-05-23 19:47:35 +0000251 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000252}
253
/* Nonzero if 'chr' occurs among the 'setlen' characters at 'set'.
   'mask' is expected to be the bloom mask of that same set (as built by
   make_bloom_mask), so that most non-members are rejected by the cheap
   BLOOM() test before unicode_member() scans the set.
   'chr' may be evaluated twice.  The expansion is parenthesized as a
   whole so the macro can safely be negated or combined with other
   operators. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
256
Guido van Rossumd57fd912000-03-10 22:53:23 +0000257/* --- Unicode Object ----------------------------------------------------- */
258
259static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000260int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000261 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262{
263 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000264
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000265 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 if (unicode->length == length)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000267 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000268
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000269 /* Resizing shared object (unicode_empty or single character
270 objects) in-place is not allowed. Use PyUnicode_Resize()
271 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000272
Benjamin Peterson857ce152009-01-31 16:29:18 +0000273 if (unicode == unicode_empty ||
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000274 (unicode->length == 1 &&
275 unicode->str[0] < 256U &&
276 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000278 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279 return -1;
280 }
281
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000282 /* We allocate one more byte to make sure the string is Ux0000 terminated.
283 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000284 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000285 it contains). */
286
Guido van Rossumd57fd912000-03-10 22:53:23 +0000287 oldstr = unicode->str;
Neal Norwitz419fd492008-03-17 20:22:43 +0000288 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000289 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000291 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000292 PyErr_NoMemory();
293 return -1;
294 }
295 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000296 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000298 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000300 if (unicode->defenc) {
Georg Brandld070cc52010-08-01 21:06:46 +0000301 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000302 }
303 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000304
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305 return 0;
306}
307
/* We allocate one more Py_UNICODE element to make sure the string is
   Ux0000 terminated; some code relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
315
316static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000317PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000318{
319 register PyUnicodeObject *unicode;
320
Andrew Dalkee0df7622006-05-27 11:04:36 +0000321 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000322 if (length == 0 && unicode_empty != NULL) {
323 Py_INCREF(unicode_empty);
324 return unicode_empty;
325 }
326
Neal Norwitze7d8be82008-07-31 17:17:14 +0000327 /* Ensure we won't overflow the size. */
328 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
329 return (PyUnicodeObject *)PyErr_NoMemory();
330 }
331
Guido van Rossumd57fd912000-03-10 22:53:23 +0000332 /* Unicode freelist & memory allocation */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000333 if (free_list) {
334 unicode = free_list;
335 free_list = *(PyUnicodeObject **)unicode;
336 numfree--;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000337 if (unicode->str) {
338 /* Keep-Alive optimization: we only upsize the buffer,
339 never downsize it. */
340 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000341 unicode_resize(unicode, length) < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000342 PyObject_DEL(unicode->str);
343 unicode->str = NULL;
344 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000345 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000346 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000347 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
348 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000349 }
Martin Panter646b5282016-06-21 23:58:05 +0000350 (void)PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000351 }
352 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000353 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000354 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000355 if (unicode == NULL)
356 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000357 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
358 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000359 }
360
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000361 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000362 PyErr_NoMemory();
363 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000364 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000365 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000366 * the caller fails before initializing str -- unicode_resize()
367 * reads str[0], and the Keep-Alive optimization can keep memory
368 * allocated for str alive across a call to unicode_dealloc(unicode).
369 * We don't want unicode_resize to read uninitialized memory in
370 * that case.
371 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000372 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000373 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000374 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000375 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000376 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000377 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000378
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000379 onError:
Amaury Forgeot d'Arc06847b12008-07-31 23:39:05 +0000380 /* XXX UNREF/NEWREF interface should be more symmetrical */
381 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000382 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000383 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000384 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000385}
386
387static
Guido van Rossum9475a232001-10-05 20:51:39 +0000388void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000390 if (PyUnicode_CheckExact(unicode) &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000391 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000392 /* Keep-Alive optimization */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000393 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
394 PyObject_DEL(unicode->str);
395 unicode->str = NULL;
396 unicode->length = 0;
397 }
398 if (unicode->defenc) {
Georg Brandld070cc52010-08-01 21:06:46 +0000399 Py_CLEAR(unicode->defenc);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000400 }
401 /* Add to free list */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000402 *(PyUnicodeObject **)unicode = free_list;
403 free_list = unicode;
404 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000405 }
406 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000407 PyObject_DEL(unicode->str);
408 Py_XDECREF(unicode->defenc);
409 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000410 }
411}
412
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000413static
414int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000415{
416 register PyUnicodeObject *v;
417
418 /* Argument checks */
419 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000420 PyErr_BadInternalCall();
421 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000422 }
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000423 v = *unicode;
Christian Heimese93237d2007-12-19 02:37:44 +0000424 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000425 PyErr_BadInternalCall();
426 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000427 }
428
429 /* Resizing unicode_empty and single character objects is not
430 possible since these are being shared. We simply return a fresh
431 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000432 if (v->length != length &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000433 (v == unicode_empty || v->length == 1)) {
434 PyUnicodeObject *w = _PyUnicode_New(length);
435 if (w == NULL)
436 return -1;
437 Py_UNICODE_COPY(w->str, v->str,
438 length < v->length ? length : v->length);
Serhiy Storchaka763a61c2016-04-10 18:05:12 +0300439 Py_SETREF(*unicode, w);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000440 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000441 }
442
443 /* Note that we don't have to modify *unicode for unshared Unicode
444 objects, since we can modify them in-place. */
445 return unicode_resize(v, length);
446}
447
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000448int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
449{
450 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
451}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000452
Guido van Rossumd57fd912000-03-10 22:53:23 +0000453PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000454 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000455{
456 PyUnicodeObject *unicode;
457
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000458 /* If the Unicode data is known at construction time, we can apply
459 some optimizations which share commonly used objects. */
460 if (u != NULL) {
461
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000462 /* Optimization for empty strings */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200463 if (size == 0)
464 _Py_RETURN_UNICODE_EMPTY();
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000465
466 /* Single character Unicode objects in the Latin-1 range are
467 shared when using this constructor */
468 if (size == 1 && *u < 256) {
469 unicode = unicode_latin1[*u];
470 if (!unicode) {
471 unicode = _PyUnicode_New(1);
472 if (!unicode)
473 return NULL;
474 unicode->str[0] = *u;
475 unicode_latin1[*u] = unicode;
476 }
477 Py_INCREF(unicode);
478 return (PyObject *)unicode;
479 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000480 }
Tim Petersced69f82003-09-16 20:30:58 +0000481
Guido van Rossumd57fd912000-03-10 22:53:23 +0000482 unicode = _PyUnicode_New(size);
483 if (!unicode)
484 return NULL;
485
486 /* Copy the Unicode data into the new object */
487 if (u != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000488 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000489
490 return (PyObject *)unicode;
491}
492
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000493PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
494{
495 PyUnicodeObject *unicode;
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000496
Benjamin Peterson857ce152009-01-31 16:29:18 +0000497 if (size < 0) {
498 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000499 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson857ce152009-01-31 16:29:18 +0000500 return NULL;
501 }
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000502
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000503 /* If the Unicode data is known at construction time, we can apply
504 some optimizations which share commonly used objects.
505 Also, this means the input must be UTF-8, so fall back to the
506 UTF-8 decoder at the end. */
507 if (u != NULL) {
508
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000509 /* Optimization for empty strings */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200510 if (size == 0)
511 _Py_RETURN_UNICODE_EMPTY();
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000512
513 /* Single characters are shared when using this constructor.
514 Restrict to ASCII, since the input must be UTF-8. */
515 if (size == 1 && Py_CHARMASK(*u) < 128) {
516 unicode = unicode_latin1[Py_CHARMASK(*u)];
517 if (!unicode) {
518 unicode = _PyUnicode_New(1);
519 if (!unicode)
520 return NULL;
521 unicode->str[0] = Py_CHARMASK(*u);
522 unicode_latin1[Py_CHARMASK(*u)] = unicode;
523 }
524 Py_INCREF(unicode);
525 return (PyObject *)unicode;
526 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000527
528 return PyUnicode_DecodeUTF8(u, size, NULL);
529 }
530
531 unicode = _PyUnicode_New(size);
532 if (!unicode)
533 return NULL;
534
535 return (PyObject *)unicode;
536}
537
538PyObject *PyUnicode_FromString(const char *u)
539{
540 size_t size = strlen(u);
541 if (size > PY_SSIZE_T_MAX) {
542 PyErr_SetString(PyExc_OverflowError, "input too long");
543 return NULL;
544 }
545
546 return PyUnicode_FromStringAndSize(u, size);
547}
548
/* _Py_UNICODE_NEXT is a private macro used to retrieve the character pointed
 * by 'ptr', possibly combining surrogate pairs on narrow builds.
 * 'ptr' and 'end' must be Py_UNICODE*, with 'ptr' pointing at the character
 * that should be returned and 'end' pointing to the end of the buffer
 * (one past the last character).
 * ('end' is used on narrow builds to detect a lone surrogate at the
 * end of the buffer that should be returned unchanged: a high surrogate
 * is only combined when the following character also lies before 'end'.)
 * The ptr and end arguments should be side-effect free and ptr must be an
 * lvalue.
 * The type of the returned char is always Py_UCS4 on narrow builds.
 *
 * Note: the macro advances ptr to next char, so it might have side-effects
 * (especially if used with other macros).
 */

/* helper macros used by _Py_UNICODE_NEXT; 'ch' is evaluated twice */
#define _Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define _Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value. */
#define _Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)

#ifdef Py_UNICODE_WIDE
#define _Py_UNICODE_NEXT(ptr, end) (*(ptr)++)
#else
#define _Py_UNICODE_NEXT(ptr, end)                                      \
     ((((ptr) + 1 < (end)) &&                                           \
       _Py_UNICODE_IS_HIGH_SURROGATE(*(ptr)) &&                         \
       _Py_UNICODE_IS_LOW_SURROGATE((ptr)[1])) ?                        \
      ((ptr) += 2, _Py_UNICODE_JOIN_SURROGATES((ptr)[-2], (ptr)[-1])) : \
      (Py_UCS4)*(ptr)++)
#endif
579
Guido van Rossumd57fd912000-03-10 22:53:23 +0000580#ifdef HAVE_WCHAR_H
581
Mark Dickinson6b265f12009-03-18 16:07:26 +0000582#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
583# define CONVERT_WCHAR_TO_SURROGATES
584#endif
585
586#ifdef CONVERT_WCHAR_TO_SURROGATES
587
588/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
589 to convert from UTF32 to UTF16. */
590
591PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
592 Py_ssize_t size)
593{
594 PyUnicodeObject *unicode;
595 register Py_ssize_t i;
596 Py_ssize_t alloc;
597 const wchar_t *orig_w;
598
599 if (w == NULL) {
600 PyErr_BadInternalCall();
601 return NULL;
602 }
603
604 alloc = size;
605 orig_w = w;
606 for (i = size; i > 0; i--) {
607 if (*w > 0xFFFF)
608 alloc++;
609 w++;
610 }
611 w = orig_w;
612 unicode = _PyUnicode_New(alloc);
613 if (!unicode)
614 return NULL;
615
616 /* Copy the wchar_t data into the new object */
617 {
618 register Py_UNICODE *u;
619 u = PyUnicode_AS_UNICODE(unicode);
620 for (i = size; i > 0; i--) {
621 if (*w > 0xFFFF) {
622 wchar_t ordinal = *w++;
623 ordinal -= 0x10000;
624 *u++ = 0xD800 | (ordinal >> 10);
625 *u++ = 0xDC00 | (ordinal & 0x3FF);
626 }
627 else
628 *u++ = *w++;
629 }
630 }
631 return (PyObject *)unicode;
632}
633
634#else
635
Guido van Rossumd57fd912000-03-10 22:53:23 +0000636PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000637 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000638{
639 PyUnicodeObject *unicode;
640
641 if (w == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000642 PyErr_BadInternalCall();
643 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000644 }
645
646 unicode = _PyUnicode_New(size);
647 if (!unicode)
648 return NULL;
649
650 /* Copy the wchar_t data into the new object */
651#ifdef HAVE_USABLE_WCHAR_T
652 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000653#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000654 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000655 register Py_UNICODE *u;
656 register Py_ssize_t i;
657 u = PyUnicode_AS_UNICODE(unicode);
658 for (i = size; i > 0; i--)
659 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000660 }
661#endif
662
663 return (PyObject *)unicode;
664}
665
Mark Dickinson6b265f12009-03-18 16:07:26 +0000666#endif /* CONVERT_WCHAR_TO_SURROGATES */
667
668#undef CONVERT_WCHAR_TO_SURROGATES
669
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000670static void
671makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
672{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000673 *fmt++ = '%';
674 if (width) {
675 if (zeropad)
676 *fmt++ = '0';
677 fmt += sprintf(fmt, "%d", width);
678 }
679 if (precision)
680 fmt += sprintf(fmt, ".%d", precision);
681 if (longflag)
682 *fmt++ = 'l';
683 else if (size_tflag) {
684 char *f = PY_FORMAT_SIZE_T;
685 while (*f)
686 *fmt++ = *f++;
687 }
688 *fmt++ = c;
689 *fmt = '\0';
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000690}
691
/* Append the NUL-terminated char string 'string' to the output, one
   output element per input byte (each byte goes through unsigned char).
   Uses two variables of the expanding scope: 'copy', a const char *
   scratch cursor that is left on the terminating NUL, and 's', the
   output pointer, which is advanced past the appended characters. */
#define appendstring(string)                    \
    do {                                        \
        copy = string;                          \
        while (*copy) {                         \
            *s++ = (unsigned char)*copy;        \
            copy++;                             \
        }                                       \
    } while (0)
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000698
699PyObject *
700PyUnicode_FromFormatV(const char *format, va_list vargs)
701{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000702 va_list count;
703 Py_ssize_t callcount = 0;
704 PyObject **callresults = NULL;
705 PyObject **callresult = NULL;
706 Py_ssize_t n = 0;
707 int width = 0;
708 int precision = 0;
709 int zeropad;
710 const char* f;
711 Py_UNICODE *s;
712 PyObject *string;
713 /* used by sprintf */
714 char buffer[21];
715 /* use abuffer instead of buffer, if we need more space
716 * (which can happen if there's a format specifier with width). */
717 char *abuffer = NULL;
718 char *realbuffer;
719 Py_ssize_t abuffersize = 0;
720 char fmt[60]; /* should be enough for %0width.precisionld */
721 const char *copy;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000722
723#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson857ce152009-01-31 16:29:18 +0000724 Py_MEMCPY(count, vargs, sizeof(va_list));
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000725#else
726#ifdef __va_copy
Benjamin Peterson857ce152009-01-31 16:29:18 +0000727 __va_copy(count, vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000728#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000729 count = vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000730#endif
731#endif
Walter Dörwalded960ac2009-05-03 22:36:33 +0000732 /* step 1: count the number of %S/%R/%s format specifications
733 * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
734 * objects once during step 3 and put the result in an array) */
Benjamin Peterson857ce152009-01-31 16:29:18 +0000735 for (f = format; *f; f++) {
Walter Dörwalded960ac2009-05-03 22:36:33 +0000736 if (*f == '%') {
Serhiy Storchaka0e0282e2015-01-27 22:17:56 +0200737 f++;
738 while (*f && *f != '%' && !isalpha((unsigned)*f))
739 f++;
Serhiy Storchaka227526d2015-01-31 01:15:29 +0200740 if (!*f)
741 break;
Serhiy Storchaka0e0282e2015-01-27 22:17:56 +0200742 if (*f == 's' || *f=='S' || *f=='R')
Walter Dörwalded960ac2009-05-03 22:36:33 +0000743 ++callcount;
744 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000745 }
746 /* step 2: allocate memory for the results of
Walter Dörwalded960ac2009-05-03 22:36:33 +0000747 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson857ce152009-01-31 16:29:18 +0000748 if (callcount) {
749 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
750 if (!callresults) {
751 PyErr_NoMemory();
752 return NULL;
753 }
754 callresult = callresults;
755 }
756 /* step 3: figure out how large a buffer we need */
757 for (f = format; *f; f++) {
758 if (*f == '%') {
Serhiy Storchaka0e0282e2015-01-27 22:17:56 +0200759 const char* p = f++;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000760 width = 0;
761 while (isdigit((unsigned)*f))
762 width = (width*10) + *f++ - '0';
Serhiy Storchaka0e0282e2015-01-27 22:17:56 +0200763 precision = 0;
764 if (*f == '.') {
765 f++;
766 while (isdigit((unsigned)*f))
767 precision = (precision*10) + *f++ - '0';
768 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000769
Benjamin Peterson857ce152009-01-31 16:29:18 +0000770 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
771 * they don't affect the amount of space we reserve.
772 */
773 if ((*f == 'l' || *f == 'z') &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000774 (f[1] == 'd' || f[1] == 'u'))
775 ++f;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000776
Benjamin Peterson857ce152009-01-31 16:29:18 +0000777 switch (*f) {
778 case 'c':
Serhiy Storchakaba908c72013-06-23 20:22:09 +0300779 {
780 int ordinal = va_arg(count, int);
781#ifdef Py_UNICODE_WIDE
782 if (ordinal < 0 || ordinal > 0x10ffff) {
783 PyErr_SetString(PyExc_OverflowError,
784 "%c arg not in range(0x110000) "
785 "(wide Python build)");
786 goto fail;
787 }
788#else
789 if (ordinal < 0 || ordinal > 0xffff) {
790 PyErr_SetString(PyExc_OverflowError,
791 "%c arg not in range(0x10000) "
792 "(narrow Python build)");
793 goto fail;
794 }
795#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +0000796 /* fall through... */
Serhiy Storchakaba908c72013-06-23 20:22:09 +0300797 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000798 case '%':
799 n++;
800 break;
801 case 'd': case 'u': case 'i': case 'x':
802 (void) va_arg(count, int);
Serhiy Storchaka0e0282e2015-01-27 22:17:56 +0200803 if (width < precision)
804 width = precision;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000805 /* 20 bytes is enough to hold a 64-bit
806 integer. Decimal takes the most space.
807 This isn't enough for octal.
808 If a width is specified we need more
809 (which we allocate later). */
810 if (width < 20)
811 width = 20;
812 n += width;
813 if (abuffersize < width)
814 abuffersize = width;
815 break;
816 case 's':
817 {
818 /* UTF-8 */
Georg Brandlba68a992009-05-05 09:19:43 +0000819 const char *s = va_arg(count, const char*);
Walter Dörwalded960ac2009-05-03 22:36:33 +0000820 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
821 if (!str)
822 goto fail;
823 n += PyUnicode_GET_SIZE(str);
824 /* Remember the str and switch to the next slot */
825 *callresult++ = str;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000826 break;
827 }
828 case 'U':
829 {
830 PyObject *obj = va_arg(count, PyObject *);
831 assert(obj && PyUnicode_Check(obj));
832 n += PyUnicode_GET_SIZE(obj);
833 break;
834 }
835 case 'V':
836 {
837 PyObject *obj = va_arg(count, PyObject *);
838 const char *str = va_arg(count, const char *);
839 assert(obj || str);
840 assert(!obj || PyUnicode_Check(obj));
841 if (obj)
842 n += PyUnicode_GET_SIZE(obj);
843 else
844 n += strlen(str);
845 break;
846 }
847 case 'S':
848 {
849 PyObject *obj = va_arg(count, PyObject *);
850 PyObject *str;
851 assert(obj);
852 str = PyObject_Str(obj);
853 if (!str)
854 goto fail;
Victor Stinner2af8d2f2014-07-30 00:39:05 +0200855 n += PyString_GET_SIZE(str);
Benjamin Peterson857ce152009-01-31 16:29:18 +0000856 /* Remember the str and switch to the next slot */
857 *callresult++ = str;
858 break;
859 }
860 case 'R':
861 {
862 PyObject *obj = va_arg(count, PyObject *);
863 PyObject *repr;
864 assert(obj);
865 repr = PyObject_Repr(obj);
866 if (!repr)
867 goto fail;
868 n += PyUnicode_GET_SIZE(repr);
869 /* Remember the repr and switch to the next slot */
870 *callresult++ = repr;
871 break;
872 }
873 case 'p':
874 (void) va_arg(count, int);
875 /* maximum 64-bit pointer representation:
876 * 0xffffffffffffffff
877 * so 19 characters is enough.
878 * XXX I count 18 -- what's the extra for?
879 */
880 n += 19;
881 break;
882 default:
883 /* if we stumble upon an unknown
884 formatting code, copy the rest of
885 the format string to the output
886 string. (we cannot just skip the
887 code, since there's no way to know
888 what's in the argument list) */
889 n += strlen(p);
890 goto expand;
891 }
892 } else
893 n++;
894 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000895 expand:
Benjamin Peterson857ce152009-01-31 16:29:18 +0000896 if (abuffersize > 20) {
Serhiy Storchaka5ec0bbf2015-01-30 23:35:03 +0200897 /* add 1 for sprintf's trailing null byte */
898 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson857ce152009-01-31 16:29:18 +0000899 if (!abuffer) {
900 PyErr_NoMemory();
901 goto fail;
902 }
903 realbuffer = abuffer;
904 }
905 else
906 realbuffer = buffer;
907 /* step 4: fill the buffer */
908 /* Since we've analyzed how much space we need for the worst case,
909 we don't have to resize the string.
910 There can be no errors beyond this point. */
911 string = PyUnicode_FromUnicode(NULL, n);
912 if (!string)
913 goto fail;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000914
Benjamin Peterson857ce152009-01-31 16:29:18 +0000915 s = PyUnicode_AS_UNICODE(string);
916 callresult = callresults;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000917
Benjamin Peterson857ce152009-01-31 16:29:18 +0000918 for (f = format; *f; f++) {
919 if (*f == '%') {
920 const char* p = f++;
921 int longflag = 0;
922 int size_tflag = 0;
923 zeropad = (*f == '0');
924 /* parse the width.precision part */
925 width = 0;
926 while (isdigit((unsigned)*f))
927 width = (width*10) + *f++ - '0';
928 precision = 0;
929 if (*f == '.') {
930 f++;
931 while (isdigit((unsigned)*f))
932 precision = (precision*10) + *f++ - '0';
933 }
934 /* handle the long flag, but only for %ld and %lu.
935 others can be added when necessary. */
936 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
937 longflag = 1;
938 ++f;
939 }
940 /* handle the size_t flag. */
941 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
942 size_tflag = 1;
943 ++f;
944 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000945
Benjamin Peterson857ce152009-01-31 16:29:18 +0000946 switch (*f) {
947 case 'c':
948 *s++ = va_arg(vargs, int);
949 break;
950 case 'd':
951 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
952 if (longflag)
953 sprintf(realbuffer, fmt, va_arg(vargs, long));
954 else if (size_tflag)
955 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
956 else
957 sprintf(realbuffer, fmt, va_arg(vargs, int));
958 appendstring(realbuffer);
959 break;
960 case 'u':
961 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
962 if (longflag)
963 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
964 else if (size_tflag)
965 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
966 else
967 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
968 appendstring(realbuffer);
969 break;
970 case 'i':
971 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
972 sprintf(realbuffer, fmt, va_arg(vargs, int));
973 appendstring(realbuffer);
974 break;
975 case 'x':
976 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
977 sprintf(realbuffer, fmt, va_arg(vargs, int));
978 appendstring(realbuffer);
979 break;
980 case 's':
981 {
Walter Dörwalded960ac2009-05-03 22:36:33 +0000982 /* unused, since we already have the result */
983 (void) va_arg(vargs, char *);
984 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
985 PyUnicode_GET_SIZE(*callresult));
986 s += PyUnicode_GET_SIZE(*callresult);
987 /* We're done with the unicode()/repr() => forget it */
988 Py_DECREF(*callresult);
989 /* switch to next unicode()/repr() result */
990 ++callresult;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000991 break;
992 }
993 case 'U':
994 {
995 PyObject *obj = va_arg(vargs, PyObject *);
996 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
997 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
998 s += size;
999 break;
1000 }
1001 case 'V':
1002 {
1003 PyObject *obj = va_arg(vargs, PyObject *);
1004 const char *str = va_arg(vargs, const char *);
1005 if (obj) {
1006 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1007 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1008 s += size;
1009 } else {
1010 appendstring(str);
1011 }
1012 break;
1013 }
1014 case 'S':
1015 case 'R':
1016 {
Victor Stinner2af8d2f2014-07-30 00:39:05 +02001017 const char *str = PyString_AS_STRING(*callresult);
Benjamin Peterson857ce152009-01-31 16:29:18 +00001018 /* unused, since we already have the result */
1019 (void) va_arg(vargs, PyObject *);
Victor Stinner2af8d2f2014-07-30 00:39:05 +02001020 appendstring(str);
Benjamin Peterson857ce152009-01-31 16:29:18 +00001021 /* We're done with the unicode()/repr() => forget it */
1022 Py_DECREF(*callresult);
1023 /* switch to next unicode()/repr() result */
1024 ++callresult;
1025 break;
1026 }
1027 case 'p':
1028 sprintf(buffer, "%p", va_arg(vargs, void*));
1029 /* %p is ill-defined: ensure leading 0x. */
1030 if (buffer[1] == 'X')
1031 buffer[1] = 'x';
1032 else if (buffer[1] != 'x') {
1033 memmove(buffer+2, buffer, strlen(buffer)+1);
1034 buffer[0] = '0';
1035 buffer[1] = 'x';
1036 }
1037 appendstring(buffer);
1038 break;
1039 case '%':
1040 *s++ = '%';
1041 break;
1042 default:
1043 appendstring(p);
1044 goto end;
1045 }
1046 } else
1047 *s++ = *f;
1048 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001049
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001050 end:
Benjamin Peterson857ce152009-01-31 16:29:18 +00001051 if (callresults)
1052 PyObject_Free(callresults);
1053 if (abuffer)
1054 PyObject_Free(abuffer);
1055 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1056 return string;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001057 fail:
Benjamin Peterson857ce152009-01-31 16:29:18 +00001058 if (callresults) {
1059 PyObject **callresult2 = callresults;
1060 while (callresult2 < callresult) {
1061 Py_DECREF(*callresult2);
1062 ++callresult2;
1063 }
1064 PyObject_Free(callresults);
1065 }
1066 if (abuffer)
1067 PyObject_Free(abuffer);
1068 return NULL;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001069}
1070
1071#undef appendstring
1072
1073PyObject *
1074PyUnicode_FromFormat(const char *format, ...)
1075{
Benjamin Peterson857ce152009-01-31 16:29:18 +00001076 PyObject* ret;
1077 va_list vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001078
1079#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson857ce152009-01-31 16:29:18 +00001080 va_start(vargs, format);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001081#else
Benjamin Peterson857ce152009-01-31 16:29:18 +00001082 va_start(vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001083#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00001084 ret = PyUnicode_FromFormatV(format, vargs);
1085 va_end(vargs);
1086 return ret;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001087}
1088
Martin v. Löwis18e16552006-02-15 17:27:45 +00001089Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001090 wchar_t *w,
1091 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001092{
1093 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001094 PyErr_BadInternalCall();
1095 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001096 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001097
1098 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001099 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001100 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001101
Guido van Rossumd57fd912000-03-10 22:53:23 +00001102#ifdef HAVE_USABLE_WCHAR_T
1103 memcpy(w, unicode->str, size * sizeof(wchar_t));
1104#else
1105 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001106 register Py_UNICODE *u;
1107 register Py_ssize_t i;
1108 u = PyUnicode_AS_UNICODE(unicode);
1109 for (i = size; i > 0; i--)
1110 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001111 }
1112#endif
1113
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001114 if (size > PyUnicode_GET_SIZE(unicode))
1115 return PyUnicode_GET_SIZE(unicode);
1116 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001117 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001118}
1119
1120#endif
1121
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001122PyObject *PyUnicode_FromOrdinal(int ordinal)
1123{
Hye-Shik Chang40574832004-04-06 07:24:51 +00001124 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001125
1126#ifdef Py_UNICODE_WIDE
1127 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001128 PyErr_SetString(PyExc_ValueError,
1129 "unichr() arg not in range(0x110000) "
1130 "(wide Python build)");
1131 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001132 }
1133#else
1134 if (ordinal < 0 || ordinal > 0xffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001135 PyErr_SetString(PyExc_ValueError,
1136 "unichr() arg not in range(0x10000) "
1137 "(narrow Python build)");
1138 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001139 }
1140#endif
1141
Hye-Shik Chang40574832004-04-06 07:24:51 +00001142 s[0] = (Py_UNICODE)ordinal;
1143 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001144}
1145
Guido van Rossumd57fd912000-03-10 22:53:23 +00001146PyObject *PyUnicode_FromObject(register PyObject *obj)
1147{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001148 /* XXX Perhaps we should make this API an alias of
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001149 PyObject_Unicode() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001150 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001151 Py_INCREF(obj);
1152 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001153 }
1154 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001155 /* For a Unicode subtype that's not a Unicode object,
1156 return a true Unicode object with the same data. */
1157 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1158 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001159 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001160 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1161}
1162
1163PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001164 const char *encoding,
1165 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001166{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001167 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001168 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001169 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001170
Guido van Rossumd57fd912000-03-10 22:53:23 +00001171 if (obj == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001172 PyErr_BadInternalCall();
1173 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001174 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001175
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001176#if 0
1177 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001178 that no encodings is given and then redirect to
1179 PyObject_Unicode() which then applies the additional logic for
1180 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001181
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001182 NOTE: This API should really only be used for object which
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001183 represent *encoded* Unicode !
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001184
1185 */
Benjamin Peterson857ce152009-01-31 16:29:18 +00001186 if (PyUnicode_Check(obj)) {
1187 if (encoding) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001188 PyErr_SetString(PyExc_TypeError,
1189 "decoding Unicode is not supported");
1190 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001191 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001192 return PyObject_Unicode(obj);
1193 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001194#else
1195 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001196 PyErr_SetString(PyExc_TypeError,
1197 "decoding Unicode is not supported");
1198 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001199 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001200#endif
1201
1202 /* Coerce object */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001203 if (PyString_Check(obj)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001204 s = PyString_AS_STRING(obj);
1205 len = PyString_GET_SIZE(obj);
Christian Heimes1a6387e2008-03-26 12:49:49 +00001206 }
Christian Heimes3497f942008-05-26 12:29:14 +00001207 else if (PyByteArray_Check(obj)) {
Christian Heimes1a6387e2008-03-26 12:49:49 +00001208 /* Python 2.x specific */
1209 PyErr_Format(PyExc_TypeError,
1210 "decoding bytearray is not supported");
1211 return NULL;
1212 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001213 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001214 /* Overwrite the error message with something more useful in
1215 case of a TypeError. */
1216 if (PyErr_ExceptionMatches(PyExc_TypeError))
1217 PyErr_Format(PyExc_TypeError,
1218 "coercing to Unicode: need string or buffer, "
1219 "%.80s found",
1220 Py_TYPE(obj)->tp_name);
1221 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001222 }
Tim Petersced69f82003-09-16 20:30:58 +00001223
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001224 /* Convert to Unicode */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02001225 if (len == 0)
1226 _Py_RETURN_UNICODE_EMPTY();
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001227
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02001228 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001229 return v;
1230
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001231 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001232 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001233}
1234
1235PyObject *PyUnicode_Decode(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001236 Py_ssize_t size,
1237 const char *encoding,
1238 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001239{
1240 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001241
1242 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001243 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001244
1245 /* Shortcuts for common default encodings */
1246 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001247 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001248 else if (strcmp(encoding, "latin-1") == 0)
1249 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001250#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1251 else if (strcmp(encoding, "mbcs") == 0)
1252 return PyUnicode_DecodeMBCS(s, size, errors);
1253#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001254 else if (strcmp(encoding, "ascii") == 0)
1255 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001256
1257 /* Decode via the codec registry */
1258 buffer = PyBuffer_FromMemory((void *)s, size);
1259 if (buffer == NULL)
1260 goto onError;
Serhiy Storchakac7797dc2015-05-31 20:21:00 +03001261 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001262 if (unicode == NULL)
1263 goto onError;
1264 if (!PyUnicode_Check(unicode)) {
1265 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001266 "decoder did not return an unicode object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001267 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001268 Py_DECREF(unicode);
1269 goto onError;
1270 }
1271 Py_DECREF(buffer);
1272 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001273
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001274 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001275 Py_XDECREF(buffer);
1276 return NULL;
1277}
1278
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001279PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1280 const char *encoding,
1281 const char *errors)
1282{
1283 PyObject *v;
1284
1285 if (!PyUnicode_Check(unicode)) {
1286 PyErr_BadArgument();
1287 goto onError;
1288 }
1289
Serhiy Storchakae37003e2015-12-03 20:47:48 +02001290 if (PyErr_WarnPy3k("decoding Unicode is not supported in 3.x", 1) < 0)
1291 goto onError;
1292
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001293 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001294 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001295
1296 /* Decode via the codec registry */
Serhiy Storchakac7797dc2015-05-31 20:21:00 +03001297 v = _PyCodec_DecodeText(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001298 if (v == NULL)
1299 goto onError;
1300 return v;
1301
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001302 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001303 return NULL;
1304}
1305
Guido van Rossumd57fd912000-03-10 22:53:23 +00001306PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001307 Py_ssize_t size,
1308 const char *encoding,
1309 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001310{
1311 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001312
Guido van Rossumd57fd912000-03-10 22:53:23 +00001313 unicode = PyUnicode_FromUnicode(s, size);
1314 if (unicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001315 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001316 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1317 Py_DECREF(unicode);
1318 return v;
1319}
1320
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001321PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1322 const char *encoding,
1323 const char *errors)
1324{
1325 PyObject *v;
1326
1327 if (!PyUnicode_Check(unicode)) {
1328 PyErr_BadArgument();
1329 goto onError;
1330 }
1331
1332 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001333 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001334
1335 /* Encode via the codec registry */
Serhiy Storchakac7797dc2015-05-31 20:21:00 +03001336 v = _PyCodec_EncodeText(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001337 if (v == NULL)
1338 goto onError;
1339 return v;
1340
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001341 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001342 return NULL;
1343}
1344
Guido van Rossumd57fd912000-03-10 22:53:23 +00001345PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1346 const char *encoding,
1347 const char *errors)
1348{
1349 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001350
Guido van Rossumd57fd912000-03-10 22:53:23 +00001351 if (!PyUnicode_Check(unicode)) {
1352 PyErr_BadArgument();
1353 goto onError;
1354 }
Fred Drakee4315f52000-05-09 19:53:39 +00001355
Tim Petersced69f82003-09-16 20:30:58 +00001356 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001357 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001358
1359 /* Shortcuts for common default encodings */
1360 if (errors == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001361 if (strcmp(encoding, "utf-8") == 0)
1362 return PyUnicode_AsUTF8String(unicode);
1363 else if (strcmp(encoding, "latin-1") == 0)
1364 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001365#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001366 else if (strcmp(encoding, "mbcs") == 0)
1367 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001368#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001369 else if (strcmp(encoding, "ascii") == 0)
1370 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001371 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001372
1373 /* Encode via the codec registry */
Serhiy Storchakac7797dc2015-05-31 20:21:00 +03001374 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001375 if (v == NULL)
1376 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001377 if (!PyString_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001378 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001379 "encoder did not return a string object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001380 Py_TYPE(v)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001381 Py_DECREF(v);
1382 goto onError;
1383 }
1384 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001385
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001386 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001387 return NULL;
1388}
1389
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001390PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001391 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001392{
1393 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1394
1395 if (v)
1396 return v;
1397 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1398 if (v && errors == NULL)
1399 ((PyUnicodeObject *)unicode)->defenc = v;
1400 return v;
1401}
1402
Guido van Rossumd57fd912000-03-10 22:53:23 +00001403Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1404{
1405 if (!PyUnicode_Check(unicode)) {
1406 PyErr_BadArgument();
1407 goto onError;
1408 }
1409 return PyUnicode_AS_UNICODE(unicode);
1410
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001411 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001412 return NULL;
1413}
1414
Martin v. Löwis18e16552006-02-15 17:27:45 +00001415Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001416{
1417 if (!PyUnicode_Check(unicode)) {
1418 PyErr_BadArgument();
1419 goto onError;
1420 }
1421 return PyUnicode_GET_SIZE(unicode);
1422
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001423 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001424 return -1;
1425}
1426
Thomas Wouters78890102000-07-22 19:25:51 +00001427const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001428{
1429 return unicode_default_encoding;
1430}
1431
1432int PyUnicode_SetDefaultEncoding(const char *encoding)
1433{
1434 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001435
Fred Drakee4315f52000-05-09 19:53:39 +00001436 /* Make sure the encoding is valid. As side effect, this also
1437 loads the encoding into the codec registry cache. */
1438 v = _PyCodec_Lookup(encoding);
1439 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001440 goto onError;
Fred Drakee4315f52000-05-09 19:53:39 +00001441 Py_DECREF(v);
1442 strncpy(unicode_default_encoding,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001443 encoding,
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02001444 sizeof(unicode_default_encoding) - 1);
Fred Drakee4315f52000-05-09 19:53:39 +00001445 return 0;
1446
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001447 onError:
Fred Drakee4315f52000-05-09 19:53:39 +00001448 return -1;
1449}
1450
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001451/* error handling callback helper:
1452 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001453 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001454 and adjust various state variables.
1455 return 0 on success, -1 on error
1456*/
1457
1458static
1459int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001460 const char *encoding, const char *reason,
1461 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1462 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1463 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001464{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001465 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001466
1467 PyObject *restuple = NULL;
1468 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001469 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1470 Py_ssize_t requiredsize;
1471 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001472 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001473 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001474 int res = -1;
1475
1476 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001477 *errorHandler = PyCodec_LookupError(errors);
1478 if (*errorHandler == NULL)
1479 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001480 }
1481
1482 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001483 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001484 encoding, input, insize, *startinpos, *endinpos, reason);
1485 if (*exceptionObject == NULL)
1486 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001487 }
1488 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001489 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1490 goto onError;
1491 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1492 goto onError;
1493 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1494 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001495 }
1496
1497 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1498 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001499 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001500 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00001501 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001502 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001503 }
1504 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001505 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001506 if (newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001507 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001508 if (newpos<0 || newpos>insize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001509 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1510 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001511 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001512
1513 /* need more space? (at least enough for what we
1514 have+the replacement+the rest of the string (starting
1515 at the new input position), so we won't have to check space
1516 when there are no errors in the rest of the string) */
1517 repptr = PyUnicode_AS_UNICODE(repunicode);
1518 repsize = PyUnicode_GET_SIZE(repunicode);
Benjamin Petersoned4c1302014-09-29 18:18:57 -04001519 requiredsize = *outpos;
1520 if (requiredsize > PY_SSIZE_T_MAX - repsize)
1521 goto overflow;
1522 requiredsize += repsize;
1523 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
1524 goto overflow;
1525 requiredsize += insize - newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001526 if (requiredsize > outsize) {
Benjamin Petersoned4c1302014-09-29 18:18:57 -04001527 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001528 requiredsize = 2*outsize;
1529 if (_PyUnicode_Resize(output, requiredsize) < 0)
1530 goto onError;
1531 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001532 }
1533 *endinpos = newpos;
1534 *inptr = input + newpos;
1535 Py_UNICODE_COPY(*outptr, repptr, repsize);
1536 *outptr += repsize;
1537 *outpos += repsize;
1538 /* we made it! */
1539 res = 0;
1540
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001541 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001542 Py_XDECREF(restuple);
1543 return res;
Benjamin Petersoned4c1302014-09-29 18:18:57 -04001544
1545 overflow:
1546 PyErr_SetString(PyExc_OverflowError,
1547 "decoded result is too long for a Python string");
1548 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001549}
1550
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */
1554
/* Three simple macros defining base-64.
 *
 * Note: every macro below except TO_BASE64 evaluates its argument more
 * than once, so the argument must not have side effects. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
1585
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets, indexed by ASCII code (0..127):
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
1619
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        the character; evaluated several times, so it must not have
 *          side effects.  Only values 1..127 can be direct.
 * directO  non-zero: "Set O" characters (category 1) are direct
 * directWS non-zero: whitespace characters (category 2) are direct
 * "Set D" characters (category 0) are always direct. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001631
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001632PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001633 Py_ssize_t size,
1634 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001635{
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001636 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1637}
1638
Antoine Pitrou653dece2009-05-04 18:32:32 +00001639/* The decoder. The only state we preserve is our read position,
1640 * i.e. how many characters we have consumed. So if we end in the
1641 * middle of a shift sequence we have to back off the read position
1642 * and the output to the beginning of the sequence, otherwise we lose
1643 * all the shift state (seen bits, number of bits seen, high
1644 * surrogate). */
1645
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001646PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001647 Py_ssize_t size,
1648 const char *errors,
1649 Py_ssize_t *consumed)
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001650{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001651 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001652 Py_ssize_t startinpos;
1653 Py_ssize_t endinpos;
1654 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001655 const char *e;
1656 PyUnicodeObject *unicode;
1657 Py_UNICODE *p;
1658 const char *errmsg = "";
1659 int inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001660 Py_UNICODE *shiftOutStart;
1661 unsigned int base64bits = 0;
1662 unsigned long base64buffer = 0;
1663 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001664 PyObject *errorHandler = NULL;
1665 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001666
1667 unicode = _PyUnicode_New(size);
1668 if (!unicode)
1669 return NULL;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001670 if (size == 0) {
1671 if (consumed)
1672 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001673 return (PyObject *)unicode;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001674 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001675
1676 p = unicode->str;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001677 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001678 e = s + size;
1679
1680 while (s < e) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001681 Py_UNICODE ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001682
Antoine Pitrou653dece2009-05-04 18:32:32 +00001683 if (inShift) { /* in a base-64 section */
1684 if (IS_BASE64(ch)) { /* consume a base-64 character */
1685 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1686 base64bits += 6;
1687 s++;
1688 if (base64bits >= 16) {
1689 /* we have enough bits for a UTF-16 value */
1690 Py_UNICODE outCh = (Py_UNICODE)
1691 (base64buffer >> (base64bits-16));
1692 base64bits -= 16;
1693 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchakaf1056722013-10-19 20:37:49 +03001694 assert(outCh <= 0xffff);
Antoine Pitrou653dece2009-05-04 18:32:32 +00001695 if (surrogate) {
1696 /* expecting a second surrogate */
1697 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1698#ifdef Py_UNICODE_WIDE
1699 *p++ = (((surrogate & 0x3FF)<<10)
1700 | (outCh & 0x3FF)) + 0x10000;
1701#else
1702 *p++ = surrogate;
1703 *p++ = outCh;
1704#endif
1705 surrogate = 0;
Antoine Pitrou30402542011-11-15 01:49:40 +01001706 continue;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001707 }
1708 else {
Antoine Pitrou30402542011-11-15 01:49:40 +01001709 *p++ = surrogate;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001710 surrogate = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001711 }
1712 }
Antoine Pitrou30402542011-11-15 01:49:40 +01001713 if (outCh >= 0xD800 && outCh <= 0xDBFF) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001714 /* first surrogate */
1715 surrogate = outCh;
1716 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001717 else {
1718 *p++ = outCh;
1719 }
1720 }
1721 }
1722 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001723 inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001724 if (base64bits > 0) { /* left-over bits */
1725 if (base64bits >= 6) {
1726 /* We've seen at least one base-64 character */
Serhiy Storchakae12f6322015-10-02 13:14:53 +03001727 s++;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001728 errmsg = "partial character in shift sequence";
1729 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001730 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001731 else {
1732 /* Some bits remain; they should be zero */
1733 if (base64buffer != 0) {
Serhiy Storchakae12f6322015-10-02 13:14:53 +03001734 s++;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001735 errmsg = "non-zero padding bits in shift sequence";
1736 goto utf7Error;
1737 }
1738 }
1739 }
Serhiy Storchakae12f6322015-10-02 13:14:53 +03001740 if (surrogate && DECODE_DIRECT(ch))
1741 *p++ = surrogate;
1742 surrogate = 0;
1743 if (ch == '-') {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001744 /* '-' is absorbed; other terminating
1745 characters are preserved */
Serhiy Storchakae12f6322015-10-02 13:14:53 +03001746 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001747 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001748 }
1749 }
1750 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001751 startinpos = s-starts;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001752 s++; /* consume '+' */
1753 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001754 s++;
1755 *p++ = '+';
Antoine Pitrou653dece2009-05-04 18:32:32 +00001756 }
1757 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001758 inShift = 1;
Serhiy Storchakae12f6322015-10-02 13:14:53 +03001759 surrogate = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001760 shiftOutStart = p;
1761 base64bits = 0;
Serhiy Storchakaf1056722013-10-19 20:37:49 +03001762 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001763 }
1764 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001765 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001766 *p++ = ch;
1767 s++;
1768 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001769 else {
1770 startinpos = s-starts;
1771 s++;
1772 errmsg = "unexpected special character";
1773 goto utf7Error;
1774 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001775 continue;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001776utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001777 outpos = p-PyUnicode_AS_UNICODE(unicode);
1778 endinpos = s-starts;
1779 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001780 errors, &errorHandler,
1781 "utf7", errmsg,
1782 starts, size, &startinpos, &endinpos, &exc, &s,
1783 &unicode, &outpos, &p))
1784 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001785 }
1786
Antoine Pitrou653dece2009-05-04 18:32:32 +00001787 /* end of string */
1788
1789 if (inShift && !consumed) { /* in shift sequence, no more to follow */
1790 /* if we're in an inconsistent state, that's an error */
Serhiy Storchakae12f6322015-10-02 13:14:53 +03001791 inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001792 if (surrogate ||
1793 (base64bits >= 6) ||
1794 (base64bits > 0 && base64buffer != 0)) {
1795 outpos = p-PyUnicode_AS_UNICODE(unicode);
1796 endinpos = size;
1797 if (unicode_decode_call_errorhandler(
1798 errors, &errorHandler,
1799 "utf7", "unterminated shift sequence",
1800 starts, size, &startinpos, &endinpos, &exc, &s,
1801 &unicode, &outpos, &p))
1802 goto onError;
1803 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001804 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001805
1806 /* return state */
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001807 if (consumed) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001808 if (inShift) {
1809 p = shiftOutStart; /* back off output */
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001810 *consumed = startinpos;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001811 }
1812 else {
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001813 *consumed = s-starts;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001814 }
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001815 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001816
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001817 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001818 goto onError;
1819
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001820 Py_XDECREF(errorHandler);
1821 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001822 return (PyObject *)unicode;
1823
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001824 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001825 Py_XDECREF(errorHandler);
1826 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001827 Py_DECREF(unicode);
1828 return NULL;
1829}
1830
1831
1832PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001833 Py_ssize_t size,
Antoine Pitrou653dece2009-05-04 18:32:32 +00001834 int base64SetO,
1835 int base64WhiteSpace,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001836 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001837{
1838 PyObject *v;
1839 /* It might be possible to tighten this worst case */
Alexandre Vassalottifd009162009-07-07 02:17:30 +00001840 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001841 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001842 Py_ssize_t i = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001843 unsigned int base64bits = 0;
1844 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001845 char * out;
1846 char * start;
1847
Alexandre Vassalottifd009162009-07-07 02:17:30 +00001848 if (allocated / 8 != size)
Neal Norwitze7d8be82008-07-31 17:17:14 +00001849 return PyErr_NoMemory();
1850
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001851 if (size == 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00001852 return PyString_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001853
Antoine Pitrou653dece2009-05-04 18:32:32 +00001854 v = PyString_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001855 if (v == NULL)
1856 return NULL;
1857
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001858 start = out = PyString_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001859 for (;i < size; ++i) {
1860 Py_UNICODE ch = s[i];
1861
Antoine Pitrou653dece2009-05-04 18:32:32 +00001862 if (inShift) {
1863 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1864 /* shifting out */
1865 if (base64bits) { /* output remaining bits */
1866 *out++ = TO_BASE64(base64buffer << (6-base64bits));
1867 base64buffer = 0;
1868 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001869 }
1870 inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001871 /* Characters not in the BASE64 set implicitly unshift the sequence
1872 so no '-' is required, except if the character is itself a '-' */
1873 if (IS_BASE64(ch) || ch == '-') {
1874 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001875 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001876 *out++ = (char) ch;
1877 }
1878 else {
1879 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00001880 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001881 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001882 else { /* not in a shift sequence */
1883 if (ch == '+') {
1884 *out++ = '+';
1885 *out++ = '-';
1886 }
1887 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1888 *out++ = (char) ch;
1889 }
1890 else {
1891 *out++ = '+';
1892 inShift = 1;
1893 goto encode_char;
1894 }
1895 }
1896 continue;
1897encode_char:
1898#ifdef Py_UNICODE_WIDE
1899 if (ch >= 0x10000) {
1900 /* code first surrogate */
1901 base64bits += 16;
1902 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
1903 while (base64bits >= 6) {
1904 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1905 base64bits -= 6;
1906 }
1907 /* prepare second surrogate */
1908 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
1909 }
1910#endif
1911 base64bits += 16;
1912 base64buffer = (base64buffer << 16) | ch;
1913 while (base64bits >= 6) {
1914 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1915 base64bits -= 6;
1916 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001917 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001918 if (base64bits)
1919 *out++= TO_BASE64(base64buffer << (6-base64bits) );
1920 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001921 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001922
Benjamin Petersonbea424a2010-04-03 00:57:33 +00001923 if (_PyString_Resize(&v, out - start))
1924 return NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001925 return v;
1926}
1927
/* The UTF-7 helper macros are private to the codec above. */
#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001933
/* --- UTF-8 Codec -------------------------------------------------------- */
1935
static
char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details.
       Indexed by the first byte of a sequence (0x00..0xFF). */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
1957
Guido van Rossumd57fd912000-03-10 22:53:23 +00001958PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001959 Py_ssize_t size,
1960 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001961{
Walter Dörwald69652032004-09-07 20:24:22 +00001962 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1963}
1964
1965PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001966 Py_ssize_t size,
1967 const char *errors,
1968 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001969{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001970 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001971 int n;
Ezio Melottie57e50c2010-06-05 17:51:07 +00001972 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001973 Py_ssize_t startinpos;
1974 Py_ssize_t endinpos;
1975 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001976 const char *e;
1977 PyUnicodeObject *unicode;
1978 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001979 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001980 PyObject *errorHandler = NULL;
1981 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001982
1983 /* Note: size will always be longer than the resulting Unicode
1984 character count */
1985 unicode = _PyUnicode_New(size);
1986 if (!unicode)
1987 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001988 if (size == 0) {
1989 if (consumed)
1990 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001991 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001992 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001993
1994 /* Unpack UTF-8 encoded data */
1995 p = unicode->str;
1996 e = s + size;
1997
1998 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001999 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002000
2001 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002002 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002003 s++;
2004 continue;
2005 }
2006
2007 n = utf8_code_length[ch];
2008
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002009 if (s + n > e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002010 if (consumed)
2011 break;
2012 else {
2013 errmsg = "unexpected end of data";
2014 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00002015 endinpos = startinpos+1;
2016 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2017 endinpos++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002018 goto utf8Error;
2019 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00002020 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002021
2022 switch (n) {
2023
2024 case 0:
Ezio Melottie57e50c2010-06-05 17:51:07 +00002025 errmsg = "invalid start byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002026 startinpos = s-starts;
2027 endinpos = startinpos+1;
2028 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002029
2030 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002031 errmsg = "internal error";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002032 startinpos = s-starts;
2033 endinpos = startinpos+1;
2034 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002035
2036 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002037 if ((s[1] & 0xc0) != 0x80) {
Ezio Melottie57e50c2010-06-05 17:51:07 +00002038 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002039 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00002040 endinpos = startinpos + 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002041 goto utf8Error;
2042 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002043 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melottie57e50c2010-06-05 17:51:07 +00002044 assert ((ch > 0x007F) && (ch <= 0x07FF));
2045 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002046 break;
2047
2048 case 3:
Ezio Melottie57e50c2010-06-05 17:51:07 +00002049 /* XXX: surrogates shouldn't be valid UTF-8!
2050 see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2051 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
2052 Uncomment the 2 lines below to make them invalid,
Serhiy Storchakae8c9e142015-01-18 11:42:50 +02002053 code points: d800-dfff; UTF-8: \xed\xa0\x80-\xed\xbf\xbf. */
Tim Petersced69f82003-09-16 20:30:58 +00002054 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melottie57e50c2010-06-05 17:51:07 +00002055 (s[2] & 0xc0) != 0x80 ||
2056 ((unsigned char)s[0] == 0xE0 &&
2057 (unsigned char)s[1] < 0xA0)/* ||
2058 ((unsigned char)s[0] == 0xED &&
2059 (unsigned char)s[1] > 0x9F)*/) {
2060 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002061 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00002062 endinpos = startinpos + 1;
2063
2064 /* if s[1] first two bits are 1 and 0, then the invalid
2065 continuation byte is s[2], so increment endinpos by 1,
2066 if not, s[1] is invalid and endinpos doesn't need to
2067 be incremented. */
2068 if ((s[1] & 0xC0) == 0x80)
2069 endinpos++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002070 goto utf8Error;
2071 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002072 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melottie57e50c2010-06-05 17:51:07 +00002073 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2074 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002075 break;
2076
2077 case 4:
2078 if ((s[1] & 0xc0) != 0x80 ||
2079 (s[2] & 0xc0) != 0x80 ||
Ezio Melottie57e50c2010-06-05 17:51:07 +00002080 (s[3] & 0xc0) != 0x80 ||
2081 ((unsigned char)s[0] == 0xF0 &&
2082 (unsigned char)s[1] < 0x90) ||
2083 ((unsigned char)s[0] == 0xF4 &&
2084 (unsigned char)s[1] > 0x8F)) {
2085 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002086 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00002087 endinpos = startinpos + 1;
2088 if ((s[1] & 0xC0) == 0x80) {
2089 endinpos++;
2090 if ((s[2] & 0xC0) == 0x80)
2091 endinpos++;
2092 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002093 goto utf8Error;
2094 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002095 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melottie57e50c2010-06-05 17:51:07 +00002096 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2097 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2098
Fredrik Lundh8f455852001-06-27 18:59:43 +00002099#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002100 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002101#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002102 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002103
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002104 /* translate from 10000..10FFFF to 0..FFFF */
2105 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002106
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002107 /* high surrogate = top 10 bits added to D800 */
2108 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002109
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002110 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002111 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002112#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002113 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002114 }
2115 s += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002116 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002117
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002118 utf8Error:
2119 outpos = p-PyUnicode_AS_UNICODE(unicode);
2120 if (unicode_decode_call_errorhandler(
2121 errors, &errorHandler,
2122 "utf8", errmsg,
2123 starts, size, &startinpos, &endinpos, &exc, &s,
2124 &unicode, &outpos, &p))
2125 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002126 }
Walter Dörwald69652032004-09-07 20:24:22 +00002127 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002128 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002129
2130 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002131 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002132 goto onError;
2133
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002134 Py_XDECREF(errorHandler);
2135 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002136 return (PyObject *)unicode;
2137
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002138 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002139 Py_XDECREF(errorHandler);
2140 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002141 Py_DECREF(unicode);
2142 return NULL;
2143}
2144
Tim Peters602f7402002-04-27 18:03:26 +00002145/* Allocation strategy: if the string is short, convert into a stack buffer
2146 and allocate exactly as much space needed at the end. Else allocate the
2147 maximum possible needed (4 result bytes per Unicode character), and return
2148 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002149*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002150PyObject *
2151PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002152 Py_ssize_t size,
2153 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002154{
Tim Peters602f7402002-04-27 18:03:26 +00002155#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002156
Martin v. Löwis18e16552006-02-15 17:27:45 +00002157 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00002158 PyObject *v; /* result string object */
2159 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002160 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002161 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002162 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002163
Tim Peters602f7402002-04-27 18:03:26 +00002164 assert(s != NULL);
2165 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002166
Tim Peters602f7402002-04-27 18:03:26 +00002167 if (size <= MAX_SHORT_UNICHARS) {
2168 /* Write into the stack buffer; nallocated can't overflow.
2169 * At the end, we'll allocate exactly as much heap space as it
2170 * turns out we need.
2171 */
2172 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2173 v = NULL; /* will allocate after we're done */
2174 p = stackbuf;
2175 }
2176 else {
2177 /* Overallocate on the heap, and give the excess back at the end. */
2178 nallocated = size * 4;
2179 if (nallocated / 4 != size) /* overflow! */
2180 return PyErr_NoMemory();
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002181 v = PyString_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00002182 if (v == NULL)
2183 return NULL;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002184 p = PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002185 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002186
Tim Peters602f7402002-04-27 18:03:26 +00002187 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002188 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002189
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002190 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002191 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002192 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002193
Guido van Rossumd57fd912000-03-10 22:53:23 +00002194 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002195 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002196 *p++ = (char)(0xc0 | (ch >> 6));
2197 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002198 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002199 else {
Tim Peters602f7402002-04-27 18:03:26 +00002200 /* Encode UCS2 Unicode ordinals */
2201 if (ch < 0x10000) {
2202 /* Special case: check for high surrogate */
2203 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2204 Py_UCS4 ch2 = s[i];
2205 /* Check for low surrogate and combine the two to
2206 form a UCS4 value */
2207 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002208 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002209 i++;
2210 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002211 }
Tim Peters602f7402002-04-27 18:03:26 +00002212 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002213 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002214 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002215 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2216 *p++ = (char)(0x80 | (ch & 0x3f));
2217 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00002218 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002219 encodeUCS4:
Tim Peters602f7402002-04-27 18:03:26 +00002220 /* Encode UCS4 Unicode ordinals */
2221 *p++ = (char)(0xf0 | (ch >> 18));
2222 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2223 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2224 *p++ = (char)(0x80 | (ch & 0x3f));
2225 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002226 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002227
Tim Peters602f7402002-04-27 18:03:26 +00002228 if (v == NULL) {
2229 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002230 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002231 assert(nneeded <= nallocated);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002232 v = PyString_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002233 }
2234 else {
Benjamin Peterson857ce152009-01-31 16:29:18 +00002235 /* Cut back to size actually needed. */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002236 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002237 assert(nneeded <= nallocated);
Benjamin Petersonbea424a2010-04-03 00:57:33 +00002238 if (_PyString_Resize(&v, nneeded))
2239 return NULL;
Tim Peters602f7402002-04-27 18:03:26 +00002240 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002241 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002242
Tim Peters602f7402002-04-27 18:03:26 +00002243#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002244}
2245
Guido van Rossumd57fd912000-03-10 22:53:23 +00002246PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2247{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002248 if (!PyUnicode_Check(unicode)) {
2249 PyErr_BadArgument();
2250 return NULL;
2251 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002252 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002253 PyUnicode_GET_SIZE(unicode),
2254 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002255}
2256
/* --- UTF-32 Codec ------------------------------------------------------- */
2258
2259PyObject *
2260PyUnicode_DecodeUTF32(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002261 Py_ssize_t size,
2262 const char *errors,
2263 int *byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002264{
2265 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2266}
2267
2268PyObject *
2269PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002270 Py_ssize_t size,
2271 const char *errors,
2272 int *byteorder,
2273 Py_ssize_t *consumed)
Walter Dörwald6e390802007-08-17 16:41:28 +00002274{
2275 const char *starts = s;
2276 Py_ssize_t startinpos;
2277 Py_ssize_t endinpos;
2278 Py_ssize_t outpos;
2279 PyUnicodeObject *unicode;
2280 Py_UNICODE *p;
2281#ifndef Py_UNICODE_WIDE
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002282 int pairs = 0;
Benjamin Peterson8e5effa2010-06-12 17:47:06 +00002283 const unsigned char *qq;
Walter Dörwald6e390802007-08-17 16:41:28 +00002284#else
2285 const int pairs = 0;
2286#endif
Benjamin Peterson8e5effa2010-06-12 17:47:06 +00002287 const unsigned char *q, *e;
Walter Dörwald6e390802007-08-17 16:41:28 +00002288 int bo = 0; /* assume native ordering by default */
2289 const char *errmsg = "";
Walter Dörwald20b40d32007-08-17 16:52:50 +00002290 /* Offsets from q for retrieving bytes in the right order. */
2291#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2292 int iorder[] = {0, 1, 2, 3};
2293#else
2294 int iorder[] = {3, 2, 1, 0};
2295#endif
Walter Dörwald9ab80a92007-08-17 16:58:43 +00002296 PyObject *errorHandler = NULL;
2297 PyObject *exc = NULL;
Chris Jerdonekad4b0002012-10-07 20:37:54 -07002298
Walter Dörwald6e390802007-08-17 16:41:28 +00002299 q = (unsigned char *)s;
2300 e = q + size;
2301
2302 if (byteorder)
2303 bo = *byteorder;
2304
2305 /* Check for BOM marks (U+FEFF) in the input and adjust current
2306 byte order setting accordingly. In native mode, the leading BOM
2307 mark is skipped, in all other modes, it is copied to the output
2308 stream as-is (giving a ZWNBSP character). */
2309 if (bo == 0) {
2310 if (size >= 4) {
Benjamin Peterson89f676f2016-09-06 20:40:04 -07002311 const Py_UCS4 bom = ((unsigned int)q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002312 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002313#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002314 if (bom == 0x0000FEFF) {
2315 q += 4;
2316 bo = -1;
2317 }
2318 else if (bom == 0xFFFE0000) {
2319 q += 4;
2320 bo = 1;
2321 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002322#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002323 if (bom == 0x0000FEFF) {
2324 q += 4;
2325 bo = 1;
2326 }
2327 else if (bom == 0xFFFE0000) {
2328 q += 4;
2329 bo = -1;
2330 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002331#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002332 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002333 }
2334
2335 if (bo == -1) {
2336 /* force LE */
2337 iorder[0] = 0;
2338 iorder[1] = 1;
2339 iorder[2] = 2;
2340 iorder[3] = 3;
2341 }
2342 else if (bo == 1) {
2343 /* force BE */
2344 iorder[0] = 3;
2345 iorder[1] = 2;
2346 iorder[2] = 1;
2347 iorder[3] = 0;
2348 }
2349
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002350 /* On narrow builds we split characters outside the BMP into two
Serhiy Storchakae8c9e142015-01-18 11:42:50 +02002351 code points => count how much extra space we need. */
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002352#ifndef Py_UNICODE_WIDE
Serhiy Storchakac9631a12013-01-08 22:43:18 +02002353 for (qq = q; e - qq >= 4; qq += 4)
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002354 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2355 pairs++;
2356#endif
2357
2358 /* This might be one to much, because of a BOM */
2359 unicode = _PyUnicode_New((size+3)/4+pairs);
2360 if (!unicode)
2361 return NULL;
2362 if (size == 0)
2363 return (PyObject *)unicode;
2364
2365 /* Unpack UTF-32 encoded data */
2366 p = unicode->str;
2367
Walter Dörwald6e390802007-08-17 16:41:28 +00002368 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002369 Py_UCS4 ch;
2370 /* remaining bytes at the end? (size should be divisible by 4) */
2371 if (e-q<4) {
2372 if (consumed)
2373 break;
2374 errmsg = "truncated data";
2375 startinpos = ((const char *)q)-starts;
2376 endinpos = ((const char *)e)-starts;
2377 goto utf32Error;
2378 /* The remaining input chars are ignored if the callback
2379 chooses to skip the input */
2380 }
Benjamin Peterson89f676f2016-09-06 20:40:04 -07002381 ch = ((unsigned int)q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002382 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002383
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002384 if (ch >= 0x110000)
2385 {
Serhiy Storchakae8c9e142015-01-18 11:42:50 +02002386 errmsg = "code point not in range(0x110000)";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002387 startinpos = ((const char *)q)-starts;
2388 endinpos = startinpos+4;
2389 goto utf32Error;
2390 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002391#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002392 if (ch >= 0x10000)
2393 {
2394 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2395 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2396 }
2397 else
Walter Dörwald6e390802007-08-17 16:41:28 +00002398#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002399 *p++ = ch;
2400 q += 4;
2401 continue;
2402 utf32Error:
2403 outpos = p-PyUnicode_AS_UNICODE(unicode);
2404 if (unicode_decode_call_errorhandler(
2405 errors, &errorHandler,
2406 "utf32", errmsg,
Georg Brandle9741f32009-09-17 11:28:09 +00002407 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002408 &unicode, &outpos, &p))
2409 goto onError;
Walter Dörwald6e390802007-08-17 16:41:28 +00002410 }
2411
2412 if (byteorder)
2413 *byteorder = bo;
2414
2415 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002416 *consumed = (const char *)q-starts;
Walter Dörwald6e390802007-08-17 16:41:28 +00002417
2418 /* Adjust length */
2419 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2420 goto onError;
2421
2422 Py_XDECREF(errorHandler);
2423 Py_XDECREF(exc);
2424 return (PyObject *)unicode;
2425
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002426 onError:
Walter Dörwald6e390802007-08-17 16:41:28 +00002427 Py_DECREF(unicode);
2428 Py_XDECREF(errorHandler);
2429 Py_XDECREF(exc);
2430 return NULL;
2431}
2432
2433PyObject *
2434PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002435 Py_ssize_t size,
2436 const char *errors,
2437 int byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002438{
2439 PyObject *v;
2440 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002441 Py_ssize_t nsize, bytesize;
Walter Dörwald6e390802007-08-17 16:41:28 +00002442#ifndef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002443 Py_ssize_t i, pairs;
Walter Dörwald6e390802007-08-17 16:41:28 +00002444#else
2445 const int pairs = 0;
2446#endif
2447 /* Offsets from p for storing byte pairs in the right order. */
2448#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2449 int iorder[] = {0, 1, 2, 3};
2450#else
2451 int iorder[] = {3, 2, 1, 0};
2452#endif
2453
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002454#define STORECHAR(CH) \
2455 do { \
2456 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2457 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2458 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2459 p[iorder[0]] = (CH) & 0xff; \
2460 p += 4; \
Walter Dörwald6e390802007-08-17 16:41:28 +00002461 } while(0)
2462
Serhiy Storchakae8c9e142015-01-18 11:42:50 +02002463 /* In narrow builds we can output surrogate pairs as one code point,
Walter Dörwald6e390802007-08-17 16:41:28 +00002464 so we need less space. */
2465#ifndef Py_UNICODE_WIDE
2466 for (i = pairs = 0; i < size-1; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002467 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2468 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2469 pairs++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002470#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002471 nsize = (size - pairs + (byteorder == 0));
2472 bytesize = nsize * 4;
2473 if (bytesize / 4 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002474 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002475 v = PyString_FromStringAndSize(NULL, bytesize);
Walter Dörwald6e390802007-08-17 16:41:28 +00002476 if (v == NULL)
2477 return NULL;
2478
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002479 p = (unsigned char *)PyString_AS_STRING(v);
Walter Dörwald6e390802007-08-17 16:41:28 +00002480 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002481 STORECHAR(0xFEFF);
Walter Dörwald6e390802007-08-17 16:41:28 +00002482 if (size == 0)
2483 return v;
2484
2485 if (byteorder == -1) {
2486 /* force LE */
2487 iorder[0] = 0;
2488 iorder[1] = 1;
2489 iorder[2] = 2;
2490 iorder[3] = 3;
2491 }
2492 else if (byteorder == 1) {
2493 /* force BE */
2494 iorder[0] = 3;
2495 iorder[1] = 2;
2496 iorder[2] = 1;
2497 iorder[3] = 0;
2498 }
2499
2500 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002501 Py_UCS4 ch = *s++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002502#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002503 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2504 Py_UCS4 ch2 = *s;
2505 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2506 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2507 s++;
2508 size--;
2509 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00002510 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002511#endif
2512 STORECHAR(ch);
2513 }
2514 return v;
2515#undef STORECHAR
2516}
2517
2518PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2519{
2520 if (!PyUnicode_Check(unicode)) {
2521 PyErr_BadArgument();
2522 return NULL;
2523 }
2524 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002525 PyUnicode_GET_SIZE(unicode),
2526 NULL,
2527 0);
Walter Dörwald6e390802007-08-17 16:41:28 +00002528}
2529
/* --- UTF-16 Codec ------------------------------------------------------- */
2531
Tim Peters772747b2001-08-09 22:21:55 +00002532PyObject *
2533PyUnicode_DecodeUTF16(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002534 Py_ssize_t size,
2535 const char *errors,
2536 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002537{
Walter Dörwald69652032004-09-07 20:24:22 +00002538 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2539}
2540
2541PyObject *
2542PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002543 Py_ssize_t size,
2544 const char *errors,
2545 int *byteorder,
2546 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002547{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002548 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002549 Py_ssize_t startinpos;
2550 Py_ssize_t endinpos;
2551 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002552 PyUnicodeObject *unicode;
2553 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002554 const unsigned char *q, *e;
2555 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002556 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002557 /* Offsets from q for retrieving byte pairs in the right order. */
2558#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2559 int ihi = 1, ilo = 0;
2560#else
2561 int ihi = 0, ilo = 1;
2562#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002563 PyObject *errorHandler = NULL;
2564 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002565
2566 /* Note: size will always be longer than the resulting Unicode
2567 character count */
2568 unicode = _PyUnicode_New(size);
2569 if (!unicode)
2570 return NULL;
2571 if (size == 0)
2572 return (PyObject *)unicode;
2573
2574 /* Unpack UTF-16 encoded data */
2575 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002576 q = (unsigned char *)s;
2577 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002578
2579 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002580 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002581
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002582 /* Check for BOM marks (U+FEFF) in the input and adjust current
2583 byte order setting accordingly. In native mode, the leading BOM
2584 mark is skipped, in all other modes, it is copied to the output
2585 stream as-is (giving a ZWNBSP character). */
2586 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002587 if (size >= 2) {
2588 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002589#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002590 if (bom == 0xFEFF) {
2591 q += 2;
2592 bo = -1;
2593 }
2594 else if (bom == 0xFFFE) {
2595 q += 2;
2596 bo = 1;
2597 }
Tim Petersced69f82003-09-16 20:30:58 +00002598#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002599 if (bom == 0xFEFF) {
2600 q += 2;
2601 bo = 1;
2602 }
2603 else if (bom == 0xFFFE) {
2604 q += 2;
2605 bo = -1;
2606 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002607#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002608 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002609 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002610
Tim Peters772747b2001-08-09 22:21:55 +00002611 if (bo == -1) {
2612 /* force LE */
2613 ihi = 1;
2614 ilo = 0;
2615 }
2616 else if (bo == 1) {
2617 /* force BE */
2618 ihi = 0;
2619 ilo = 1;
2620 }
2621
2622 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002623 Py_UNICODE ch;
2624 /* remaining bytes at the end? (size should be even) */
2625 if (e-q<2) {
2626 if (consumed)
2627 break;
2628 errmsg = "truncated data";
2629 startinpos = ((const char *)q)-starts;
2630 endinpos = ((const char *)e)-starts;
2631 goto utf16Error;
2632 /* The remaining input chars are ignored if the callback
2633 chooses to skip the input */
2634 }
2635 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002636
Benjamin Peterson857ce152009-01-31 16:29:18 +00002637 q += 2;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002638
2639 if (ch < 0xD800 || ch > 0xDFFF) {
2640 *p++ = ch;
2641 continue;
2642 }
2643
2644 /* UTF-16 code pair: */
Antoine Pitrou715a63b2012-07-21 00:52:06 +02002645 if (e - q < 2) {
Serhiy Storchakac4b82c02013-01-08 23:12:00 +02002646 q -= 2;
2647 if (consumed)
2648 break;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002649 errmsg = "unexpected end of data";
Serhiy Storchakac4b82c02013-01-08 23:12:00 +02002650 startinpos = ((const char *)q)-starts;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002651 endinpos = ((const char *)e)-starts;
2652 goto utf16Error;
2653 }
2654 if (0xD800 <= ch && ch <= 0xDBFF) {
2655 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2656 q += 2;
2657 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002658#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002659 *p++ = ch;
2660 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002661#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002662 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002663#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002664 continue;
2665 }
2666 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002667 errmsg = "illegal UTF-16 surrogate";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002668 startinpos = (((const char *)q)-4)-starts;
2669 endinpos = startinpos+2;
2670 goto utf16Error;
2671 }
2672
Benjamin Peterson857ce152009-01-31 16:29:18 +00002673 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002674 errmsg = "illegal encoding";
2675 startinpos = (((const char *)q)-2)-starts;
2676 endinpos = startinpos+2;
2677 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002678
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002679 utf16Error:
2680 outpos = p-PyUnicode_AS_UNICODE(unicode);
2681 if (unicode_decode_call_errorhandler(
2682 errors, &errorHandler,
2683 "utf16", errmsg,
2684 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2685 &unicode, &outpos, &p))
2686 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002687 }
2688
2689 if (byteorder)
2690 *byteorder = bo;
2691
Walter Dörwald69652032004-09-07 20:24:22 +00002692 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002693 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00002694
Guido van Rossumd57fd912000-03-10 22:53:23 +00002695 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002696 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002697 goto onError;
2698
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002699 Py_XDECREF(errorHandler);
2700 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002701 return (PyObject *)unicode;
2702
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002703 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002704 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002705 Py_XDECREF(errorHandler);
2706 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002707 return NULL;
2708}
2709
Tim Peters772747b2001-08-09 22:21:55 +00002710PyObject *
2711PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002712 Py_ssize_t size,
2713 const char *errors,
2714 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002715{
2716 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002717 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002718 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002719#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002720 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002721#else
2722 const int pairs = 0;
2723#endif
Tim Peters772747b2001-08-09 22:21:55 +00002724 /* Offsets from p for storing byte pairs in the right order. */
2725#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2726 int ihi = 1, ilo = 0;
2727#else
2728 int ihi = 0, ilo = 1;
2729#endif
2730
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002731#define STORECHAR(CH) \
2732 do { \
2733 p[ihi] = ((CH) >> 8) & 0xff; \
2734 p[ilo] = (CH) & 0xff; \
2735 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00002736 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002737
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002738#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002739 for (i = pairs = 0; i < size; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002740 if (s[i] >= 0x10000)
2741 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002742#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002743 /* 2 * (size + pairs + (byteorder == 0)) */
2744 if (size > PY_SSIZE_T_MAX ||
2745 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002746 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002747 nsize = size + pairs + (byteorder == 0);
2748 bytesize = nsize * 2;
2749 if (bytesize / 2 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002750 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002751 v = PyString_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002752 if (v == NULL)
2753 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002754
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002755 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002756 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002757 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002758 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002759 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002760
2761 if (byteorder == -1) {
2762 /* force LE */
2763 ihi = 1;
2764 ilo = 0;
2765 }
2766 else if (byteorder == 1) {
2767 /* force BE */
2768 ihi = 0;
2769 ilo = 1;
2770 }
2771
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002772 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002773 Py_UNICODE ch = *s++;
2774 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002775#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002776 if (ch >= 0x10000) {
2777 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2778 ch = 0xD800 | ((ch-0x10000) >> 10);
2779 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002780#endif
Tim Peters772747b2001-08-09 22:21:55 +00002781 STORECHAR(ch);
2782 if (ch2)
2783 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002784 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002785 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002786#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002787}
2788
2789PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2790{
2791 if (!PyUnicode_Check(unicode)) {
2792 PyErr_BadArgument();
2793 return NULL;
2794 }
2795 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002796 PyUnicode_GET_SIZE(unicode),
2797 NULL,
2798 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002799}
2800
/* --- Unicode Escape Codec ----------------------------------------------- */
2802
Fredrik Lundh06d12682001-01-24 07:59:11 +00002803static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002804
Guido van Rossumd57fd912000-03-10 22:53:23 +00002805PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002806 Py_ssize_t size,
2807 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002808{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002809 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002810 Py_ssize_t startinpos;
2811 Py_ssize_t endinpos;
2812 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002813 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002814 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002815 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002816 char* message;
2817 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002818 PyObject *errorHandler = NULL;
2819 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002820
Guido van Rossumd57fd912000-03-10 22:53:23 +00002821 /* Escaped strings will always be longer than the resulting
2822 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002823 length after conversion to the true value.
2824 (but if the error callback returns a long replacement string
2825 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002826 v = _PyUnicode_New(size);
2827 if (v == NULL)
2828 goto onError;
2829 if (size == 0)
2830 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002831
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002832 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002833 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002834
Guido van Rossumd57fd912000-03-10 22:53:23 +00002835 while (s < end) {
2836 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002837 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002838 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002839
2840 /* Non-escape characters are interpreted as Unicode ordinals */
2841 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002842 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002843 continue;
2844 }
2845
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002846 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002847 /* \ - Escapes */
2848 s++;
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002849 c = *s++;
2850 if (s > end)
2851 c = '\0'; /* Invalid after \ */
2852 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002853
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002854 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002855 case '\n': break;
2856 case '\\': *p++ = '\\'; break;
2857 case '\'': *p++ = '\''; break;
2858 case '\"': *p++ = '\"'; break;
2859 case 'b': *p++ = '\b'; break;
2860 case 'f': *p++ = '\014'; break; /* FF */
2861 case 't': *p++ = '\t'; break;
2862 case 'n': *p++ = '\n'; break;
2863 case 'r': *p++ = '\r'; break;
2864 case 'v': *p++ = '\013'; break; /* VT */
2865 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2866
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002867 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002868 case '0': case '1': case '2': case '3':
2869 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002870 x = s[-1] - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002871 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002872 x = (x<<3) + *s++ - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002873 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002874 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002875 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002876 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002877 break;
2878
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002879 /* hex escapes */
2880 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002881 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002882 digits = 2;
2883 message = "truncated \\xXX escape";
2884 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002885
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002886 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002887 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002888 digits = 4;
2889 message = "truncated \\uXXXX escape";
2890 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002891
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002892 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002893 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002894 digits = 8;
2895 message = "truncated \\UXXXXXXXX escape";
2896 hexescape:
2897 chr = 0;
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002898 if (end - s < digits) {
2899 /* count only hex digits */
2900 for (; s < end; ++s) {
2901 c = (unsigned char)*s;
2902 if (!Py_ISXDIGIT(c))
2903 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002904 }
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002905 goto error;
2906 }
2907 for (; digits--; ++s) {
2908 c = (unsigned char)*s;
2909 if (!Py_ISXDIGIT(c))
2910 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002911 chr = (chr<<4) & ~0xF;
2912 if (c >= '0' && c <= '9')
2913 chr += c - '0';
2914 else if (c >= 'a' && c <= 'f')
2915 chr += 10 + c - 'a';
2916 else
2917 chr += 10 + c - 'A';
2918 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002919 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002920 /* _decoding_error will have already written into the
2921 target buffer. */
2922 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002923 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002924 /* when we get here, chr is a 32-bit unicode character */
2925 if (chr <= 0xffff)
2926 /* UCS-2 character */
2927 *p++ = (Py_UNICODE) chr;
2928 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002929 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002930 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002931#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002932 *p++ = chr;
2933#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002934 chr -= 0x10000L;
2935 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002936 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002937#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002938 } else {
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002939 message = "illegal Unicode character";
2940 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002941 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002942 break;
2943
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002944 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00002945 case 'N':
2946 message = "malformed \\N character escape";
2947 if (ucnhash_CAPI == NULL) {
2948 /* load the unicode data module */
Larry Hastings402b73f2010-03-25 00:54:54 +00002949 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002950 if (ucnhash_CAPI == NULL)
2951 goto ucnhashError;
2952 }
2953 if (*s == '{') {
2954 const char *start = s+1;
2955 /* look for the closing brace */
2956 while (*s != '}' && s < end)
2957 s++;
2958 if (s > start && s < end && *s == '}') {
2959 /* found a name. look it up in the unicode database */
2960 message = "unknown Unicode character name";
2961 s++;
Serhiy Storchaka1d3acd42013-01-21 11:48:24 +02002962 if (s - start - 1 <= INT_MAX &&
2963 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002964 goto store;
2965 }
2966 }
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002967 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002968
2969 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002970 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002971 message = "\\ at end of string";
2972 s--;
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002973 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00002974 }
2975 else {
2976 *p++ = '\\';
2977 *p++ = (unsigned char)s[-1];
2978 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002979 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002980 }
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002981 continue;
2982
2983 error:
2984 endinpos = s-starts;
2985 outpos = p-PyUnicode_AS_UNICODE(v);
2986 if (unicode_decode_call_errorhandler(
2987 errors, &errorHandler,
2988 "unicodeescape", message,
2989 starts, size, &startinpos, &endinpos, &exc, &s,
2990 &v, &outpos, &p))
2991 goto onError;
2992 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002993 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002994 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002995 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002996 Py_XDECREF(errorHandler);
2997 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002998 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002999
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003000 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003001 PyErr_SetString(
3002 PyExc_UnicodeError,
3003 "\\N escapes not supported (can't load unicodedata module)"
3004 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003005 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003006 Py_XDECREF(errorHandler);
3007 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003008 return NULL;
3009
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003010 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003011 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003012 Py_XDECREF(errorHandler);
3013 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003014 return NULL;
3015}
3016
/* unicodeescape_string() (defined below, after the findchar() helper)
   returns a Unicode-Escape string version of the Unicode object.

   If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate.

*/
3023
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00003024Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003025 Py_ssize_t size,
3026 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00003027{
3028 /* like wcschr, but doesn't stop at NULL characters */
3029
3030 while (size-- > 0) {
3031 if (*s == ch)
3032 return s;
3033 s++;
3034 }
3035
3036 return NULL;
3037}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003038
Guido van Rossumd57fd912000-03-10 22:53:23 +00003039static
3040PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003041 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003042 int quotes)
3043{
3044 PyObject *repr;
3045 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003046
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003047 static const char *hexdigit = "0123456789abcdef";
Neal Norwitze7d8be82008-07-31 17:17:14 +00003048#ifdef Py_UNICODE_WIDE
3049 const Py_ssize_t expandsize = 10;
3050#else
3051 const Py_ssize_t expandsize = 6;
3052#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003053
Neal Norwitz17753ec2006-08-21 22:21:19 +00003054 /* XXX(nnorwitz): rather than over-allocating, it would be
3055 better to choose a different scheme. Perhaps scan the
3056 first N-chars of the string and allocate based on that size.
3057 */
3058 /* Initial allocation is based on the longest-possible unichr
3059 escape.
3060
3061 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3062 unichr, so in this case it's the longest unichr escape. In
3063 narrow (UTF-16) builds this is five chars per source unichr
3064 since there are two unichrs in the surrogate pair, so in narrow
3065 (UTF-16) builds it's not the longest unichr escape.
3066
3067 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3068 so in the narrow (UTF-16) build case it's the longest unichr
3069 escape.
3070 */
3071
Neal Norwitze7d8be82008-07-31 17:17:14 +00003072 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003073 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00003074
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003075 repr = PyString_FromStringAndSize(NULL,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003076 2
3077 + expandsize*size
3078 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003079 if (repr == NULL)
3080 return NULL;
3081
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003082 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003083
3084 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003085 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00003086 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00003087 !findchar(s, size, '"')) ? '"' : '\'';
3088 }
3089 while (size-- > 0) {
3090 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003091
Hye-Shik Chang835b2432005-12-17 04:38:31 +00003092 /* Escape quotes and backslashes */
3093 if ((quotes &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003094 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003095 *p++ = '\\';
3096 *p++ = (char) ch;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003097 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003098 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003099
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003100#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003101 /* Map 21-bit characters to '\U00xxxxxx' */
3102 else if (ch >= 0x10000) {
3103 *p++ = '\\';
3104 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003105 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
3106 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
3107 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
3108 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
3109 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
3110 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
3111 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003112 *p++ = hexdigit[ch & 0x0000000F];
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003113 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003114 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003115#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003116 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3117 else if (ch >= 0xD800 && ch < 0xDC00) {
3118 Py_UNICODE ch2;
3119 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003120
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003121 ch2 = *s++;
3122 size--;
Georg Brandle27d0442010-08-01 20:54:30 +00003123 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003124 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3125 *p++ = '\\';
3126 *p++ = 'U';
3127 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
3128 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
3129 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
3130 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
3131 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
3132 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
3133 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
3134 *p++ = hexdigit[ucs & 0x0000000F];
3135 continue;
3136 }
3137 /* Fall through: isolated surrogates are copied as-is */
3138 s--;
3139 size++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003140 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003141#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003142
Guido van Rossumd57fd912000-03-10 22:53:23 +00003143 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003144 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003145 *p++ = '\\';
3146 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003147 *p++ = hexdigit[(ch >> 12) & 0x000F];
3148 *p++ = hexdigit[(ch >> 8) & 0x000F];
3149 *p++ = hexdigit[(ch >> 4) & 0x000F];
3150 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003151 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003152
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003153 /* Map special whitespace to '\t', \n', '\r' */
3154 else if (ch == '\t') {
3155 *p++ = '\\';
3156 *p++ = 't';
3157 }
3158 else if (ch == '\n') {
3159 *p++ = '\\';
3160 *p++ = 'n';
3161 }
3162 else if (ch == '\r') {
3163 *p++ = '\\';
3164 *p++ = 'r';
3165 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003166
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003167 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003168 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003169 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003170 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003171 *p++ = hexdigit[(ch >> 4) & 0x000F];
3172 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003173 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003174
Guido van Rossumd57fd912000-03-10 22:53:23 +00003175 /* Copy everything else as-is */
3176 else
3177 *p++ = (char) ch;
3178 }
3179 if (quotes)
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003180 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003181
3182 *p = '\0';
Benjamin Petersonbea424a2010-04-03 00:57:33 +00003183 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
3184 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003185 return repr;
3186}
3187
3188PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003189 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003190{
3191 return unicodeescape_string(s, size, 0);
3192}
3193
3194PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3195{
3196 if (!PyUnicode_Check(unicode)) {
3197 PyErr_BadArgument();
3198 return NULL;
3199 }
3200 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003201 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003202}
3203
3204/* --- Raw Unicode Escape Codec ------------------------------------------- */
3205
3206PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003207 Py_ssize_t size,
3208 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003209{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003210 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003211 Py_ssize_t startinpos;
3212 Py_ssize_t endinpos;
3213 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003214 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003215 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003216 const char *end;
3217 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003218 PyObject *errorHandler = NULL;
3219 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003220
Guido van Rossumd57fd912000-03-10 22:53:23 +00003221 /* Escaped strings will always be longer than the resulting
3222 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003223 length after conversion to the true value. (But decoding error
3224 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003225 v = _PyUnicode_New(size);
3226 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003227 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003228 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003229 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003230 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003231 end = s + size;
3232 while (s < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003233 unsigned char c;
3234 Py_UCS4 x;
3235 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003236 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003237
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003238 /* Non-escape characters are interpreted as Unicode ordinals */
3239 if (*s != '\\') {
3240 *p++ = (unsigned char)*s++;
3241 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003242 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003243 startinpos = s-starts;
3244
3245 /* \u-escapes are only interpreted iff the number of leading
3246 backslashes if odd */
3247 bs = s;
3248 for (;s < end;) {
3249 if (*s != '\\')
3250 break;
3251 *p++ = (unsigned char)*s++;
3252 }
3253 if (((s - bs) & 1) == 0 ||
3254 s >= end ||
3255 (*s != 'u' && *s != 'U')) {
3256 continue;
3257 }
3258 p--;
3259 count = *s=='u' ? 4 : 8;
3260 s++;
3261
3262 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3263 outpos = p-PyUnicode_AS_UNICODE(v);
3264 for (x = 0, i = 0; i < count; ++i, ++s) {
3265 c = (unsigned char)*s;
3266 if (!isxdigit(c)) {
3267 endinpos = s-starts;
3268 if (unicode_decode_call_errorhandler(
3269 errors, &errorHandler,
3270 "rawunicodeescape", "truncated \\uXXXX",
3271 starts, size, &startinpos, &endinpos, &exc, &s,
3272 &v, &outpos, &p))
3273 goto onError;
3274 goto nextByte;
3275 }
3276 x = (x<<4) & ~0xF;
3277 if (c >= '0' && c <= '9')
3278 x += c - '0';
3279 else if (c >= 'a' && c <= 'f')
3280 x += 10 + c - 'a';
3281 else
3282 x += 10 + c - 'A';
3283 }
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003284 if (x <= 0xffff)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003285 /* UCS-2 character */
3286 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003287 else if (x <= 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003288 /* UCS-4 character. Either store directly, or as
3289 surrogate pair. */
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003290#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003291 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003292#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003293 x -= 0x10000L;
3294 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3295 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003296#endif
3297 } else {
3298 endinpos = s-starts;
3299 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003300 if (unicode_decode_call_errorhandler(
3301 errors, &errorHandler,
3302 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003303 starts, size, &startinpos, &endinpos, &exc, &s,
3304 &v, &outpos, &p))
3305 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003306 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003307 nextByte:
3308 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003309 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00003310 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003311 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003312 Py_XDECREF(errorHandler);
3313 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003314 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003315
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003316 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003317 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003318 Py_XDECREF(errorHandler);
3319 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003320 return NULL;
3321}
3322
3323PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003324 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003325{
3326 PyObject *repr;
3327 char *p;
3328 char *q;
3329
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003330 static const char *hexdigit = "0123456789abcdef";
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003331#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00003332 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003333#else
Neal Norwitze7d8be82008-07-31 17:17:14 +00003334 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003335#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00003336
Neal Norwitze7d8be82008-07-31 17:17:14 +00003337 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003338 return PyErr_NoMemory();
Benjamin Peterson857ce152009-01-31 16:29:18 +00003339
Neal Norwitze7d8be82008-07-31 17:17:14 +00003340 repr = PyString_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003341 if (repr == NULL)
3342 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003343 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003344 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003345
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003346 p = q = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003347 while (size-- > 0) {
3348 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003349#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003350 /* Map 32-bit characters to '\Uxxxxxxxx' */
3351 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003352 *p++ = '\\';
3353 *p++ = 'U';
3354 *p++ = hexdigit[(ch >> 28) & 0xf];
3355 *p++ = hexdigit[(ch >> 24) & 0xf];
3356 *p++ = hexdigit[(ch >> 20) & 0xf];
3357 *p++ = hexdigit[(ch >> 16) & 0xf];
3358 *p++ = hexdigit[(ch >> 12) & 0xf];
3359 *p++ = hexdigit[(ch >> 8) & 0xf];
3360 *p++ = hexdigit[(ch >> 4) & 0xf];
3361 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003362 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003363 else
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003364#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003365 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3366 if (ch >= 0xD800 && ch < 0xDC00) {
3367 Py_UNICODE ch2;
3368 Py_UCS4 ucs;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003369
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003370 ch2 = *s++;
3371 size--;
Georg Brandle27d0442010-08-01 20:54:30 +00003372 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003373 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3374 *p++ = '\\';
3375 *p++ = 'U';
3376 *p++ = hexdigit[(ucs >> 28) & 0xf];
3377 *p++ = hexdigit[(ucs >> 24) & 0xf];
3378 *p++ = hexdigit[(ucs >> 20) & 0xf];
3379 *p++ = hexdigit[(ucs >> 16) & 0xf];
3380 *p++ = hexdigit[(ucs >> 12) & 0xf];
3381 *p++ = hexdigit[(ucs >> 8) & 0xf];
3382 *p++ = hexdigit[(ucs >> 4) & 0xf];
3383 *p++ = hexdigit[ucs & 0xf];
3384 continue;
3385 }
3386 /* Fall through: isolated surrogates are copied as-is */
3387 s--;
3388 size++;
3389 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003390#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003391 /* Map 16-bit characters to '\uxxxx' */
3392 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003393 *p++ = '\\';
3394 *p++ = 'u';
3395 *p++ = hexdigit[(ch >> 12) & 0xf];
3396 *p++ = hexdigit[(ch >> 8) & 0xf];
3397 *p++ = hexdigit[(ch >> 4) & 0xf];
3398 *p++ = hexdigit[ch & 15];
3399 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003400 /* Copy everything else as-is */
3401 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003402 *p++ = (char) ch;
3403 }
3404 *p = '\0';
Benjamin Petersonbea424a2010-04-03 00:57:33 +00003405 if (_PyString_Resize(&repr, p - q))
3406 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003407 return repr;
3408}
3409
3410PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3411{
3412 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003413 PyErr_BadArgument();
3414 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003415 }
3416 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003417 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003418}
3419
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003420/* --- Unicode Internal Codec ------------------------------------------- */
3421
3422PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003423 Py_ssize_t size,
3424 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003425{
3426 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003427 Py_ssize_t startinpos;
3428 Py_ssize_t endinpos;
3429 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003430 PyUnicodeObject *v;
3431 Py_UNICODE *p;
3432 const char *end;
3433 const char *reason;
3434 PyObject *errorHandler = NULL;
3435 PyObject *exc = NULL;
3436
Neal Norwitzd43069c2006-01-08 01:12:10 +00003437#ifdef Py_UNICODE_WIDE
3438 Py_UNICODE unimax = PyUnicode_GetMax();
3439#endif
3440
Armin Rigo7ccbca92006-10-04 12:17:45 +00003441 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003442 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3443 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003444 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003445 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003446 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003447 p = PyUnicode_AS_UNICODE(v);
3448 end = s + size;
3449
3450 while (s < end) {
Serhiy Storchakad5327d92013-02-07 16:23:11 +02003451 if (end-s < Py_UNICODE_SIZE) {
3452 endinpos = end-starts;
3453 reason = "truncated input";
3454 goto error;
3455 }
Neal Norwitz1004a532006-05-15 07:17:23 +00003456 memcpy(p, s, sizeof(Py_UNICODE));
Serhiy Storchakad5327d92013-02-07 16:23:11 +02003457#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003458 /* We have to sanity check the raw data, otherwise doom looms for
3459 some malformed UCS-4 data. */
Serhiy Storchakad5327d92013-02-07 16:23:11 +02003460 if (*p > unimax || *p < 0) {
3461 endinpos = s - starts + Py_UNICODE_SIZE;
3462 reason = "illegal code point (> 0x10FFFF)";
3463 goto error;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003464 }
Serhiy Storchakad5327d92013-02-07 16:23:11 +02003465#endif
3466 p++;
3467 s += Py_UNICODE_SIZE;
3468 continue;
3469
3470 error:
3471 startinpos = s - starts;
3472 outpos = p - PyUnicode_AS_UNICODE(v);
3473 if (unicode_decode_call_errorhandler(
3474 errors, &errorHandler,
3475 "unicode_internal", reason,
3476 starts, size, &startinpos, &endinpos, &exc, &s,
3477 &v, &outpos, &p)) {
3478 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003479 }
3480 }
3481
Martin v. Löwis412fb672006-04-13 06:34:32 +00003482 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003483 goto onError;
3484 Py_XDECREF(errorHandler);
3485 Py_XDECREF(exc);
3486 return (PyObject *)v;
3487
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003488 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003489 Py_XDECREF(v);
3490 Py_XDECREF(errorHandler);
3491 Py_XDECREF(exc);
3492 return NULL;
3493}
3494
Guido van Rossumd57fd912000-03-10 22:53:23 +00003495/* --- Latin-1 Codec ------------------------------------------------------ */
3496
3497PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003498 Py_ssize_t size,
3499 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003500{
3501 PyUnicodeObject *v;
3502 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003503
Guido van Rossumd57fd912000-03-10 22:53:23 +00003504 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003505 if (size == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003506 Py_UNICODE r = *(unsigned char*)s;
3507 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003508 }
3509
Guido van Rossumd57fd912000-03-10 22:53:23 +00003510 v = _PyUnicode_New(size);
3511 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003512 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003513 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003514 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003515 p = PyUnicode_AS_UNICODE(v);
3516 while (size-- > 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003517 *p++ = (unsigned char)*s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003518 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003519
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003520 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003521 Py_XDECREF(v);
3522 return NULL;
3523}
3524
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003525/* create or adjust a UnicodeEncodeError */
3526static void make_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003527 const char *encoding,
3528 const Py_UNICODE *unicode, Py_ssize_t size,
3529 Py_ssize_t startpos, Py_ssize_t endpos,
3530 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003531{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003532 if (*exceptionObject == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003533 *exceptionObject = PyUnicodeEncodeError_Create(
3534 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003535 }
3536 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003537 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3538 goto onError;
3539 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3540 goto onError;
3541 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3542 goto onError;
3543 return;
3544 onError:
Serhiy Storchaka98a97222014-02-09 13:14:04 +02003545 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003546 }
3547}
3548
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003549/* raises a UnicodeEncodeError */
3550static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003551 const char *encoding,
3552 const Py_UNICODE *unicode, Py_ssize_t size,
3553 Py_ssize_t startpos, Py_ssize_t endpos,
3554 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003555{
3556 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003557 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003558 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003559 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003560}
3561
3562/* error handling callback helper:
3563 build arguments, call the callback and check the arguments,
3564 put the result into newpos and return the replacement string, which
3565 has to be freed by the caller */
3566static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003567 PyObject **errorHandler,
3568 const char *encoding, const char *reason,
3569 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3570 Py_ssize_t startpos, Py_ssize_t endpos,
3571 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003572{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003573 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003574
3575 PyObject *restuple;
3576 PyObject *resunicode;
3577
3578 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003579 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003580 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003581 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003582 }
3583
3584 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003585 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003586 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003587 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003588
3589 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003590 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003591 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003592 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003593 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00003594 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003595 Py_DECREF(restuple);
3596 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003597 }
3598 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003599 &resunicode, newpos)) {
3600 Py_DECREF(restuple);
3601 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003602 }
3603 if (*newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003604 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003605 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003606 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3607 Py_DECREF(restuple);
3608 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003609 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003610 Py_INCREF(resunicode);
3611 Py_DECREF(restuple);
3612 return resunicode;
3613}
3614
3615static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003616 Py_ssize_t size,
3617 const char *errors,
3618 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003619{
3620 /* output object */
3621 PyObject *res;
3622 /* pointers to the beginning and end+1 of input */
3623 const Py_UNICODE *startp = p;
3624 const Py_UNICODE *endp = p + size;
3625 /* pointer to the beginning of the unencodable characters */
3626 /* const Py_UNICODE *badp = NULL; */
3627 /* pointer into the output */
3628 char *str;
3629 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003630 Py_ssize_t respos = 0;
3631 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00003632 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3633 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003634 PyObject *errorHandler = NULL;
3635 PyObject *exc = NULL;
3636 /* the following variable is used for caching string comparisons
3637 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3638 int known_errorHandler = -1;
3639
3640 /* allocate enough for a simple encoding without
3641 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003642 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003643 if (res == NULL)
3644 goto onError;
3645 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003646 return res;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003647 str = PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003648 ressize = size;
3649
3650 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003651 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003652
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003653 /* can we encode this? */
3654 if (c<limit) {
3655 /* no overflow check, because we know that the space is enough */
3656 *str++ = (char)c;
3657 ++p;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003658 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003659 else {
3660 Py_ssize_t unicodepos = p-startp;
3661 Py_ssize_t requiredsize;
3662 PyObject *repunicode;
3663 Py_ssize_t repsize;
3664 Py_ssize_t newpos;
3665 Py_ssize_t respos;
3666 Py_UNICODE *uni2;
3667 /* startpos for collecting unencodable chars */
3668 const Py_UNICODE *collstart = p;
3669 const Py_UNICODE *collend = p;
3670 /* find all unecodable characters */
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003671 while ((collend < endp) && ((*collend) >= limit))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003672 ++collend;
3673 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3674 if (known_errorHandler==-1) {
3675 if ((errors==NULL) || (!strcmp(errors, "strict")))
3676 known_errorHandler = 1;
3677 else if (!strcmp(errors, "replace"))
3678 known_errorHandler = 2;
3679 else if (!strcmp(errors, "ignore"))
3680 known_errorHandler = 3;
3681 else if (!strcmp(errors, "xmlcharrefreplace"))
3682 known_errorHandler = 4;
3683 else
3684 known_errorHandler = 0;
3685 }
3686 switch (known_errorHandler) {
3687 case 1: /* strict */
3688 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3689 goto onError;
3690 case 2: /* replace */
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003691 while (collstart++ < collend)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003692 *str++ = '?'; /* fall through */
3693 case 3: /* ignore */
3694 p = collend;
3695 break;
3696 case 4: /* xmlcharrefreplace */
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003697 respos = str - PyString_AS_STRING(res);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003698 /* determine replacement size (temporarily (mis)uses p) */
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003699 requiredsize = respos;
3700 for (p = collstart; p < collend;) {
Serhiy Storchakae822b032013-08-06 16:56:26 +03003701 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003702 Py_ssize_t incr;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003703 if (ch < 10)
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003704 incr = 2+1+1;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003705 else if (ch < 100)
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003706 incr = 2+2+1;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003707 else if (ch < 1000)
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003708 incr = 2+3+1;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003709 else if (ch < 10000)
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003710 incr = 2+4+1;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003711 else if (ch < 100000)
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003712 incr = 2+5+1;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003713 else if (ch < 1000000)
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003714 incr = 2+6+1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003715 else
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003716 incr = 2+7+1;
3717 if (requiredsize > PY_SSIZE_T_MAX - incr)
3718 goto overflow;
3719 requiredsize += incr;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003720 }
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003721 if (requiredsize > PY_SSIZE_T_MAX - (endp - collend))
3722 goto overflow;
3723 requiredsize += endp - collend;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003724 if (requiredsize > ressize) {
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003725 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003726 requiredsize = 2*ressize;
3727 if (_PyString_Resize(&res, requiredsize))
3728 goto onError;
3729 str = PyString_AS_STRING(res) + respos;
3730 ressize = requiredsize;
3731 }
3732 /* generate replacement (temporarily (mis)uses p) */
Serhiy Storchakae822b032013-08-06 16:56:26 +03003733 for (p = collstart; p < collend;) {
3734 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
3735 str += sprintf(str, "&#%d;", (int)ch);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003736 }
3737 p = collend;
3738 break;
3739 default:
3740 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3741 encoding, reason, startp, size, &exc,
3742 collstart-startp, collend-startp, &newpos);
3743 if (repunicode == NULL)
3744 goto onError;
3745 /* need more space? (at least enough for what we have+the
3746 replacement+the rest of the string, so we won't have to
3747 check space for encodable characters) */
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003748 respos = str - PyString_AS_STRING(res);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003749 repsize = PyUnicode_GET_SIZE(repunicode);
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003750 if (respos > PY_SSIZE_T_MAX - repsize)
3751 goto overflow;
3752 requiredsize = respos + repsize;
3753 if (requiredsize > PY_SSIZE_T_MAX - (endp - collend))
3754 goto overflow;
3755 requiredsize += endp - collend;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003756 if (requiredsize > ressize) {
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003757 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003758 requiredsize = 2*ressize;
3759 if (_PyString_Resize(&res, requiredsize)) {
3760 Py_DECREF(repunicode);
3761 goto onError;
3762 }
3763 str = PyString_AS_STRING(res) + respos;
3764 ressize = requiredsize;
3765 }
3766 /* check if there is anything unencodable in the replacement
3767 and copy it to the output */
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003768 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2, ++str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003769 c = *uni2;
3770 if (c >= limit) {
3771 raise_encode_exception(&exc, encoding, startp, size,
3772 unicodepos, unicodepos+1, reason);
3773 Py_DECREF(repunicode);
3774 goto onError;
3775 }
3776 *str = (char)c;
3777 }
3778 p = startp + newpos;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003779 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00003780 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00003781 }
3782 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003783 /* Resize if we allocated to much */
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003784 respos = str - PyString_AS_STRING(res);
3785 if (respos < ressize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003786 /* If this falls res will be NULL */
3787 _PyString_Resize(&res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003788 Py_XDECREF(errorHandler);
3789 Py_XDECREF(exc);
3790 return res;
3791
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003792 overflow:
3793 PyErr_SetString(PyExc_OverflowError,
3794 "encoded result is too long for a Python string");
3795
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003796 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003797 Py_XDECREF(res);
3798 Py_XDECREF(errorHandler);
3799 Py_XDECREF(exc);
3800 return NULL;
3801}
3802
Guido van Rossumd57fd912000-03-10 22:53:23 +00003803PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003804 Py_ssize_t size,
3805 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003806{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003807 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003808}
3809
3810PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3811{
3812 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003813 PyErr_BadArgument();
3814 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003815 }
3816 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003817 PyUnicode_GET_SIZE(unicode),
3818 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003819}
3820
3821/* --- 7-bit ASCII Codec -------------------------------------------------- */
3822
Guido van Rossumd57fd912000-03-10 22:53:23 +00003823PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003824 Py_ssize_t size,
3825 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003826{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003827 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003828 PyUnicodeObject *v;
3829 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003830 Py_ssize_t startinpos;
3831 Py_ssize_t endinpos;
3832 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003833 const char *e;
3834 PyObject *errorHandler = NULL;
3835 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003836
Guido van Rossumd57fd912000-03-10 22:53:23 +00003837 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003838 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003839 Py_UNICODE r = *(unsigned char*)s;
3840 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003841 }
Tim Petersced69f82003-09-16 20:30:58 +00003842
Guido van Rossumd57fd912000-03-10 22:53:23 +00003843 v = _PyUnicode_New(size);
3844 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003845 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003846 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003847 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003848 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003849 e = s + size;
3850 while (s < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003851 register unsigned char c = (unsigned char)*s;
3852 if (c < 128) {
3853 *p++ = c;
3854 ++s;
3855 }
3856 else {
3857 startinpos = s-starts;
3858 endinpos = startinpos + 1;
3859 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3860 if (unicode_decode_call_errorhandler(
3861 errors, &errorHandler,
3862 "ascii", "ordinal not in range(128)",
3863 starts, size, &startinpos, &endinpos, &exc, &s,
3864 &v, &outpos, &p))
3865 goto onError;
3866 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003867 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003868 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003869 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3870 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003871 Py_XDECREF(errorHandler);
3872 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003873 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003874
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003875 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003876 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003877 Py_XDECREF(errorHandler);
3878 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003879 return NULL;
3880}
3881
Guido van Rossumd57fd912000-03-10 22:53:23 +00003882PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003883 Py_ssize_t size,
3884 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003885{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003886 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003887}
3888
3889PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3890{
3891 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003892 PyErr_BadArgument();
3893 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003894 }
3895 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003896 PyUnicode_GET_SIZE(unicode),
3897 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003898}
3899
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003900#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003901
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003902/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003903
Hirokazu Yamamoto52a34922009-03-21 10:32:52 +00003904#if SIZEOF_INT < SIZEOF_SIZE_T
Martin v. Löwisd8251432006-06-14 05:21:04 +00003905#define NEED_RETRY
3906#endif
3907
3908/* XXX This code is limited to "true" double-byte encodings, as
3909 a) it assumes an incomplete character consists of a single byte, and
3910 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003911 encodings, see IsDBCSLeadByteEx documentation. */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003912
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.  A byte that IsDBCSLeadByte() accepts is only
   counted when CharPrev() shows it does not look like the trail half of
   the preceding character. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3923
3924/*
3925 * Decode MBCS string into unicode object. If 'final' is set, converts
3926 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3927 */
3928static int decode_mbcs(PyUnicodeObject **v,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003929 const char *s, /* MBCS string */
3930 int size, /* sizeof MBCS string */
3931 int final)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003932{
3933 Py_UNICODE *p;
3934 Py_ssize_t n = 0;
3935 int usize = 0;
3936
3937 assert(size >= 0);
3938
3939 /* Skip trailing lead-byte unless 'final' is set */
3940 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003941 --size;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003942
3943 /* First get the size of the result */
3944 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003945 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3946 if (usize == 0) {
3947 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3948 return -1;
3949 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003950 }
3951
3952 if (*v == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003953 /* Create unicode object */
3954 *v = _PyUnicode_New(usize);
3955 if (*v == NULL)
3956 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003957 }
3958 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003959 /* Extend unicode object */
3960 n = PyUnicode_GET_SIZE(*v);
3961 if (_PyUnicode_Resize(v, n + usize) < 0)
3962 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003963 }
3964
3965 /* Do the conversion */
3966 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003967 p = PyUnicode_AS_UNICODE(*v) + n;
3968 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3969 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3970 return -1;
3971 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003972 }
3973
3974 return size;
3975}
3976
3977PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003978 Py_ssize_t size,
3979 const char *errors,
3980 Py_ssize_t *consumed)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003981{
3982 PyUnicodeObject *v = NULL;
3983 int done;
3984
3985 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003986 *consumed = 0;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003987
3988#ifdef NEED_RETRY
3989 retry:
3990 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003991 done = decode_mbcs(&v, s, INT_MAX, 0);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003992 else
3993#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003994 done = decode_mbcs(&v, s, (int)size, !consumed);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003995
3996 if (done < 0) {
3997 Py_XDECREF(v);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003998 return NULL;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003999 }
4000
4001 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004002 *consumed += done;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004003
4004#ifdef NEED_RETRY
4005 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004006 s += done;
4007 size -= done;
4008 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004009 }
4010#endif
4011
4012 return (PyObject *)v;
4013}
4014
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004015PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004016 Py_ssize_t size,
4017 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004018{
Martin v. Löwisd8251432006-06-14 05:21:04 +00004019 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4020}
4021
4022/*
4023 * Convert unicode into string object (MBCS).
4024 * Returns 0 if succeed, -1 otherwise.
4025 */
4026static int encode_mbcs(PyObject **repr,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004027 const Py_UNICODE *p, /* unicode */
4028 int size) /* size of unicode */
Martin v. Löwisd8251432006-06-14 05:21:04 +00004029{
4030 int mbcssize = 0;
4031 Py_ssize_t n = 0;
4032
4033 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004034
4035 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00004036 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004037 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4038 if (mbcssize == 0) {
4039 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4040 return -1;
4041 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004042 }
4043
Martin v. Löwisd8251432006-06-14 05:21:04 +00004044 if (*repr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004045 /* Create string object */
4046 *repr = PyString_FromStringAndSize(NULL, mbcssize);
4047 if (*repr == NULL)
4048 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004049 }
4050 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004051 /* Extend string object */
4052 n = PyString_Size(*repr);
4053 if (_PyString_Resize(repr, n + mbcssize) < 0)
4054 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004055 }
4056
4057 /* Do the conversion */
4058 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004059 char *s = PyString_AS_STRING(*repr) + n;
4060 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4061 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4062 return -1;
4063 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00004064 }
4065
4066 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004067}
4068
4069PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004070 Py_ssize_t size,
4071 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004072{
Martin v. Löwisd8251432006-06-14 05:21:04 +00004073 PyObject *repr = NULL;
4074 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004075
Martin v. Löwisd8251432006-06-14 05:21:04 +00004076#ifdef NEED_RETRY
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004077 retry:
Martin v. Löwisd8251432006-06-14 05:21:04 +00004078 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004079 ret = encode_mbcs(&repr, p, INT_MAX);
Martin v. Löwisd8251432006-06-14 05:21:04 +00004080 else
4081#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004082 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004083
Martin v. Löwisd8251432006-06-14 05:21:04 +00004084 if (ret < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004085 Py_XDECREF(repr);
4086 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004087 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00004088
4089#ifdef NEED_RETRY
4090 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004091 p += INT_MAX;
4092 size -= INT_MAX;
4093 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004094 }
4095#endif
4096
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004097 return repr;
4098}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004099
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004100PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4101{
4102 if (!PyUnicode_Check(unicode)) {
4103 PyErr_BadArgument();
4104 return NULL;
4105 }
4106 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004107 PyUnicode_GET_SIZE(unicode),
4108 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004109}
4110
Martin v. Löwisd8251432006-06-14 05:21:04 +00004111#undef NEED_RETRY
4112
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004113#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004114
Guido van Rossumd57fd912000-03-10 22:53:23 +00004115/* --- Character Mapping Codec -------------------------------------------- */
4116
Guido van Rossumd57fd912000-03-10 22:53:23 +00004117PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004118 Py_ssize_t size,
4119 PyObject *mapping,
4120 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004121{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004122 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004123 Py_ssize_t startinpos;
4124 Py_ssize_t endinpos;
4125 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004126 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004127 PyUnicodeObject *v;
4128 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004129 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004130 PyObject *errorHandler = NULL;
4131 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004132 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004133 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004134
Guido van Rossumd57fd912000-03-10 22:53:23 +00004135 /* Default to Latin-1 */
4136 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004137 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004138
4139 v = _PyUnicode_New(size);
4140 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004141 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004142 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004143 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004144 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004145 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004146 if (PyUnicode_CheckExact(mapping)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004147 mapstring = PyUnicode_AS_UNICODE(mapping);
4148 maplen = PyUnicode_GET_SIZE(mapping);
4149 while (s < e) {
4150 unsigned char ch = *s;
4151 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004152
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004153 if (ch < maplen)
4154 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004155
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004156 if (x == 0xfffe) {
4157 /* undefined mapping */
4158 outpos = p-PyUnicode_AS_UNICODE(v);
4159 startinpos = s-starts;
4160 endinpos = startinpos+1;
4161 if (unicode_decode_call_errorhandler(
4162 errors, &errorHandler,
4163 "charmap", "character maps to <undefined>",
4164 starts, size, &startinpos, &endinpos, &exc, &s,
4165 &v, &outpos, &p)) {
4166 goto onError;
4167 }
4168 continue;
4169 }
4170 *p++ = x;
4171 ++s;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004172 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004173 }
4174 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004175 while (s < e) {
4176 unsigned char ch = *s;
4177 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004178
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004179 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4180 w = PyInt_FromLong((long)ch);
4181 if (w == NULL)
4182 goto onError;
4183 x = PyObject_GetItem(mapping, w);
4184 Py_DECREF(w);
4185 if (x == NULL) {
4186 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4187 /* No mapping found means: mapping is undefined. */
4188 PyErr_Clear();
Serhiy Storchaka95997452013-01-15 14:42:59 +02004189 goto Undefined;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004190 } else
4191 goto onError;
4192 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004193
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004194 /* Apply mapping */
Serhiy Storchaka95997452013-01-15 14:42:59 +02004195 if (x == Py_None)
4196 goto Undefined;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004197 if (PyInt_Check(x)) {
4198 long value = PyInt_AS_LONG(x);
Serhiy Storchaka95997452013-01-15 14:42:59 +02004199 if (value == 0xFFFE)
4200 goto Undefined;
Antoine Pitroue3ae3212012-11-17 21:14:58 +01004201 if (value < 0 || value > 0x10FFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004202 PyErr_SetString(PyExc_TypeError,
Antoine Pitroue3ae3212012-11-17 21:14:58 +01004203 "character mapping must be in range(0x110000)");
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004204 Py_DECREF(x);
4205 goto onError;
4206 }
Antoine Pitroue3ae3212012-11-17 21:14:58 +01004207
4208#ifndef Py_UNICODE_WIDE
4209 if (value > 0xFFFF) {
4210 /* see the code for 1-n mapping below */
4211 if (extrachars < 2) {
4212 /* resize first */
4213 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4214 Py_ssize_t needed = 10 - extrachars;
4215 extrachars += needed;
4216 /* XXX overflow detection missing */
4217 if (_PyUnicode_Resize(&v,
4218 PyUnicode_GET_SIZE(v) + needed) < 0) {
4219 Py_DECREF(x);
4220 goto onError;
4221 }
4222 p = PyUnicode_AS_UNICODE(v) + oldpos;
4223 }
4224 value -= 0x10000;
4225 *p++ = 0xD800 | (value >> 10);
4226 *p++ = 0xDC00 | (value & 0x3FF);
4227 extrachars -= 2;
4228 }
4229 else
4230#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004231 *p++ = (Py_UNICODE)value;
4232 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004233 else if (PyUnicode_Check(x)) {
4234 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004235
Serhiy Storchaka95997452013-01-15 14:42:59 +02004236 if (targetsize == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004237 /* 1-1 mapping */
Serhiy Storchaka95997452013-01-15 14:42:59 +02004238 Py_UNICODE value = *PyUnicode_AS_UNICODE(x);
4239 if (value == 0xFFFE)
4240 goto Undefined;
4241 *p++ = value;
4242 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004243 else if (targetsize > 1) {
4244 /* 1-n mapping */
4245 if (targetsize > extrachars) {
4246 /* resize first */
4247 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4248 Py_ssize_t needed = (targetsize - extrachars) + \
4249 (targetsize << 2);
4250 extrachars += needed;
4251 /* XXX overflow detection missing */
4252 if (_PyUnicode_Resize(&v,
4253 PyUnicode_GET_SIZE(v) + needed) < 0) {
4254 Py_DECREF(x);
4255 goto onError;
4256 }
4257 p = PyUnicode_AS_UNICODE(v) + oldpos;
4258 }
4259 Py_UNICODE_COPY(p,
4260 PyUnicode_AS_UNICODE(x),
4261 targetsize);
4262 p += targetsize;
4263 extrachars -= targetsize;
4264 }
4265 /* 1-0 mapping: skip the character */
4266 }
4267 else {
4268 /* wrong return value */
4269 PyErr_SetString(PyExc_TypeError,
4270 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004271 Py_DECREF(x);
4272 goto onError;
4273 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004274 Py_DECREF(x);
4275 ++s;
Serhiy Storchaka95997452013-01-15 14:42:59 +02004276 continue;
4277Undefined:
4278 /* undefined mapping */
4279 Py_XDECREF(x);
4280 outpos = p-PyUnicode_AS_UNICODE(v);
4281 startinpos = s-starts;
4282 endinpos = startinpos+1;
4283 if (unicode_decode_call_errorhandler(
4284 errors, &errorHandler,
4285 "charmap", "character maps to <undefined>",
4286 starts, size, &startinpos, &endinpos, &exc, &s,
4287 &v, &outpos, &p)) {
4288 goto onError;
4289 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004290 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004291 }
4292 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004293 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4294 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004295 Py_XDECREF(errorHandler);
4296 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004297 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004298
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004299 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004300 Py_XDECREF(errorHandler);
4301 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004302 Py_XDECREF(v);
4303 return NULL;
4304}
4305
Martin v. Löwis3f767792006-06-04 19:36:28 +00004306/* Charmap encoding: the lookup table */
4307
4308struct encoding_map{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004309 PyObject_HEAD
4310 unsigned char level1[32];
4311 int count2, count3;
4312 unsigned char level23[1];
Martin v. Löwis3f767792006-06-04 19:36:28 +00004313};
4314
4315static PyObject*
4316encoding_map_size(PyObject *obj, PyObject* args)
4317{
4318 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004319 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Martin v. Löwis3f767792006-06-04 19:36:28 +00004320 128*map->count3);
4321}
4322
4323static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004324 {"size", encoding_map_size, METH_NOARGS,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004325 PyDoc_STR("Return the size (in bytes) of this object") },
4326 { 0 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004327};
4328
4329static void
4330encoding_map_dealloc(PyObject* o)
4331{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004332 PyObject_FREE(o);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004333}
4334
4335static PyTypeObject EncodingMapType = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004336 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004337 "EncodingMap", /*tp_name*/
4338 sizeof(struct encoding_map), /*tp_basicsize*/
4339 0, /*tp_itemsize*/
4340 /* methods */
4341 encoding_map_dealloc, /*tp_dealloc*/
4342 0, /*tp_print*/
4343 0, /*tp_getattr*/
4344 0, /*tp_setattr*/
4345 0, /*tp_compare*/
4346 0, /*tp_repr*/
4347 0, /*tp_as_number*/
4348 0, /*tp_as_sequence*/
4349 0, /*tp_as_mapping*/
4350 0, /*tp_hash*/
4351 0, /*tp_call*/
4352 0, /*tp_str*/
4353 0, /*tp_getattro*/
4354 0, /*tp_setattro*/
4355 0, /*tp_as_buffer*/
4356 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4357 0, /*tp_doc*/
4358 0, /*tp_traverse*/
4359 0, /*tp_clear*/
4360 0, /*tp_richcompare*/
4361 0, /*tp_weaklistoffset*/
4362 0, /*tp_iter*/
4363 0, /*tp_iternext*/
4364 encoding_map_methods, /*tp_methods*/
4365 0, /*tp_members*/
4366 0, /*tp_getset*/
4367 0, /*tp_base*/
4368 0, /*tp_dict*/
4369 0, /*tp_descr_get*/
4370 0, /*tp_descr_set*/
4371 0, /*tp_dictoffset*/
4372 0, /*tp_init*/
4373 0, /*tp_alloc*/
4374 0, /*tp_new*/
4375 0, /*tp_free*/
4376 0, /*tp_is_gc*/
Martin v. Löwis3f767792006-06-04 19:36:28 +00004377};
4378
4379PyObject*
4380PyUnicode_BuildEncodingMap(PyObject* string)
4381{
4382 Py_UNICODE *decode;
4383 PyObject *result;
4384 struct encoding_map *mresult;
4385 int i;
4386 int need_dict = 0;
4387 unsigned char level1[32];
4388 unsigned char level2[512];
4389 unsigned char *mlevel1, *mlevel2, *mlevel3;
4390 int count2 = 0, count3 = 0;
4391
4392 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4393 PyErr_BadArgument();
4394 return NULL;
4395 }
4396 decode = PyUnicode_AS_UNICODE(string);
4397 memset(level1, 0xFF, sizeof level1);
4398 memset(level2, 0xFF, sizeof level2);
4399
4400 /* If there isn't a one-to-one mapping of NULL to \0,
4401 or if there are non-BMP characters, we need to use
4402 a mapping dictionary. */
4403 if (decode[0] != 0)
4404 need_dict = 1;
4405 for (i = 1; i < 256; i++) {
4406 int l1, l2;
4407 if (decode[i] == 0
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004408#ifdef Py_UNICODE_WIDE
Martin v. Löwis3f767792006-06-04 19:36:28 +00004409 || decode[i] > 0xFFFF
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004410#endif
4411 ) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004412 need_dict = 1;
4413 break;
4414 }
4415 if (decode[i] == 0xFFFE)
4416 /* unmapped character */
4417 continue;
4418 l1 = decode[i] >> 11;
4419 l2 = decode[i] >> 7;
4420 if (level1[l1] == 0xFF)
4421 level1[l1] = count2++;
4422 if (level2[l2] == 0xFF)
Benjamin Peterson857ce152009-01-31 16:29:18 +00004423 level2[l2] = count3++;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004424 }
4425
4426 if (count2 >= 0xFF || count3 >= 0xFF)
4427 need_dict = 1;
4428
4429 if (need_dict) {
4430 PyObject *result = PyDict_New();
4431 PyObject *key, *value;
4432 if (!result)
4433 return NULL;
4434 for (i = 0; i < 256; i++) {
Brett Cannona7f13ee2010-05-04 01:16:51 +00004435 value = NULL;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004436 key = PyInt_FromLong(decode[i]);
4437 value = PyInt_FromLong(i);
4438 if (!key || !value)
4439 goto failed1;
4440 if (PyDict_SetItem(result, key, value) == -1)
4441 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00004442 Py_DECREF(key);
4443 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004444 }
4445 return result;
4446 failed1:
4447 Py_XDECREF(key);
4448 Py_XDECREF(value);
4449 Py_DECREF(result);
4450 return NULL;
4451 }
4452
4453 /* Create a three-level trie */
4454 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4455 16*count2 + 128*count3 - 1);
4456 if (!result)
4457 return PyErr_NoMemory();
4458 PyObject_Init(result, &EncodingMapType);
4459 mresult = (struct encoding_map*)result;
4460 mresult->count2 = count2;
4461 mresult->count3 = count3;
4462 mlevel1 = mresult->level1;
4463 mlevel2 = mresult->level23;
4464 mlevel3 = mresult->level23 + 16*count2;
4465 memcpy(mlevel1, level1, 32);
4466 memset(mlevel2, 0xFF, 16*count2);
4467 memset(mlevel3, 0, 128*count3);
4468 count3 = 0;
4469 for (i = 1; i < 256; i++) {
4470 int o1, o2, o3, i2, i3;
4471 if (decode[i] == 0xFFFE)
4472 /* unmapped character */
4473 continue;
4474 o1 = decode[i]>>11;
4475 o2 = (decode[i]>>7) & 0xF;
4476 i2 = 16*mlevel1[o1] + o2;
4477 if (mlevel2[i2] == 0xFF)
4478 mlevel2[i2] = count3++;
4479 o3 = decode[i] & 0x7F;
4480 i3 = 128*mlevel2[i2] + o3;
4481 mlevel3[i3] = i;
4482 }
4483 return result;
4484}
4485
4486static int
4487encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4488{
4489 struct encoding_map *map = (struct encoding_map*)mapping;
4490 int l1 = c>>11;
4491 int l2 = (c>>7) & 0xF;
4492 int l3 = c & 0x7F;
4493 int i;
4494
4495#ifdef Py_UNICODE_WIDE
4496 if (c > 0xFFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004497 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004498 }
4499#endif
4500 if (c == 0)
4501 return 0;
4502 /* level 1*/
4503 i = map->level1[l1];
4504 if (i == 0xFF) {
4505 return -1;
4506 }
4507 /* level 2*/
4508 i = map->level23[16*i+l2];
4509 if (i == 0xFF) {
4510 return -1;
4511 }
4512 /* level 3 */
4513 i = map->level23[16*map->count2 + 128*i + l3];
4514 if (i == 0) {
4515 return -1;
4516 }
4517 return i;
4518}
4519
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004520/* Lookup the character ch in the mapping. If the character
4521 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004522 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004523static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004524{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004525 PyObject *w = PyInt_FromLong((long)c);
4526 PyObject *x;
4527
4528 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004529 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004530 x = PyObject_GetItem(mapping, w);
4531 Py_DECREF(w);
4532 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004533 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4534 /* No mapping found means: mapping is undefined. */
4535 PyErr_Clear();
4536 x = Py_None;
4537 Py_INCREF(x);
4538 return x;
4539 } else
4540 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004541 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004542 else if (x == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004543 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004544 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004545 long value = PyInt_AS_LONG(x);
4546 if (value < 0 || value > 255) {
4547 PyErr_SetString(PyExc_TypeError,
4548 "character mapping must be in range(256)");
4549 Py_DECREF(x);
4550 return NULL;
4551 }
4552 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004553 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004554 else if (PyString_Check(x))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004555 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004556 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004557 /* wrong return value */
4558 PyErr_SetString(PyExc_TypeError,
4559 "character mapping must return integer, None or str");
4560 Py_DECREF(x);
4561 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004562 }
4563}
4564
Martin v. Löwis3f767792006-06-04 19:36:28 +00004565static int
4566charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4567{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004568 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4569 /* exponentially overallocate to minimize reallocations */
4570 if (requiredsize < 2*outsize)
4571 requiredsize = 2*outsize;
4572 if (_PyString_Resize(outobj, requiredsize)) {
4573 return 0;
4574 }
4575 return 1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004576}
4577
/* Outcome of encoding a single character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004581/* lookup the character, put the result in the output string and adjust
4582 various state variables. Reallocate the output string if not enough
4583 space is available. Return a new reference to the object that
4584 was put in the output buffer, or Py_None, if the mapping was undefined
4585 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004586 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004587static
Martin v. Löwis3f767792006-06-04 19:36:28 +00004588charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004589 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004590{
Martin v. Löwis3f767792006-06-04 19:36:28 +00004591 PyObject *rep;
4592 char *outstart;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004593 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004594
Christian Heimese93237d2007-12-19 02:37:44 +00004595 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004596 int res = encoding_map_lookup(c, mapping);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004597 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004598 if (res == -1)
4599 return enc_FAILED;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004600 if (outsize<requiredsize)
4601 if (!charmapencode_resize(outobj, outpos, requiredsize))
4602 return enc_EXCEPTION;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004603 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004604 outstart[(*outpos)++] = (char)res;
4605 return enc_SUCCESS;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004606 }
4607
4608 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004609 if (rep==NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004610 return enc_EXCEPTION;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004611 else if (rep==Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004612 Py_DECREF(rep);
4613 return enc_FAILED;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004614 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004615 if (PyInt_Check(rep)) {
4616 Py_ssize_t requiredsize = *outpos+1;
4617 if (outsize<requiredsize)
4618 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4619 Py_DECREF(rep);
4620 return enc_EXCEPTION;
4621 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004622 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004623 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004624 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004625 else {
4626 const char *repchars = PyString_AS_STRING(rep);
4627 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4628 Py_ssize_t requiredsize = *outpos+repsize;
4629 if (outsize<requiredsize)
4630 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4631 Py_DECREF(rep);
4632 return enc_EXCEPTION;
4633 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004634 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004635 memcpy(outstart + *outpos, repchars, repsize);
4636 *outpos += repsize;
4637 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004638 }
Georg Brandl9f167602006-06-04 21:46:16 +00004639 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004640 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004641}
4642
4643/* handle an error in PyUnicode_EncodeCharmap
4644 Return 0 on success, -1 on error */
4645static
4646int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004647 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004648 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004649 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004650 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004651{
4652 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004653 Py_ssize_t repsize;
4654 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004655 Py_UNICODE *uni2;
4656 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004657 Py_ssize_t collstartpos = *inpos;
4658 Py_ssize_t collendpos = *inpos+1;
4659 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004660 char *encoding = "charmap";
4661 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00004662 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004663
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004664 /* find all unencodable characters */
4665 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004666 PyObject *rep;
Christian Heimese93237d2007-12-19 02:37:44 +00004667 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004668 int res = encoding_map_lookup(p[collendpos], mapping);
4669 if (res != -1)
4670 break;
4671 ++collendpos;
4672 continue;
4673 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004674
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004675 rep = charmapencode_lookup(p[collendpos], mapping);
4676 if (rep==NULL)
4677 return -1;
4678 else if (rep!=Py_None) {
4679 Py_DECREF(rep);
4680 break;
4681 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004682 Py_DECREF(rep);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004683 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004684 }
4685 /* cache callback name lookup
4686 * (if not done yet, i.e. it's the first error) */
4687 if (*known_errorHandler==-1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004688 if ((errors==NULL) || (!strcmp(errors, "strict")))
4689 *known_errorHandler = 1;
4690 else if (!strcmp(errors, "replace"))
4691 *known_errorHandler = 2;
4692 else if (!strcmp(errors, "ignore"))
4693 *known_errorHandler = 3;
4694 else if (!strcmp(errors, "xmlcharrefreplace"))
4695 *known_errorHandler = 4;
4696 else
4697 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004698 }
4699 switch (*known_errorHandler) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004700 case 1: /* strict */
4701 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4702 return -1;
4703 case 2: /* replace */
4704 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004705 x = charmapencode_output('?', mapping, res, respos);
4706 if (x==enc_EXCEPTION) {
4707 return -1;
4708 }
4709 else if (x==enc_FAILED) {
4710 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4711 return -1;
4712 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004713 }
4714 /* fall through */
4715 case 3: /* ignore */
4716 *inpos = collendpos;
4717 break;
4718 case 4: /* xmlcharrefreplace */
Serhiy Storchakae822b032013-08-06 16:56:26 +03004719 /* generate replacement */
4720 for (collpos = collstartpos; collpos < collendpos;) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004721 char buffer[2+29+1+1];
4722 char *cp;
Serhiy Storchakae822b032013-08-06 16:56:26 +03004723 Py_UCS4 ch = p[collpos++];
4724#ifndef Py_UNICODE_WIDE
4725 if ((0xD800 <= ch && ch <= 0xDBFF) &&
4726 (collpos < collendpos) &&
4727 (0xDC00 <= p[collpos] && p[collpos] <= 0xDFFF)) {
4728 ch = ((((ch & 0x03FF) << 10) |
4729 ((Py_UCS4)p[collpos++] & 0x03FF)) + 0x10000);
4730 }
4731#endif
4732 sprintf(buffer, "&#%d;", (int)ch);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004733 for (cp = buffer; *cp; ++cp) {
4734 x = charmapencode_output(*cp, mapping, res, respos);
4735 if (x==enc_EXCEPTION)
4736 return -1;
4737 else if (x==enc_FAILED) {
4738 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4739 return -1;
4740 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004741 }
4742 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004743 *inpos = collendpos;
4744 break;
4745 default:
4746 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004747 encoding, reason, p, size, exceptionObject,
4748 collstartpos, collendpos, &newpos);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004749 if (repunicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004750 return -1;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004751 /* generate replacement */
4752 repsize = PyUnicode_GET_SIZE(repunicode);
4753 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004754 x = charmapencode_output(*uni2, mapping, res, respos);
4755 if (x==enc_EXCEPTION) {
4756 return -1;
4757 }
4758 else if (x==enc_FAILED) {
4759 Py_DECREF(repunicode);
4760 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4761 return -1;
4762 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004763 }
4764 *inpos = newpos;
4765 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004766 }
4767 return 0;
4768}
4769
Guido van Rossumd57fd912000-03-10 22:53:23 +00004770PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004771 Py_ssize_t size,
4772 PyObject *mapping,
4773 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004774{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004775 /* output object */
4776 PyObject *res = NULL;
4777 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004778 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004779 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004780 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004781 PyObject *errorHandler = NULL;
4782 PyObject *exc = NULL;
4783 /* the following variable is used for caching string comparisons
4784 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4785 * 3=ignore, 4=xmlcharrefreplace */
4786 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004787
4788 /* Default to Latin-1 */
4789 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004790 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004791
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004792 /* allocate enough for a simple encoding without
4793 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004794 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004795 if (res == NULL)
4796 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004797 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004798 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004799
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004800 while (inpos<size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004801 /* try to encode it */
4802 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4803 if (x==enc_EXCEPTION) /* error */
4804 goto onError;
4805 if (x==enc_FAILED) { /* unencodable character */
4806 if (charmap_encoding_error(p, size, &inpos, mapping,
4807 &exc,
4808 &known_errorHandler, &errorHandler, errors,
4809 &res, &respos)) {
4810 goto onError;
4811 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004812 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004813 else
4814 /* done with this character => adjust input position */
4815 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004816 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004817
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004818 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004819 if (respos<PyString_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004820 if (_PyString_Resize(&res, respos))
4821 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004822 }
4823 Py_XDECREF(exc);
4824 Py_XDECREF(errorHandler);
4825 return res;
4826
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004827 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004828 Py_XDECREF(res);
4829 Py_XDECREF(exc);
4830 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004831 return NULL;
4832}
4833
4834PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004835 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004836{
4837 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004838 PyErr_BadArgument();
4839 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004840 }
4841 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004842 PyUnicode_GET_SIZE(unicode),
4843 mapping,
4844 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004845}
4846
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004847/* create or adjust a UnicodeTranslateError */
4848static void make_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004849 const Py_UNICODE *unicode, Py_ssize_t size,
4850 Py_ssize_t startpos, Py_ssize_t endpos,
4851 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004852{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004853 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004854 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004855 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004856 }
4857 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004858 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4859 goto onError;
4860 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4861 goto onError;
4862 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4863 goto onError;
4864 return;
4865 onError:
Serhiy Storchaka98a97222014-02-09 13:14:04 +02004866 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004867 }
4868}
4869
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004870/* raises a UnicodeTranslateError */
4871static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004872 const Py_UNICODE *unicode, Py_ssize_t size,
4873 Py_ssize_t startpos, Py_ssize_t endpos,
4874 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004875{
4876 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004877 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004878 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004879 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004880}
4881
4882/* error handling callback helper:
4883 build arguments, call the callback and check the arguments,
4884 put the result into newpos and return the replacement string, which
4885 has to be freed by the caller */
4886static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004887 PyObject **errorHandler,
4888 const char *reason,
4889 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4890 Py_ssize_t startpos, Py_ssize_t endpos,
4891 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004892{
Martin v. Löwis412fb672006-04-13 06:34:32 +00004893 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004894
Martin v. Löwis412fb672006-04-13 06:34:32 +00004895 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004896 PyObject *restuple;
4897 PyObject *resunicode;
4898
4899 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004900 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004901 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004902 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004903 }
4904
4905 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004906 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004907 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004908 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004909
4910 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004911 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004912 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004913 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004914 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00004915 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004916 Py_DECREF(restuple);
4917 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004918 }
4919 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004920 &resunicode, &i_newpos)) {
4921 Py_DECREF(restuple);
4922 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004923 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004924 if (i_newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004925 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004926 else
4927 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004928 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004929 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4930 Py_DECREF(restuple);
4931 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004932 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004933 Py_INCREF(resunicode);
4934 Py_DECREF(restuple);
4935 return resunicode;
4936}
4937
4938/* Lookup the character ch in the mapping and put the result in result,
4939 which must be decrefed by the caller.
4940 Return 0 on success, -1 on error */
4941static
4942int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4943{
4944 PyObject *w = PyInt_FromLong((long)c);
4945 PyObject *x;
4946
4947 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004948 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004949 x = PyObject_GetItem(mapping, w);
4950 Py_DECREF(w);
4951 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004952 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4953 /* No mapping found means: use 1:1 mapping. */
4954 PyErr_Clear();
4955 *result = NULL;
4956 return 0;
4957 } else
4958 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004959 }
4960 else if (x == Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004961 *result = x;
4962 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004963 }
4964 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004965 long value = PyInt_AS_LONG(x);
4966 long max = PyUnicode_GetMax();
4967 if (value < 0 || value > max) {
4968 PyErr_Format(PyExc_TypeError,
4969 "character mapping must be in range(0x%lx)", max+1);
4970 Py_DECREF(x);
4971 return -1;
4972 }
4973 *result = x;
4974 return 0;
4975 }
4976 else if (PyUnicode_Check(x)) {
4977 *result = x;
4978 return 0;
4979 }
4980 else {
4981 /* wrong return value */
4982 PyErr_SetString(PyExc_TypeError,
4983 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004984 Py_DECREF(x);
4985 return -1;
4986 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004987}
4988/* ensure that *outobj is at least requiredsize characters long,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004989 if not reallocate and adjust various state variables.
4990 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004991static
Walter Dörwald4894c302003-10-24 14:25:28 +00004992int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004993 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004994{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004995 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004996 if (requiredsize > oldsize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004997 /* remember old output position */
4998 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4999 /* exponentially overallocate to minimize reallocations */
5000 if (requiredsize < 2 * oldsize)
5001 requiredsize = 2 * oldsize;
5002 if (PyUnicode_Resize(outobj, requiredsize) < 0)
5003 return -1;
5004 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005005 }
5006 return 0;
5007}
5008/* lookup the character, put the result in the output string and adjust
5009 various state variables. Return a new reference to the object that
5010 was put in the output buffer in *result, or Py_None, if the mapping was
5011 undefined (in which case no character was written).
5012 The called must decref result.
5013 Return 0 on success, -1 on error. */
5014static
Walter Dörwald4894c302003-10-24 14:25:28 +00005015int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005016 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5017 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005018{
Walter Dörwald4894c302003-10-24 14:25:28 +00005019 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005020 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005021 if (*res==NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005022 /* not found => default to 1:1 mapping */
5023 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005024 }
5025 else if (*res==Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005026 ;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005027 else if (PyInt_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005028 /* no overflow check, because we know that the space is enough */
5029 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005030 }
5031 else if (PyUnicode_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005032 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5033 if (repsize==1) {
5034 /* no overflow check, because we know that the space is enough */
5035 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5036 }
5037 else if (repsize!=0) {
5038 /* more than one character */
5039 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5040 (insize - (curinp-startinp)) +
5041 repsize - 1;
5042 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5043 return -1;
5044 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5045 *outp += repsize;
5046 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005047 }
5048 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005049 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005050 return 0;
5051}
5052
5053PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005054 Py_ssize_t size,
5055 PyObject *mapping,
5056 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005057{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005058 /* output object */
5059 PyObject *res = NULL;
5060 /* pointers to the beginning and end+1 of input */
5061 const Py_UNICODE *startp = p;
5062 const Py_UNICODE *endp = p + size;
5063 /* pointer into the output */
5064 Py_UNICODE *str;
5065 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005066 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005067 char *reason = "character maps to <undefined>";
5068 PyObject *errorHandler = NULL;
5069 PyObject *exc = NULL;
5070 /* the following variable is used for caching string comparisons
5071 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5072 * 3=ignore, 4=xmlcharrefreplace */
5073 int known_errorHandler = -1;
5074
Guido van Rossumd57fd912000-03-10 22:53:23 +00005075 if (mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005076 PyErr_BadArgument();
5077 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005078 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005079
5080 /* allocate enough for a simple 1:1 translation without
5081 replacements, if we need more, we'll resize */
5082 res = PyUnicode_FromUnicode(NULL, size);
5083 if (res == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005084 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005085 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005086 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005087 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005088
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005089 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005090 /* try to encode it */
5091 PyObject *x = NULL;
5092 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5093 Py_XDECREF(x);
5094 goto onError;
5095 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005096 Py_XDECREF(x);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005097 if (x!=Py_None) /* it worked => adjust input pointer */
5098 ++p;
5099 else { /* untranslatable character */
5100 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5101 Py_ssize_t repsize;
5102 Py_ssize_t newpos;
5103 Py_UNICODE *uni2;
5104 /* startpos for collecting untranslatable chars */
5105 const Py_UNICODE *collstart = p;
5106 const Py_UNICODE *collend = p+1;
5107 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005108
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005109 /* find all untranslatable characters */
5110 while (collend < endp) {
5111 if (charmaptranslate_lookup(*collend, mapping, &x))
5112 goto onError;
5113 Py_XDECREF(x);
5114 if (x!=Py_None)
5115 break;
5116 ++collend;
5117 }
5118 /* cache callback name lookup
5119 * (if not done yet, i.e. it's the first error) */
5120 if (known_errorHandler==-1) {
5121 if ((errors==NULL) || (!strcmp(errors, "strict")))
5122 known_errorHandler = 1;
5123 else if (!strcmp(errors, "replace"))
5124 known_errorHandler = 2;
5125 else if (!strcmp(errors, "ignore"))
5126 known_errorHandler = 3;
5127 else if (!strcmp(errors, "xmlcharrefreplace"))
5128 known_errorHandler = 4;
5129 else
5130 known_errorHandler = 0;
5131 }
5132 switch (known_errorHandler) {
5133 case 1: /* strict */
5134 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson857ce152009-01-31 16:29:18 +00005135 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005136 case 2: /* replace */
5137 /* No need to check for space, this is a 1:1 replacement */
5138 for (coll = collstart; coll<collend; ++coll)
5139 *str++ = '?';
5140 /* fall through */
5141 case 3: /* ignore */
5142 p = collend;
5143 break;
5144 case 4: /* xmlcharrefreplace */
5145 /* generate replacement (temporarily (mis)uses p) */
Serhiy Storchakae822b032013-08-06 16:56:26 +03005146 for (p = collstart; p < collend;) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005147 char buffer[2+29+1+1];
5148 char *cp;
Serhiy Storchakae822b032013-08-06 16:56:26 +03005149 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
5150 sprintf(buffer, "&#%d;", (int)ch);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005151 if (charmaptranslate_makespace(&res, &str,
5152 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5153 goto onError;
5154 for (cp = buffer; *cp; ++cp)
5155 *str++ = *cp;
5156 }
5157 p = collend;
5158 break;
5159 default:
5160 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5161 reason, startp, size, &exc,
5162 collstart-startp, collend-startp, &newpos);
5163 if (repunicode == NULL)
5164 goto onError;
5165 /* generate replacement */
5166 repsize = PyUnicode_GET_SIZE(repunicode);
5167 if (charmaptranslate_makespace(&res, &str,
5168 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5169 Py_DECREF(repunicode);
5170 goto onError;
5171 }
5172 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5173 *str++ = *uni2;
5174 p = startp + newpos;
5175 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00005176 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005177 }
5178 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005179 /* Resize if we allocated to much */
5180 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005181 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005182 if (PyUnicode_Resize(&res, respos) < 0)
5183 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005184 }
5185 Py_XDECREF(exc);
5186 Py_XDECREF(errorHandler);
5187 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005188
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005189 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005190 Py_XDECREF(res);
5191 Py_XDECREF(exc);
5192 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005193 return NULL;
5194}
5195
5196PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005197 PyObject *mapping,
5198 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005199{
5200 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005201
Guido van Rossumd57fd912000-03-10 22:53:23 +00005202 str = PyUnicode_FromObject(str);
5203 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005204 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005205 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005206 PyUnicode_GET_SIZE(str),
5207 mapping,
5208 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005209 Py_DECREF(str);
5210 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005211
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005212 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005213 Py_XDECREF(str);
5214 return NULL;
5215}
Tim Petersced69f82003-09-16 20:30:58 +00005216
Guido van Rossum9e896b32000-04-05 20:11:21 +00005217/* --- Decimal Encoder ---------------------------------------------------- */
5218
5219int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005220 Py_ssize_t length,
5221 char *output,
5222 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005223{
5224 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005225 PyObject *errorHandler = NULL;
5226 PyObject *exc = NULL;
5227 const char *encoding = "decimal";
5228 const char *reason = "invalid decimal Unicode string";
5229 /* the following variable is used for caching string comparisons
5230 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5231 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005232
5233 if (output == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005234 PyErr_BadArgument();
5235 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005236 }
5237
5238 p = s;
5239 end = s + length;
5240 while (p < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005241 register Py_UNICODE ch = *p;
5242 int decimal;
5243 PyObject *repunicode;
5244 Py_ssize_t repsize;
5245 Py_ssize_t newpos;
5246 Py_UNICODE *uni2;
5247 Py_UNICODE *collstart;
5248 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005249
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005250 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005251 *output++ = ' ';
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005252 ++p;
5253 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005254 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005255 decimal = Py_UNICODE_TODECIMAL(ch);
5256 if (decimal >= 0) {
5257 *output++ = '0' + decimal;
5258 ++p;
5259 continue;
5260 }
5261 if (0 < ch && ch < 256) {
5262 *output++ = (char)ch;
5263 ++p;
5264 continue;
5265 }
5266 /* All other characters are considered unencodable */
5267 collstart = p;
Victor Stinner975134e2011-11-22 01:54:19 +01005268 for (collend = p+1; collend < end; collend++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005269 if ((0 < *collend && *collend < 256) ||
Victor Stinner975134e2011-11-22 01:54:19 +01005270 Py_UNICODE_ISSPACE(*collend) ||
5271 0 <= Py_UNICODE_TODECIMAL(*collend))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005272 break;
5273 }
5274 /* cache callback name lookup
5275 * (if not done yet, i.e. it's the first error) */
5276 if (known_errorHandler==-1) {
5277 if ((errors==NULL) || (!strcmp(errors, "strict")))
5278 known_errorHandler = 1;
5279 else if (!strcmp(errors, "replace"))
5280 known_errorHandler = 2;
5281 else if (!strcmp(errors, "ignore"))
5282 known_errorHandler = 3;
5283 else if (!strcmp(errors, "xmlcharrefreplace"))
5284 known_errorHandler = 4;
5285 else
5286 known_errorHandler = 0;
5287 }
5288 switch (known_errorHandler) {
5289 case 1: /* strict */
5290 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5291 goto onError;
5292 case 2: /* replace */
5293 for (p = collstart; p < collend; ++p)
5294 *output++ = '?';
5295 /* fall through */
5296 case 3: /* ignore */
5297 p = collend;
5298 break;
5299 case 4: /* xmlcharrefreplace */
5300 /* generate replacement (temporarily (mis)uses p) */
Serhiy Storchakae822b032013-08-06 16:56:26 +03005301 for (p = collstart; p < collend;) {
5302 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
5303 output += sprintf(output, "&#%d;", ch);
5304 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005305 p = collend;
5306 break;
5307 default:
5308 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5309 encoding, reason, s, length, &exc,
5310 collstart-s, collend-s, &newpos);
5311 if (repunicode == NULL)
5312 goto onError;
5313 /* generate replacement */
5314 repsize = PyUnicode_GET_SIZE(repunicode);
5315 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5316 Py_UNICODE ch = *uni2;
5317 if (Py_UNICODE_ISSPACE(ch))
5318 *output++ = ' ';
5319 else {
5320 decimal = Py_UNICODE_TODECIMAL(ch);
5321 if (decimal >= 0)
5322 *output++ = '0' + decimal;
5323 else if (0 < ch && ch < 256)
5324 *output++ = (char)ch;
5325 else {
5326 Py_DECREF(repunicode);
5327 raise_encode_exception(&exc, encoding,
5328 s, length, collstart-s, collend-s, reason);
5329 goto onError;
5330 }
5331 }
5332 }
5333 p = s + newpos;
5334 Py_DECREF(repunicode);
5335 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005336 }
5337 /* 0-terminate the output string */
5338 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005339 Py_XDECREF(exc);
5340 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005341 return 0;
5342
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005343 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005344 Py_XDECREF(exc);
5345 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005346 return -1;
5347}
5348
Guido van Rossumd57fd912000-03-10 22:53:23 +00005349/* --- Helpers ------------------------------------------------------------ */
5350
Eric Smitha9f7d622008-02-17 19:46:49 +00005351#include "stringlib/unicodedefs.h"
Fredrik Lundha50d2012006-05-26 17:04:58 +00005352#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005353
5354#include "stringlib/count.h"
5355#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005356#include "stringlib/partition.h"
Antoine Pitrou64672132010-01-13 07:55:48 +00005357#include "stringlib/split.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005358
/* Helper macro to fix up start/end slice values in place.

   end:   values greater than len become len; negative values are taken
          relative to len and clamped at 0.
   start: negative values are taken relative to len and clamped at 0;
          values greater than len are left unchanged.

   start and end must be modifiable lvalues; every argument is evaluated
   more than once.  The body is wrapped in do { } while (0) so that the
   macro behaves like a single statement, including inside an unbraced
   if/else; it must be followed by a semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Fredrik Lundhc8162812006-05-26 19:33:03 +00005373
Martin v. Löwis18e16552006-02-15 17:27:45 +00005374Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005375 PyObject *substr,
5376 Py_ssize_t start,
5377 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005378{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005379 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005380 PyUnicodeObject* str_obj;
5381 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005382
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005383 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5384 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005385 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005386 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5387 if (!sub_obj) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005388 Py_DECREF(str_obj);
5389 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005390 }
Tim Petersced69f82003-09-16 20:30:58 +00005391
Antoine Pitrou64672132010-01-13 07:55:48 +00005392 ADJUST_INDICES(start, end, str_obj->length);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005393 result = stringlib_count(
Antoine Pitrou64672132010-01-13 07:55:48 +00005394 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
5395 PY_SSIZE_T_MAX
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005396 );
5397
5398 Py_DECREF(sub_obj);
5399 Py_DECREF(str_obj);
5400
Guido van Rossumd57fd912000-03-10 22:53:23 +00005401 return result;
5402}
5403
Martin v. Löwis18e16552006-02-15 17:27:45 +00005404Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005405 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005406 Py_ssize_t start,
5407 Py_ssize_t end,
5408 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005409{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005410 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005411
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005412 str = PyUnicode_FromObject(str);
5413 if (!str)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005414 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005415 sub = PyUnicode_FromObject(sub);
5416 if (!sub) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005417 Py_DECREF(str);
5418 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005419 }
Tim Petersced69f82003-09-16 20:30:58 +00005420
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005421 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005422 result = stringlib_find_slice(
5423 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5424 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5425 start, end
5426 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005427 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005428 result = stringlib_rfind_slice(
5429 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5430 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5431 start, end
5432 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005433
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005434 Py_DECREF(str);
5435 Py_DECREF(sub);
5436
Guido van Rossumd57fd912000-03-10 22:53:23 +00005437 return result;
5438}
5439
Tim Petersced69f82003-09-16 20:30:58 +00005440static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005441int tailmatch(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005442 PyUnicodeObject *substring,
5443 Py_ssize_t start,
5444 Py_ssize_t end,
5445 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005446{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005447 if (substring->length == 0)
5448 return 1;
5449
Antoine Pitrou64672132010-01-13 07:55:48 +00005450 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005451 end -= substring->length;
5452 if (end < start)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005453 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005454
5455 if (direction > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005456 if (Py_UNICODE_MATCH(self, end, substring))
5457 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005458 } else {
5459 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005460 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005461 }
5462
5463 return 0;
5464}
5465
Martin v. Löwis18e16552006-02-15 17:27:45 +00005466Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005467 PyObject *substr,
5468 Py_ssize_t start,
5469 Py_ssize_t end,
5470 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005471{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005472 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005473
Guido van Rossumd57fd912000-03-10 22:53:23 +00005474 str = PyUnicode_FromObject(str);
5475 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005476 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005477 substr = PyUnicode_FromObject(substr);
5478 if (substr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005479 Py_DECREF(str);
5480 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005481 }
Tim Petersced69f82003-09-16 20:30:58 +00005482
Guido van Rossumd57fd912000-03-10 22:53:23 +00005483 result = tailmatch((PyUnicodeObject *)str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005484 (PyUnicodeObject *)substr,
5485 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005486 Py_DECREF(str);
5487 Py_DECREF(substr);
5488 return result;
5489}
5490
Guido van Rossumd57fd912000-03-10 22:53:23 +00005491/* Apply fixfct filter to the Unicode object self and return a
5492 reference to the modified object */
5493
Tim Petersced69f82003-09-16 20:30:58 +00005494static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005495PyObject *fixup(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005496 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005497{
5498
5499 PyUnicodeObject *u;
5500
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005501 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005502 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005503 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005504
5505 Py_UNICODE_COPY(u->str, self->str, self->length);
5506
Tim Peters7a29bd52001-09-12 03:03:31 +00005507 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005508 /* fixfct should return TRUE if it modified the buffer. If
5509 FALSE, return a reference to the original buffer instead
5510 (to save space, not time) */
5511 Py_INCREF(self);
5512 Py_DECREF(u);
5513 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005514 }
5515 return (PyObject*) u;
5516}
5517
Tim Petersced69f82003-09-16 20:30:58 +00005518static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005519int fixupper(PyUnicodeObject *self)
5520{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005521 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005522 Py_UNICODE *s = self->str;
5523 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005524
Guido van Rossumd57fd912000-03-10 22:53:23 +00005525 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005526 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005527
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005528 ch = Py_UNICODE_TOUPPER(*s);
5529 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005530 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005531 *s = ch;
5532 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005533 s++;
5534 }
5535
5536 return status;
5537}
5538
Tim Petersced69f82003-09-16 20:30:58 +00005539static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005540int fixlower(PyUnicodeObject *self)
5541{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005542 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005543 Py_UNICODE *s = self->str;
5544 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005545
Guido van Rossumd57fd912000-03-10 22:53:23 +00005546 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005547 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005548
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005549 ch = Py_UNICODE_TOLOWER(*s);
5550 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005551 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005552 *s = ch;
5553 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005554 s++;
5555 }
5556
5557 return status;
5558}
5559
Tim Petersced69f82003-09-16 20:30:58 +00005560static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005561int fixswapcase(PyUnicodeObject *self)
5562{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005563 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005564 Py_UNICODE *s = self->str;
5565 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005566
Guido van Rossumd57fd912000-03-10 22:53:23 +00005567 while (len-- > 0) {
5568 if (Py_UNICODE_ISUPPER(*s)) {
5569 *s = Py_UNICODE_TOLOWER(*s);
5570 status = 1;
5571 } else if (Py_UNICODE_ISLOWER(*s)) {
5572 *s = Py_UNICODE_TOUPPER(*s);
5573 status = 1;
5574 }
5575 s++;
5576 }
5577
5578 return status;
5579}
5580
Tim Petersced69f82003-09-16 20:30:58 +00005581static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005582int fixcapitalize(PyUnicodeObject *self)
5583{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005584 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005585 Py_UNICODE *s = self->str;
5586 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005587
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005588 if (len == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005589 return 0;
Ezio Melotti15d6b652011-08-15 09:22:24 +03005590 if (!Py_UNICODE_ISUPPER(*s)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005591 *s = Py_UNICODE_TOUPPER(*s);
5592 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005593 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005594 s++;
5595 while (--len > 0) {
Ezio Melotti15d6b652011-08-15 09:22:24 +03005596 if (!Py_UNICODE_ISLOWER(*s)) {
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005597 *s = Py_UNICODE_TOLOWER(*s);
5598 status = 1;
5599 }
5600 s++;
5601 }
5602 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005603}
5604
5605static
5606int fixtitle(PyUnicodeObject *self)
5607{
5608 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5609 register Py_UNICODE *e;
5610 int previous_is_cased;
5611
5612 /* Shortcut for single character strings */
5613 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005614 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5615 if (*p != ch) {
5616 *p = ch;
5617 return 1;
5618 }
5619 else
5620 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005621 }
Tim Petersced69f82003-09-16 20:30:58 +00005622
Guido van Rossumd57fd912000-03-10 22:53:23 +00005623 e = p + PyUnicode_GET_SIZE(self);
5624 previous_is_cased = 0;
5625 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005626 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005627
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005628 if (previous_is_cased)
5629 *p = Py_UNICODE_TOLOWER(ch);
5630 else
5631 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005632
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005633 if (Py_UNICODE_ISLOWER(ch) ||
5634 Py_UNICODE_ISUPPER(ch) ||
5635 Py_UNICODE_ISTITLE(ch))
5636 previous_is_cased = 1;
5637 else
5638 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005639 }
5640 return 1;
5641}
5642
Tim Peters8ce9f162004-08-27 01:49:32 +00005643PyObject *
5644PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005645{
Tim Peters8ce9f162004-08-27 01:49:32 +00005646 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005647 const Py_UNICODE blank = ' ';
5648 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00005649 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005650 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00005651 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5652 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005653 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5654 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005655 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005656 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005657 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005658
Benjamin Peterson7a91bf82014-02-15 13:02:52 -05005659 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00005660 if (fseq == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005661 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005662 }
5663
Tim Peters91879ab2004-08-27 22:35:44 +00005664 /* Grrrr. A codec may be invoked to convert str objects to
5665 * Unicode, and so it's possible to call back into Python code
5666 * during PyUnicode_FromObject(), and so it's possible for a sick
5667 * codec to change the size of fseq (if seq is a list). Therefore
5668 * we have to keep refetching the size -- can't assume seqlen
5669 * is invariant.
5670 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005671 seqlen = PySequence_Fast_GET_SIZE(fseq);
5672 /* If empty sequence, return u"". */
5673 if (seqlen == 0) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005674 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5675 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00005676 }
5677 /* If singleton sequence with an exact Unicode, return that. */
5678 if (seqlen == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005679 item = PySequence_Fast_GET_ITEM(fseq, 0);
5680 if (PyUnicode_CheckExact(item)) {
5681 Py_INCREF(item);
5682 res = (PyUnicodeObject *)item;
5683 goto Done;
5684 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005685 }
5686
Tim Peters05eba1f2004-08-27 21:32:02 +00005687 /* At least two items to join, or one that isn't exact Unicode. */
5688 if (seqlen > 1) {
5689 /* Set up sep and seplen -- they're needed. */
Benjamin Peterson857ce152009-01-31 16:29:18 +00005690 if (separator == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005691 sep = &blank;
5692 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005693 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005694 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005695 internal_separator = PyUnicode_FromObject(separator);
5696 if (internal_separator == NULL)
5697 goto onError;
5698 sep = PyUnicode_AS_UNICODE(internal_separator);
5699 seplen = PyUnicode_GET_SIZE(internal_separator);
5700 /* In case PyUnicode_FromObject() mutated seq. */
5701 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005702 }
5703 }
5704
5705 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005706 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005707 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005708 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005709 res_p = PyUnicode_AS_UNICODE(res);
5710 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005711
Tim Peters05eba1f2004-08-27 21:32:02 +00005712 for (i = 0; i < seqlen; ++i) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005713 Py_ssize_t itemlen;
5714 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005715
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005716 item = PySequence_Fast_GET_ITEM(fseq, i);
5717 /* Convert item to Unicode. */
5718 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5719 PyErr_Format(PyExc_TypeError,
5720 "sequence item %zd: expected string or Unicode,"
5721 " %.80s found",
5722 i, Py_TYPE(item)->tp_name);
5723 goto onError;
5724 }
5725 item = PyUnicode_FromObject(item);
5726 if (item == NULL)
5727 goto onError;
5728 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005729
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005730 /* In case PyUnicode_FromObject() mutated seq. */
5731 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters91879ab2004-08-27 22:35:44 +00005732
Tim Peters8ce9f162004-08-27 01:49:32 +00005733 /* Make sure we have enough space for the separator and the item. */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005734 itemlen = PyUnicode_GET_SIZE(item);
Xiang Zhang7bdb5162017-01-09 11:13:20 +08005735 if (res_used > PY_SSIZE_T_MAX - itemlen)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005736 goto Overflow;
Xiang Zhang7bdb5162017-01-09 11:13:20 +08005737 new_res_used = res_used + itemlen;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005738 if (i < seqlen - 1) {
Xiang Zhang7bdb5162017-01-09 11:13:20 +08005739 if (new_res_used > PY_SSIZE_T_MAX - seplen)
Benjamin Peterson857ce152009-01-31 16:29:18 +00005740 goto Overflow;
Xiang Zhang7bdb5162017-01-09 11:13:20 +08005741 new_res_used += seplen;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005742 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005743 if (new_res_used > res_alloc) {
5744 /* double allocated size until it's big enough */
5745 do {
Xiang Zhang7bdb5162017-01-09 11:13:20 +08005746 if (res_alloc > PY_SSIZE_T_MAX / 2)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005747 goto Overflow;
Xiang Zhang7bdb5162017-01-09 11:13:20 +08005748 res_alloc += res_alloc;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005749 } while (new_res_used > res_alloc);
5750 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5751 Py_DECREF(item);
5752 goto onError;
5753 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005754 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005755 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005756
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005757 /* Copy item, and maybe the separator. */
5758 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5759 res_p += itemlen;
5760 if (i < seqlen - 1) {
5761 Py_UNICODE_COPY(res_p, sep, seplen);
5762 res_p += seplen;
5763 }
5764 Py_DECREF(item);
5765 res_used = new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005766 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005767
Tim Peters05eba1f2004-08-27 21:32:02 +00005768 /* Shrink res to match the used area; this probably can't fail,
5769 * but it's cheap to check.
5770 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005771 if (_PyUnicode_Resize(&res, res_used) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005772 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005773
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005774 Done:
Tim Peters8ce9f162004-08-27 01:49:32 +00005775 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005776 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005777 return (PyObject *)res;
5778
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005779 Overflow:
Tim Peters8ce9f162004-08-27 01:49:32 +00005780 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00005781 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005782 Py_DECREF(item);
5783 /* fall through */
5784
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005785 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005786 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005787 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005788 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005789 return NULL;
5790}
5791
Tim Petersced69f82003-09-16 20:30:58 +00005792static
5793PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005794 Py_ssize_t left,
5795 Py_ssize_t right,
5796 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005797{
5798 PyUnicodeObject *u;
5799
5800 if (left < 0)
5801 left = 0;
5802 if (right < 0)
5803 right = 0;
5804
Tim Peters7a29bd52001-09-12 03:03:31 +00005805 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005806 Py_INCREF(self);
5807 return self;
5808 }
5809
Neal Norwitze7d8be82008-07-31 17:17:14 +00005810 if (left > PY_SSIZE_T_MAX - self->length ||
5811 right > PY_SSIZE_T_MAX - (left + self->length)) {
5812 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5813 return NULL;
5814 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005815 u = _PyUnicode_New(left + self->length + right);
5816 if (u) {
5817 if (left)
5818 Py_UNICODE_FILL(u->str, fill, left);
5819 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5820 if (right)
5821 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5822 }
5823
5824 return u;
5825}
5826
Antoine Pitrou64672132010-01-13 07:55:48 +00005827PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005828{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005829 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005830
5831 string = PyUnicode_FromObject(string);
5832 if (string == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005833 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005834
Antoine Pitrou64672132010-01-13 07:55:48 +00005835 list = stringlib_splitlines(
5836 (PyObject*) string, PyUnicode_AS_UNICODE(string),
5837 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005838
5839 Py_DECREF(string);
5840 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005841}
5842
Tim Petersced69f82003-09-16 20:30:58 +00005843static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005844PyObject *split(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005845 PyUnicodeObject *substring,
5846 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005847{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005848 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005849 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005850
Guido van Rossumd57fd912000-03-10 22:53:23 +00005851 if (substring == NULL)
Antoine Pitrou64672132010-01-13 07:55:48 +00005852 return stringlib_split_whitespace(
5853 (PyObject*) self, self->str, self->length, maxcount
5854 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005855
Antoine Pitrou64672132010-01-13 07:55:48 +00005856 return stringlib_split(
5857 (PyObject*) self, self->str, self->length,
5858 substring->str, substring->length,
5859 maxcount
5860 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005861}
5862
Tim Petersced69f82003-09-16 20:30:58 +00005863static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005864PyObject *rsplit(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005865 PyUnicodeObject *substring,
5866 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005867{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005868 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005869 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005870
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005871 if (substring == NULL)
Antoine Pitrou64672132010-01-13 07:55:48 +00005872 return stringlib_rsplit_whitespace(
5873 (PyObject*) self, self->str, self->length, maxcount
5874 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005875
Antoine Pitrou64672132010-01-13 07:55:48 +00005876 return stringlib_rsplit(
5877 (PyObject*) self, self->str, self->length,
5878 substring->str, substring->length,
5879 maxcount
5880 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005881}
5882
5883static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005884PyObject *replace(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005885 PyUnicodeObject *str1,
5886 PyUnicodeObject *str2,
5887 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005888{
5889 PyUnicodeObject *u;
5890
5891 if (maxcount < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005892 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrou64672132010-01-13 07:55:48 +00005893 else if (maxcount == 0 || self->length == 0)
5894 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005895
Fredrik Lundh347ee272006-05-24 16:35:18 +00005896 if (str1->length == str2->length) {
Antoine Pitrou5c767c22010-01-13 08:55:20 +00005897 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005898 /* same length */
Antoine Pitrou64672132010-01-13 07:55:48 +00005899 if (str1->length == 0)
5900 goto nothing;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005901 if (str1->length == 1) {
5902 /* replace characters */
5903 Py_UNICODE u1, u2;
5904 if (!findchar(self->str, self->length, str1->str[0]))
5905 goto nothing;
5906 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5907 if (!u)
5908 return NULL;
5909 Py_UNICODE_COPY(u->str, self->str, self->length);
5910 u1 = str1->str[0];
5911 u2 = str2->str[0];
5912 for (i = 0; i < u->length; i++)
5913 if (u->str[i] == u1) {
5914 if (--maxcount < 0)
5915 break;
5916 u->str[i] = u2;
5917 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005918 } else {
Antoine Pitrou64672132010-01-13 07:55:48 +00005919 i = stringlib_find(
5920 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00005921 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005922 if (i < 0)
5923 goto nothing;
5924 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5925 if (!u)
5926 return NULL;
5927 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrou64672132010-01-13 07:55:48 +00005928
5929 /* change everything in-place, starting with this one */
5930 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5931 i += str1->length;
5932
5933 while ( --maxcount > 0) {
5934 i = stringlib_find(self->str+i, self->length-i,
5935 str1->str, str1->length,
5936 i);
5937 if (i == -1)
5938 break;
5939 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5940 i += str1->length;
5941 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005942 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005943 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005944
Brett Cannona7f13ee2010-05-04 01:16:51 +00005945 Py_ssize_t n, i, j;
Xiang Zhang7bdb5162017-01-09 11:13:20 +08005946 Py_ssize_t new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005947 Py_UNICODE *p;
5948
5949 /* replace strings */
Antoine Pitrou64672132010-01-13 07:55:48 +00005950 n = stringlib_count(self->str, self->length, str1->str, str1->length,
5951 maxcount);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005952 if (n == 0)
5953 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005954 /* new_size = self->length + n * (str2->length - str1->length)); */
5955 delta = (str2->length - str1->length);
5956 if (delta == 0) {
5957 new_size = self->length;
5958 } else {
Xiang Zhang7bdb5162017-01-09 11:13:20 +08005959 assert(n > 0);
5960 if (delta > (PY_SSIZE_T_MAX - self->length) / n) {
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005961 PyErr_SetString(PyExc_OverflowError,
5962 "replace string is too long");
5963 return NULL;
5964 }
Xiang Zhang7bdb5162017-01-09 11:13:20 +08005965 new_size = self->length + delta * n;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005966 }
5967 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005968 if (!u)
5969 return NULL;
5970 i = 0;
5971 p = u->str;
5972 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005973 while (n-- > 0) {
5974 /* look for next match */
Antoine Pitrou64672132010-01-13 07:55:48 +00005975 j = stringlib_find(self->str+i, self->length-i,
5976 str1->str, str1->length,
5977 i);
5978 if (j == -1)
5979 break;
5980 else if (j > i) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005981 /* copy unchanged part [i:j] */
5982 Py_UNICODE_COPY(p, self->str+i, j-i);
5983 p += j - i;
5984 }
5985 /* copy substitution string */
5986 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005987 Py_UNICODE_COPY(p, str2->str, str2->length);
5988 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005989 }
5990 i = j + str1->length;
5991 }
5992 if (i < self->length)
5993 /* copy tail [i:] */
5994 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005995 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005996 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00005997 while (n > 0) {
5998 Py_UNICODE_COPY(p, str2->str, str2->length);
5999 p += str2->length;
6000 if (--n <= 0)
6001 break;
6002 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006003 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00006004 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006005 }
6006 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006007 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00006008
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006009 nothing:
Fredrik Lundh347ee272006-05-24 16:35:18 +00006010 /* nothing to replace; return original string (when possible) */
6011 if (PyUnicode_CheckExact(self)) {
6012 Py_INCREF(self);
6013 return (PyObject *) self;
6014 }
6015 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006016}
6017
6018/* --- Unicode Object Methods --------------------------------------------- */
6019
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006020PyDoc_STRVAR(title__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006021 "S.title() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006022\n\
6023Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006024characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006025
6026static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006027unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006029 return fixup(self, fixtitle);
6030}
6031
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006032PyDoc_STRVAR(capitalize__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006033 "S.capitalize() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006034\n\
6035Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumaran5261b102010-07-05 12:04:07 +00006036have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006037
6038static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006039unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006040{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041 return fixup(self, fixcapitalize);
6042}
6043
#if 0
/* Disabled: this whole section is compiled out. */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self on whitespace, capitalize every word in place inside the
   resulting list, then join the list again.  Returns NULL if any step
   fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *out = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list slot */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *old = PyList_GET_ITEM(words, idx);

        out = fixup((PyUnicodeObject *)old, fixcapitalize);
        if (out == NULL)
            goto done;
        Py_DECREF(old);
        PyList_SET_ITEM(words, idx, out);
    }

    /* Join the words to form a new string */
    out = PyUnicode_Join(NULL, words);

  done:
    Py_DECREF(words);
    return out;
}
#endif
6081
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006082/* Argument converter. Coerces to a single unicode character */
6083
6084static int
6085convert_uc(PyObject *obj, void *addr)
6086{
Benjamin Peterson857ce152009-01-31 16:29:18 +00006087 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6088 PyObject *uniobj;
6089 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006090
Benjamin Peterson857ce152009-01-31 16:29:18 +00006091 uniobj = PyUnicode_FromObject(obj);
6092 if (uniobj == NULL) {
6093 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006094 "The fill character cannot be converted to Unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00006095 return 0;
6096 }
6097 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6098 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006099 "The fill character must be exactly one character long");
Benjamin Peterson857ce152009-01-31 16:29:18 +00006100 Py_DECREF(uniobj);
6101 return 0;
6102 }
6103 unistr = PyUnicode_AS_UNICODE(uniobj);
6104 *fillcharloc = unistr[0];
6105 Py_DECREF(uniobj);
6106 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006107}
6108
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006109PyDoc_STRVAR(center__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006110 "S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006111\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006112Return S centered in a Unicode string of length width. Padding is\n\
6113done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006114
6115static PyObject *
6116unicode_center(PyUnicodeObject *self, PyObject *args)
6117{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006118 Py_ssize_t marg, left;
6119 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006120 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006121
Thomas Woutersde017742006-02-16 19:34:37 +00006122 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006123 return NULL;
6124
Tim Peters7a29bd52001-09-12 03:03:31 +00006125 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006126 Py_INCREF(self);
6127 return (PyObject*) self;
6128 }
6129
6130 marg = width - self->length;
6131 left = marg / 2 + (marg & width & 1);
6132
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006133 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006134}
6135
Marc-André Lemburge5034372000-08-08 08:04:29 +00006136#if 0
6137
6138/* This code should go into some future Unicode collation support
6139 module. The basic comparison should compare ordinals on a naive
Georg Brandl18187e22009-06-06 18:21:58 +00006140 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006141
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006142/* speedy UTF-16 code point order comparison */
6143/* gleaned from: */
6144/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6145
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006146static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006147{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006148 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006149 0, 0, 0, 0, 0, 0, 0, 0,
6150 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006151 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006152};
6153
Guido van Rossumd57fd912000-03-10 22:53:23 +00006154static int
6155unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6156{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006157 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006158
Guido van Rossumd57fd912000-03-10 22:53:23 +00006159 Py_UNICODE *s1 = str1->str;
6160 Py_UNICODE *s2 = str2->str;
6161
6162 len1 = str1->length;
6163 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006164
Guido van Rossumd57fd912000-03-10 22:53:23 +00006165 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006166 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006167
6168 c1 = *s1++;
6169 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006170
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006171 if (c1 > (1<<11) * 26)
6172 c1 += utf16Fixup[c1>>11];
6173 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006174 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006175 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006176
6177 if (c1 != c2)
6178 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006179
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006180 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006181 }
6182
6183 return (len1 < len2) ? -1 : (len1 != len2);
6184}
6185
Marc-André Lemburge5034372000-08-08 08:04:29 +00006186#else
6187
6188static int
6189unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6190{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006191 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006192
6193 Py_UNICODE *s1 = str1->str;
6194 Py_UNICODE *s2 = str2->str;
6195
6196 len1 = str1->length;
6197 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006198
Marc-André Lemburge5034372000-08-08 08:04:29 +00006199 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006200 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006201
Fredrik Lundh45714e92001-06-26 16:39:36 +00006202 c1 = *s1++;
6203 c2 = *s2++;
6204
6205 if (c1 != c2)
6206 return (c1 < c2) ? -1 : 1;
6207
Marc-André Lemburge5034372000-08-08 08:04:29 +00006208 len1--; len2--;
6209 }
6210
6211 return (len1 < len2) ? -1 : (len1 != len2);
6212}
6213
6214#endif
6215
Guido van Rossumd57fd912000-03-10 22:53:23 +00006216int PyUnicode_Compare(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006217 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006218{
6219 PyUnicodeObject *u = NULL, *v = NULL;
6220 int result;
6221
6222 /* Coerce the two arguments */
6223 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6224 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006225 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006226 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6227 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006228 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006229
Thomas Wouters7e474022000-07-16 12:04:32 +00006230 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006231 if (v == u) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006232 Py_DECREF(u);
6233 Py_DECREF(v);
6234 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006235 }
6236
6237 result = unicode_compare(u, v);
6238
6239 Py_DECREF(u);
6240 Py_DECREF(v);
6241 return result;
6242
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006243 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006244 Py_XDECREF(u);
6245 Py_XDECREF(v);
6246 return -1;
6247}
6248
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006249PyObject *PyUnicode_RichCompare(PyObject *left,
6250 PyObject *right,
6251 int op)
6252{
6253 int result;
6254
6255 result = PyUnicode_Compare(left, right);
6256 if (result == -1 && PyErr_Occurred())
6257 goto onError;
6258
6259 /* Convert the return value to a Boolean */
6260 switch (op) {
6261 case Py_EQ:
6262 result = (result == 0);
6263 break;
6264 case Py_NE:
6265 result = (result != 0);
6266 break;
6267 case Py_LE:
6268 result = (result <= 0);
6269 break;
6270 case Py_GE:
6271 result = (result >= 0);
6272 break;
6273 case Py_LT:
6274 result = (result == -1);
6275 break;
6276 case Py_GT:
6277 result = (result == 1);
6278 break;
6279 }
6280 return PyBool_FromLong(result);
6281
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006282 onError:
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006283
6284 /* Standard case
6285
6286 Type errors mean that PyUnicode_FromObject() could not convert
6287 one of the arguments (usually the right hand side) to Unicode,
6288 ie. we can't handle the comparison request. However, it is
6289 possible that the other object knows a comparison method, which
6290 is why we return Py_NotImplemented to give the other object a
6291 chance.
6292
6293 */
6294 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6295 PyErr_Clear();
6296 Py_INCREF(Py_NotImplemented);
6297 return Py_NotImplemented;
6298 }
6299 if (op != Py_EQ && op != Py_NE)
6300 return NULL;
6301
6302 /* Equality comparison.
6303
6304 This is a special case: we silence any PyExc_UnicodeDecodeError
6305 and instead turn it into a PyErr_UnicodeWarning.
6306
6307 */
6308 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6309 return NULL;
6310 PyErr_Clear();
Benjamin Peterson857ce152009-01-31 16:29:18 +00006311 if (PyErr_Warn(PyExc_UnicodeWarning,
6312 (op == Py_EQ) ?
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006313 "Unicode equal comparison "
6314 "failed to convert both arguments to Unicode - "
6315 "interpreting them as being unequal" :
6316 "Unicode unequal comparison "
6317 "failed to convert both arguments to Unicode - "
6318 "interpreting them as being unequal"
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006319 ) < 0)
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006320 return NULL;
6321 result = (op == Py_NE);
6322 return PyBool_FromLong(result);
6323}
6324
Guido van Rossum403d68b2000-03-13 15:55:09 +00006325int PyUnicode_Contains(PyObject *container,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006326 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006327{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006328 PyObject *str, *sub;
6329 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006330
6331 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006332 sub = PyUnicode_FromObject(element);
6333 if (!sub) {
Fredrik Lundh833bf942006-05-23 10:12:21 +00006334 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006335 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00006336
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006337 str = PyUnicode_FromObject(container);
6338 if (!str) {
6339 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00006340 return -1;
6341 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00006342
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006343 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00006344
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006345 Py_DECREF(str);
6346 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00006347
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006348 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006349}
6350
Guido van Rossumd57fd912000-03-10 22:53:23 +00006351/* Concat to string or Unicode object giving a new Unicode object. */
6352
6353PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006354 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006355{
6356 PyUnicodeObject *u = NULL, *v = NULL, *w;
6357
6358 /* Coerce the two arguments */
6359 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6360 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006361 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006362 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6363 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006364 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006365
6366 /* Shortcuts */
6367 if (v == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006368 Py_DECREF(v);
6369 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006370 }
6371 if (u == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006372 Py_DECREF(u);
6373 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006374 }
6375
Serhiy Storchaka373773d2016-07-12 15:46:57 +03006376 if (u->length > PY_SSIZE_T_MAX - v->length) {
6377 PyErr_SetString(PyExc_OverflowError,
6378 "strings are too large to concat");
6379 goto onError;
6380 }
6381
Guido van Rossumd57fd912000-03-10 22:53:23 +00006382 /* Concat the two Unicode strings */
6383 w = _PyUnicode_New(u->length + v->length);
6384 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006385 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006386 Py_UNICODE_COPY(w->str, u->str, u->length);
6387 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6388
6389 Py_DECREF(u);
6390 Py_DECREF(v);
6391 return (PyObject *)w;
6392
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006393 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006394 Py_XDECREF(u);
6395 Py_XDECREF(v);
6396 return NULL;
6397}
6398
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006399PyDoc_STRVAR(count__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006400 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006401\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00006402Return the number of non-overlapping occurrences of substring sub in\n\
6403Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006404interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006405
6406static PyObject *
6407unicode_count(PyUnicodeObject *self, PyObject *args)
6408{
6409 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006410 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006411 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006412 PyObject *result;
6413
Jesus Cea44e81682011-04-20 16:39:15 +02006414 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
6415 &start, &end))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006416 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006417
Antoine Pitrou64672132010-01-13 07:55:48 +00006418 ADJUST_INDICES(start, end, self->length);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006419 result = PyInt_FromSsize_t(
6420 stringlib_count(self->str + start, end - start,
Antoine Pitrou64672132010-01-13 07:55:48 +00006421 substring->str, substring->length,
6422 PY_SSIZE_T_MAX)
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006423 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006424
6425 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006426
Guido van Rossumd57fd912000-03-10 22:53:23 +00006427 return result;
6428}
6429
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006430PyDoc_STRVAR(encode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006431 "S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006432\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006433Encodes S using the codec registered for encoding. encoding defaults\n\
6434to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006435handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006436a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6437'xmlcharrefreplace' as well as any other name registered with\n\
6438codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006439
6440static PyObject *
Benjamin Peterson332d7212009-09-18 21:14:55 +00006441unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006442{
Benjamin Peterson332d7212009-09-18 21:14:55 +00006443 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00006444 char *encoding = NULL;
6445 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006446 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006447
Benjamin Peterson332d7212009-09-18 21:14:55 +00006448 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
6449 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006450 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006451 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006452 if (v == NULL)
6453 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006454 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006455 PyErr_Format(PyExc_TypeError,
6456 "encoder did not return a string/unicode object "
6457 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006458 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006459 Py_DECREF(v);
6460 return NULL;
6461 }
6462 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006463
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006464 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006465 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006466}
6467
6468PyDoc_STRVAR(decode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006469 "S.decode([encoding[,errors]]) -> string or unicode\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006470\n\
6471Decodes S using the codec registered for encoding. encoding defaults\n\
6472to the default encoding. errors may be given to set a different error\n\
6473handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6474a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
Éric Araujoa4c81b02012-02-20 02:07:31 +01006475as well as any other name registered with codecs.register_error that is\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006476able to handle UnicodeDecodeErrors.");
6477
6478static PyObject *
Benjamin Peterson332d7212009-09-18 21:14:55 +00006479unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006480{
Benjamin Peterson332d7212009-09-18 21:14:55 +00006481 static char *kwlist[] = {"encoding", "errors", 0};
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006482 char *encoding = NULL;
6483 char *errors = NULL;
6484 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006485
Benjamin Peterson332d7212009-09-18 21:14:55 +00006486 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",
6487 kwlist, &encoding, &errors))
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006488 return NULL;
6489 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006490 if (v == NULL)
6491 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006492 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006493 PyErr_Format(PyExc_TypeError,
6494 "decoder did not return a string/unicode object "
6495 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006496 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006497 Py_DECREF(v);
6498 return NULL;
6499 }
6500 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006501
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006502 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006503 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006504}
6505
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006506PyDoc_STRVAR(expandtabs__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006507 "S.expandtabs([tabsize]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006508\n\
6509Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006510If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006511
6512static PyObject*
6513unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6514{
6515 Py_UNICODE *e;
6516 Py_UNICODE *p;
6517 Py_UNICODE *q;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006518 Py_UNICODE *qe;
6519 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006520 PyUnicodeObject *u;
6521 int tabsize = 8;
6522
6523 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006524 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006525
Thomas Wouters7e474022000-07-16 12:04:32 +00006526 /* First pass: determine size of output string */
Guido van Rossum5bdff602008-03-11 21:18:06 +00006527 i = 0; /* chars up to and including most recent \n or \r */
6528 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6529 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006530 for (p = self->str; p < e; p++)
6531 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006532 if (tabsize > 0) {
6533 incr = tabsize - (j % tabsize); /* cannot overflow */
6534 if (j > PY_SSIZE_T_MAX - incr)
6535 goto overflow1;
6536 j += incr;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006537 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006538 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006539 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006540 if (j > PY_SSIZE_T_MAX - 1)
6541 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006542 j++;
6543 if (*p == '\n' || *p == '\r') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006544 if (i > PY_SSIZE_T_MAX - j)
6545 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006546 i += j;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006547 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006548 }
6549 }
6550
Guido van Rossum5bdff602008-03-11 21:18:06 +00006551 if (i > PY_SSIZE_T_MAX - j)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006552 goto overflow1;
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006553
Guido van Rossumd57fd912000-03-10 22:53:23 +00006554 /* Second pass: create output string and fill it */
6555 u = _PyUnicode_New(i + j);
6556 if (!u)
6557 return NULL;
6558
Guido van Rossum5bdff602008-03-11 21:18:06 +00006559 j = 0; /* same as in first pass */
6560 q = u->str; /* next output char */
6561 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006562
6563 for (p = self->str; p < e; p++)
6564 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006565 if (tabsize > 0) {
6566 i = tabsize - (j % tabsize);
6567 j += i;
6568 while (i--) {
6569 if (q >= qe)
6570 goto overflow2;
6571 *q++ = ' ';
Guido van Rossum5bdff602008-03-11 21:18:06 +00006572 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006573 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00006574 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006575 else {
6576 if (q >= qe)
6577 goto overflow2;
6578 *q++ = *p;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006579 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006580 if (*p == '\n' || *p == '\r')
6581 j = 0;
6582 }
6583
6584 return (PyObject*) u;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006585
6586 overflow2:
6587 Py_DECREF(u);
6588 overflow1:
6589 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6590 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006591}
6592
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006593PyDoc_STRVAR(find__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006594 "S.find(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006595\n\
6596Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran5e3a19d2011-07-27 23:36:51 +08006597such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006598arguments start and end are interpreted as in slice notation.\n\
6599\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006600Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006601
6602static PyObject *
6603unicode_find(PyUnicodeObject *self, PyObject *args)
6604{
Jesus Cea44e81682011-04-20 16:39:15 +02006605 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006606 Py_ssize_t start;
6607 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006608 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006609
Jesus Cea44e81682011-04-20 16:39:15 +02006610 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
6611 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006613
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006614 result = stringlib_find_slice(
6615 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6616 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6617 start, end
6618 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006619
6620 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006621
6622 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006623}
6624
6625static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006626unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006627{
6628 if (index < 0 || index >= self->length) {
6629 PyErr_SetString(PyExc_IndexError, "string index out of range");
6630 return NULL;
6631 }
6632
6633 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6634}
6635
6636static long
6637unicode_hash(PyUnicodeObject *self)
6638{
Fredrik Lundhdde61642000-07-10 18:27:47 +00006639 /* Since Unicode objects compare equal to their ASCII string
6640 counterparts, they should use the individual character values
6641 as basis for their hash value. This is needed to assure that
6642 strings and Unicode objects behave in the same way as
6643 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006644
Martin v. Löwis18e16552006-02-15 17:27:45 +00006645 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006646 register Py_UNICODE *p;
6647 register long x;
6648
Benjamin Petersonf51c3842012-04-09 14:53:07 -04006649#ifdef Py_DEBUG
Benjamin Peterson26da9202012-02-21 11:08:50 -05006650 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf51c3842012-04-09 14:53:07 -04006651#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006652 if (self->hash != -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006653 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006654 len = PyUnicode_GET_SIZE(self);
Barry Warsaw1e13eb02012-02-20 20:42:21 -05006655 /*
6656 We make the hash of the empty string be 0, rather than using
6657 (prefix ^ suffix), since this slightly obfuscates the hash secret
6658 */
6659 if (len == 0) {
6660 self->hash = 0;
6661 return 0;
6662 }
Fredrik Lundhdde61642000-07-10 18:27:47 +00006663 p = PyUnicode_AS_UNICODE(self);
Barry Warsaw1e13eb02012-02-20 20:42:21 -05006664 x = _Py_HashSecret.prefix;
6665 x ^= *p << 7;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006666 while (--len >= 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006667 x = (1000003*x) ^ *p++;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006668 x ^= PyUnicode_GET_SIZE(self);
Barry Warsaw1e13eb02012-02-20 20:42:21 -05006669 x ^= _Py_HashSecret.suffix;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006670 if (x == -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006671 x = -2;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006672 self->hash = x;
6673 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006674}
6675
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006676PyDoc_STRVAR(index__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006677 "S.index(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006678\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006679Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006680
6681static PyObject *
6682unicode_index(PyUnicodeObject *self, PyObject *args)
6683{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006684 Py_ssize_t result;
Jesus Cea44e81682011-04-20 16:39:15 +02006685 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006686 Py_ssize_t start;
6687 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006688
Jesus Cea44e81682011-04-20 16:39:15 +02006689 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
6690 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006691 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006692
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006693 result = stringlib_find_slice(
6694 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6695 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6696 start, end
6697 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006698
6699 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006700
Guido van Rossumd57fd912000-03-10 22:53:23 +00006701 if (result < 0) {
6702 PyErr_SetString(PyExc_ValueError, "substring not found");
6703 return NULL;
6704 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006705
Martin v. Löwis18e16552006-02-15 17:27:45 +00006706 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006707}
6708
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006709PyDoc_STRVAR(islower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006710 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006711\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006712Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006713at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006714
6715static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006716unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006717{
6718 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6719 register const Py_UNICODE *e;
6720 int cased;
6721
Guido van Rossumd57fd912000-03-10 22:53:23 +00006722 /* Shortcut for single character strings */
6723 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006724 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006725
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006726 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006727 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006728 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006729
Guido van Rossumd57fd912000-03-10 22:53:23 +00006730 e = p + PyUnicode_GET_SIZE(self);
6731 cased = 0;
6732 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006733 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006734
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006735 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6736 return PyBool_FromLong(0);
6737 else if (!cased && Py_UNICODE_ISLOWER(ch))
6738 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006739 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006740 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006741}
6742
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006743PyDoc_STRVAR(isupper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006744 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006745\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006746Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006747at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006748
6749static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006750unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006751{
6752 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6753 register const Py_UNICODE *e;
6754 int cased;
6755
Guido van Rossumd57fd912000-03-10 22:53:23 +00006756 /* Shortcut for single character strings */
6757 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006758 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006759
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006760 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006761 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006762 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006763
Guido van Rossumd57fd912000-03-10 22:53:23 +00006764 e = p + PyUnicode_GET_SIZE(self);
6765 cased = 0;
6766 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006767 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006768
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006769 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6770 return PyBool_FromLong(0);
6771 else if (!cased && Py_UNICODE_ISUPPER(ch))
6772 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006773 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006774 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006775}
6776
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006777PyDoc_STRVAR(istitle__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006778 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006779\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006780Return True if S is a titlecased string and there is at least one\n\
6781character in S, i.e. upper- and titlecase characters may only\n\
6782follow uncased characters and lowercase characters only cased ones.\n\
6783Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006784
6785static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006786unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006787{
6788 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6789 register const Py_UNICODE *e;
6790 int cased, previous_is_cased;
6791
Guido van Rossumd57fd912000-03-10 22:53:23 +00006792 /* Shortcut for single character strings */
6793 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006794 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6795 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006796
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006797 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006798 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006799 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006800
Guido van Rossumd57fd912000-03-10 22:53:23 +00006801 e = p + PyUnicode_GET_SIZE(self);
6802 cased = 0;
6803 previous_is_cased = 0;
6804 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006805 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006806
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006807 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6808 if (previous_is_cased)
6809 return PyBool_FromLong(0);
6810 previous_is_cased = 1;
6811 cased = 1;
6812 }
6813 else if (Py_UNICODE_ISLOWER(ch)) {
6814 if (!previous_is_cased)
6815 return PyBool_FromLong(0);
6816 previous_is_cased = 1;
6817 cased = 1;
6818 }
6819 else
6820 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006821 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006822 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006823}
6824
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006825PyDoc_STRVAR(isspace__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006826 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006827\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006828Return True if all characters in S are whitespace\n\
6829and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006830
6831static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006832unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006833{
6834 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6835 register const Py_UNICODE *e;
6836
Guido van Rossumd57fd912000-03-10 22:53:23 +00006837 /* Shortcut for single character strings */
6838 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006839 Py_UNICODE_ISSPACE(*p))
6840 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006841
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006842 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006843 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006844 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006845
Guido van Rossumd57fd912000-03-10 22:53:23 +00006846 e = p + PyUnicode_GET_SIZE(self);
6847 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006848 if (!Py_UNICODE_ISSPACE(*p))
6849 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006850 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006851 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006852}
6853
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006854PyDoc_STRVAR(isalpha__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006855 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006856\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006857Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006858and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006859
6860static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006861unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006862{
6863 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6864 register const Py_UNICODE *e;
6865
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006866 /* Shortcut for single character strings */
6867 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006868 Py_UNICODE_ISALPHA(*p))
6869 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006870
6871 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006872 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006873 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006874
6875 e = p + PyUnicode_GET_SIZE(self);
6876 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006877 if (!Py_UNICODE_ISALPHA(*p))
6878 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006879 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006880 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006881}
6882
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006883PyDoc_STRVAR(isalnum__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006884 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006885\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006886Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006887and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006888
6889static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006890unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006891{
6892 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6893 register const Py_UNICODE *e;
6894
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006895 /* Shortcut for single character strings */
6896 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006897 Py_UNICODE_ISALNUM(*p))
6898 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006899
6900 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006901 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006902 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006903
6904 e = p + PyUnicode_GET_SIZE(self);
6905 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006906 if (!Py_UNICODE_ISALNUM(*p))
6907 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006908 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006909 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006910}
6911
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006912PyDoc_STRVAR(isdecimal__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006913 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006914\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006915Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006916False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006917
6918static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006919unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006920{
6921 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6922 register const Py_UNICODE *e;
6923
Guido van Rossumd57fd912000-03-10 22:53:23 +00006924 /* Shortcut for single character strings */
6925 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006926 Py_UNICODE_ISDECIMAL(*p))
6927 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006928
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006929 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006930 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006931 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006932
Guido van Rossumd57fd912000-03-10 22:53:23 +00006933 e = p + PyUnicode_GET_SIZE(self);
6934 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006935 if (!Py_UNICODE_ISDECIMAL(*p))
6936 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006937 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006938 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006939}
6940
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006941PyDoc_STRVAR(isdigit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006942 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006943\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006944Return True if all characters in S are digits\n\
6945and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006946
6947static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006948unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006949{
6950 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6951 register const Py_UNICODE *e;
6952
Guido van Rossumd57fd912000-03-10 22:53:23 +00006953 /* Shortcut for single character strings */
6954 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006955 Py_UNICODE_ISDIGIT(*p))
6956 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006957
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006958 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006959 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006960 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006961
Guido van Rossumd57fd912000-03-10 22:53:23 +00006962 e = p + PyUnicode_GET_SIZE(self);
6963 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006964 if (!Py_UNICODE_ISDIGIT(*p))
6965 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006966 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006967 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006968}
6969
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006970PyDoc_STRVAR(isnumeric__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006971 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006972\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006973Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006974False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006975
6976static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006977unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006978{
6979 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6980 register const Py_UNICODE *e;
6981
Guido van Rossumd57fd912000-03-10 22:53:23 +00006982 /* Shortcut for single character strings */
6983 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006984 Py_UNICODE_ISNUMERIC(*p))
6985 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006986
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006987 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006988 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006989 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006990
Guido van Rossumd57fd912000-03-10 22:53:23 +00006991 e = p + PyUnicode_GET_SIZE(self);
6992 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006993 if (!Py_UNICODE_ISNUMERIC(*p))
6994 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006995 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006996 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006997}
6998
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006999PyDoc_STRVAR(join__doc__,
Georg Brandl9b4e5822009-10-14 18:48:32 +00007000 "S.join(iterable) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007001\n\
7002Return a string which is the concatenation of the strings in the\n\
Georg Brandl9b4e5822009-10-14 18:48:32 +00007003iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007004
7005static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007006unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007007{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007008 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007009}
7010
Martin v. Löwis18e16552006-02-15 17:27:45 +00007011static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007012unicode_length(PyUnicodeObject *self)
7013{
7014 return self->length;
7015}
7016
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007017PyDoc_STRVAR(ljust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007018 "S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007019\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007020Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007021done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007022
7023static PyObject *
7024unicode_ljust(PyUnicodeObject *self, PyObject *args)
7025{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007026 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007027 Py_UNICODE fillchar = ' ';
7028
Martin v. Löwis412fb672006-04-13 06:34:32 +00007029 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007030 return NULL;
7031
Tim Peters7a29bd52001-09-12 03:03:31 +00007032 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007033 Py_INCREF(self);
7034 return (PyObject*) self;
7035 }
7036
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007037 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007038}
7039
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007040PyDoc_STRVAR(lower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007041 "S.lower() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007042\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007043Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007044
7045static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007046unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007047{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007048 return fixup(self, fixlower);
7049}
7050
/* Strip modes; they double as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the strip mode above.
   Both the characters and the table entries are read-only. */
static const char *const stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: the format string with its leading
   "|O:" (3 characters) skipped. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
7059
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007060/* externally visible for str.strip(unicode) */
7061PyObject *
7062_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7063{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007064 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7065 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7066 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7067 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7068 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007069
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007070 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007071
Benjamin Peterson857ce152009-01-31 16:29:18 +00007072 i = 0;
7073 if (striptype != RIGHTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007074 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7075 i++;
7076 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00007077 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007078
Benjamin Peterson857ce152009-01-31 16:29:18 +00007079 j = len;
7080 if (striptype != LEFTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007081 do {
7082 j--;
7083 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7084 j++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007085 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007086
Benjamin Peterson857ce152009-01-31 16:29:18 +00007087 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007088 Py_INCREF(self);
7089 return (PyObject*)self;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007090 }
7091 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007092 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007093}
7094
Guido van Rossumd57fd912000-03-10 22:53:23 +00007095
7096static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007097do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007098{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007099 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7100 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007101
Benjamin Peterson857ce152009-01-31 16:29:18 +00007102 i = 0;
7103 if (striptype != RIGHTSTRIP) {
7104 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7105 i++;
7106 }
7107 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007108
Benjamin Peterson857ce152009-01-31 16:29:18 +00007109 j = len;
7110 if (striptype != LEFTSTRIP) {
7111 do {
7112 j--;
7113 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7114 j++;
7115 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007116
Benjamin Peterson857ce152009-01-31 16:29:18 +00007117 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7118 Py_INCREF(self);
7119 return (PyObject*)self;
7120 }
7121 else
7122 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007123}
7124
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007125
7126static PyObject *
7127do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7128{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007129 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007130
Benjamin Peterson857ce152009-01-31 16:29:18 +00007131 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7132 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007133
Benjamin Peterson857ce152009-01-31 16:29:18 +00007134 if (sep != NULL && sep != Py_None) {
7135 if (PyUnicode_Check(sep))
7136 return _PyUnicode_XStrip(self, striptype, sep);
7137 else if (PyString_Check(sep)) {
7138 PyObject *res;
7139 sep = PyUnicode_FromObject(sep);
7140 if (sep==NULL)
7141 return NULL;
7142 res = _PyUnicode_XStrip(self, striptype, sep);
7143 Py_DECREF(sep);
7144 return res;
7145 }
7146 else {
7147 PyErr_Format(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007148 "%s arg must be None, unicode or str",
7149 STRIPNAME(striptype));
Benjamin Peterson857ce152009-01-31 16:29:18 +00007150 return NULL;
7151 }
7152 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007153
Benjamin Peterson857ce152009-01-31 16:29:18 +00007154 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007155}
7156
7157
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007158PyDoc_STRVAR(strip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007159 "S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007160\n\
7161Return a copy of the string S with leading and trailing\n\
7162whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007163If chars is given and not None, remove characters in chars instead.\n\
7164If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007165
7166static PyObject *
7167unicode_strip(PyUnicodeObject *self, PyObject *args)
7168{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007169 if (PyTuple_GET_SIZE(args) == 0)
7170 return do_strip(self, BOTHSTRIP); /* Common case */
7171 else
7172 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007173}
7174
7175
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007176PyDoc_STRVAR(lstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007177 "S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007178\n\
7179Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007180If chars is given and not None, remove characters in chars instead.\n\
7181If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007182
7183static PyObject *
7184unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7185{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007186 if (PyTuple_GET_SIZE(args) == 0)
7187 return do_strip(self, LEFTSTRIP); /* Common case */
7188 else
7189 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007190}
7191
7192
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007193PyDoc_STRVAR(rstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007194 "S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007195\n\
7196Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007197If chars is given and not None, remove characters in chars instead.\n\
7198If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007199
7200static PyObject *
7201unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7202{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007203 if (PyTuple_GET_SIZE(args) == 0)
7204 return do_strip(self, RIGHTSTRIP); /* Common case */
7205 else
7206 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007207}
7208
7209
Guido van Rossumd57fd912000-03-10 22:53:23 +00007210static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007211unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007212{
7213 PyUnicodeObject *u;
7214 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007215 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007216 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007217
7218 if (len < 0)
7219 len = 0;
7220
Tim Peters7a29bd52001-09-12 03:03:31 +00007221 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007222 /* no repeat, return original string */
7223 Py_INCREF(str);
7224 return (PyObject*) str;
7225 }
Tim Peters8f422462000-09-09 06:13:41 +00007226
Serhiy Storchaka373773d2016-07-12 15:46:57 +03007227 /* ensure # of chars needed doesn't overflow Py_ssize_t and # of bytes
Tim Peters8f422462000-09-09 06:13:41 +00007228 * needed doesn't overflow size_t
7229 */
Serhiy Storchaka373773d2016-07-12 15:46:57 +03007230 if (len && str->length > PY_SSIZE_T_MAX / len) {
Tim Peters8f422462000-09-09 06:13:41 +00007231 PyErr_SetString(PyExc_OverflowError,
7232 "repeated string is too long");
7233 return NULL;
7234 }
Serhiy Storchaka373773d2016-07-12 15:46:57 +03007235 nchars = len * str->length;
7236 nbytes = ((size_t)nchars + 1u) * sizeof(Py_UNICODE);
7237 if (nbytes / sizeof(Py_UNICODE) != ((size_t)nchars + 1u)) {
Tim Peters8f422462000-09-09 06:13:41 +00007238 PyErr_SetString(PyExc_OverflowError,
7239 "repeated string is too long");
7240 return NULL;
7241 }
7242 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007243 if (!u)
7244 return NULL;
7245
7246 p = u->str;
7247
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007248 if (str->length == 1 && len > 0) {
7249 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007250 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007251 Py_ssize_t done = 0; /* number of characters copied this far */
7252 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007253 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007254 done = str->length;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007255 }
7256 while (done < nchars) {
Neal Norwitz4677fbf72008-03-25 04:18:18 +00007257 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007258 Py_UNICODE_COPY(p+done, p, n);
7259 done += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007260 }
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007261 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007262
7263 return (PyObject*) u;
7264}
7265
7266PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007267 PyObject *subobj,
7268 PyObject *replobj,
7269 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007270{
7271 PyObject *self;
7272 PyObject *str1;
7273 PyObject *str2;
7274 PyObject *result;
7275
7276 self = PyUnicode_FromObject(obj);
7277 if (self == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007278 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007279 str1 = PyUnicode_FromObject(subobj);
7280 if (str1 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007281 Py_DECREF(self);
7282 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007283 }
7284 str2 = PyUnicode_FromObject(replobj);
7285 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007286 Py_DECREF(self);
7287 Py_DECREF(str1);
7288 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007289 }
Tim Petersced69f82003-09-16 20:30:58 +00007290 result = replace((PyUnicodeObject *)self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007291 (PyUnicodeObject *)str1,
7292 (PyUnicodeObject *)str2,
7293 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007294 Py_DECREF(self);
7295 Py_DECREF(str1);
7296 Py_DECREF(str2);
7297 return result;
7298}
7299
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007300PyDoc_STRVAR(replace__doc__,
Ezio Melotti2f06b782010-06-26 18:44:42 +00007301 "S.replace(old, new[, count]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007302\n\
7303Return a copy of S with all occurrences of substring\n\
Georg Brandl30fadc12008-05-30 07:54:16 +00007304old replaced by new. If the optional argument count is\n\
7305given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007306
7307static PyObject*
7308unicode_replace(PyUnicodeObject *self, PyObject *args)
7309{
7310 PyUnicodeObject *str1;
7311 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007312 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007313 PyObject *result;
7314
Martin v. Löwis18e16552006-02-15 17:27:45 +00007315 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007316 return NULL;
7317 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7318 if (str1 == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007319 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007320 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007321 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007322 Py_DECREF(str1);
7323 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007324 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007325
7326 result = replace(self, str1, str2, maxcount);
7327
7328 Py_DECREF(str1);
7329 Py_DECREF(str2);
7330 return result;
7331}
7332
7333static
7334PyObject *unicode_repr(PyObject *unicode)
7335{
7336 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007337 PyUnicode_GET_SIZE(unicode),
7338 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007339}
7340
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007341PyDoc_STRVAR(rfind__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007342 "S.rfind(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007343\n\
7344Return the highest index in S where substring sub is found,\n\
Senthil Kumaran5e3a19d2011-07-27 23:36:51 +08007345such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007346arguments start and end are interpreted as in slice notation.\n\
7347\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007348Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007349
7350static PyObject *
7351unicode_rfind(PyUnicodeObject *self, PyObject *args)
7352{
Jesus Cea44e81682011-04-20 16:39:15 +02007353 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007354 Py_ssize_t start;
7355 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007356 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007357
Jesus Cea44e81682011-04-20 16:39:15 +02007358 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
7359 &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007360 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007361
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007362 result = stringlib_rfind_slice(
7363 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7364 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7365 start, end
7366 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007367
7368 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007369
7370 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007371}
7372
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007373PyDoc_STRVAR(rindex__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007374 "S.rindex(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007375\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007376Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007377
7378static PyObject *
7379unicode_rindex(PyUnicodeObject *self, PyObject *args)
7380{
Jesus Cea44e81682011-04-20 16:39:15 +02007381 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007382 Py_ssize_t start;
7383 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007384 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007385
Jesus Cea44e81682011-04-20 16:39:15 +02007386 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
7387 &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007388 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007389
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007390 result = stringlib_rfind_slice(
7391 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7392 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7393 start, end
7394 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007395
7396 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007397
Guido van Rossumd57fd912000-03-10 22:53:23 +00007398 if (result < 0) {
7399 PyErr_SetString(PyExc_ValueError, "substring not found");
7400 return NULL;
7401 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007402 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007403}
7404
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007405PyDoc_STRVAR(rjust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007406 "S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007407\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007408Return S right-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007409done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007410
7411static PyObject *
7412unicode_rjust(PyUnicodeObject *self, PyObject *args)
7413{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007414 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007415 Py_UNICODE fillchar = ' ';
7416
Martin v. Löwis412fb672006-04-13 06:34:32 +00007417 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007418 return NULL;
7419
Tim Peters7a29bd52001-09-12 03:03:31 +00007420 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007421 Py_INCREF(self);
7422 return (PyObject*) self;
7423 }
7424
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007425 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007426}
7427
Guido van Rossumd57fd912000-03-10 22:53:23 +00007428static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007429unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007430{
7431 /* standard clamping */
7432 if (start < 0)
7433 start = 0;
7434 if (end < 0)
7435 end = 0;
7436 if (end > self->length)
7437 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007438 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007439 /* full slice, return original string */
7440 Py_INCREF(self);
7441 return (PyObject*) self;
7442 }
7443 if (start > end)
7444 start = end;
7445 /* copy slice */
7446 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007447 end - start);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007448}
7449
7450PyObject *PyUnicode_Split(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007451 PyObject *sep,
7452 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007453{
7454 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007455
Guido van Rossumd57fd912000-03-10 22:53:23 +00007456 s = PyUnicode_FromObject(s);
7457 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007458 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007459 if (sep != NULL) {
7460 sep = PyUnicode_FromObject(sep);
7461 if (sep == NULL) {
7462 Py_DECREF(s);
7463 return NULL;
7464 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007465 }
7466
7467 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7468
7469 Py_DECREF(s);
7470 Py_XDECREF(sep);
7471 return result;
7472}
7473
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007474PyDoc_STRVAR(split__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007475 "S.split([sep [,maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007476\n\
7477Return a list of the words in S, using sep as the\n\
7478delimiter string. If maxsplit is given, at most maxsplit\n\
Georg Brandldfb77db2008-05-11 09:11:40 +00007479splits are done. If sep is not specified or is None, any\n\
Georg Brandlecbbd942008-05-11 20:53:55 +00007480whitespace string is a separator and empty strings are\n\
7481removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007482
7483static PyObject*
7484unicode_split(PyUnicodeObject *self, PyObject *args)
7485{
7486 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007487 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007488
Martin v. Löwis18e16552006-02-15 17:27:45 +00007489 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007490 return NULL;
7491
7492 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007493 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007494 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007495 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007496 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007497 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007498}
7499
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007500PyObject *
7501PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7502{
7503 PyObject* str_obj;
7504 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007505 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00007506
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007507 str_obj = PyUnicode_FromObject(str_in);
7508 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007509 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007510 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00007511 if (!sep_obj) {
7512 Py_DECREF(str_obj);
7513 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007514 }
7515
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007516 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00007517 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7518 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7519 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007520
Fredrik Lundhb9479482006-05-26 17:22:38 +00007521 Py_DECREF(sep_obj);
7522 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007523
7524 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007525}
7526
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007527
7528PyObject *
7529PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7530{
7531 PyObject* str_obj;
7532 PyObject* sep_obj;
7533 PyObject* out;
7534
7535 str_obj = PyUnicode_FromObject(str_in);
7536 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007537 return NULL;
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007538 sep_obj = PyUnicode_FromObject(sep_in);
7539 if (!sep_obj) {
7540 Py_DECREF(str_obj);
7541 return NULL;
7542 }
7543
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007544 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007545 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7546 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7547 );
7548
7549 Py_DECREF(sep_obj);
7550 Py_DECREF(str_obj);
7551
7552 return out;
7553}
7554
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007555PyDoc_STRVAR(partition__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007556 "S.partition(sep) -> (head, sep, tail)\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007557\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007558Search for the separator sep in S, and return the part before it,\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007559the separator itself, and the part after it. If the separator is not\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007560found, return S and two empty strings.");
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007561
7562static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00007563unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007564{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007565 return PyUnicode_Partition((PyObject *)self, separator);
7566}
7567
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007568PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti1fafaab2010-01-25 11:24:37 +00007569 "S.rpartition(sep) -> (head, sep, tail)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007570\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007571Search for the separator sep in S, starting at the end of S, and return\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007572the part before it, the separator itself, and the part after it. If the\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007573separator is not found, return two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007574
7575static PyObject*
7576unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7577{
7578 return PyUnicode_RPartition((PyObject *)self, separator);
7579}
7580
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007581PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007582 PyObject *sep,
7583 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007584{
7585 PyObject *result;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007586
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007587 s = PyUnicode_FromObject(s);
7588 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007589 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007590 if (sep != NULL) {
7591 sep = PyUnicode_FromObject(sep);
7592 if (sep == NULL) {
7593 Py_DECREF(s);
7594 return NULL;
7595 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007596 }
7597
7598 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7599
7600 Py_DECREF(s);
7601 Py_XDECREF(sep);
7602 return result;
7603}
7604
7605PyDoc_STRVAR(rsplit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007606 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007607\n\
7608Return a list of the words in S, using sep as the\n\
7609delimiter string, starting at the end of the string and\n\
7610working to the front. If maxsplit is given, at most maxsplit\n\
7611splits are done. If sep is not specified, any whitespace string\n\
7612is a separator.");
7613
7614static PyObject*
7615unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7616{
7617 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007618 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007619
Martin v. Löwis18e16552006-02-15 17:27:45 +00007620 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007621 return NULL;
7622
7623 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007624 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007625 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007626 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007627 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007628 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007629}
7630
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007631PyDoc_STRVAR(splitlines__doc__,
Raymond Hettingeraad5b022012-06-02 01:42:58 -04007632 "S.splitlines(keepends=False) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007633\n\
7634Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007635Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007636is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007637
7638static PyObject*
7639unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7640{
Guido van Rossum86662912000-04-11 15:38:46 +00007641 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007642
Guido van Rossum86662912000-04-11 15:38:46 +00007643 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007644 return NULL;
7645
Guido van Rossum86662912000-04-11 15:38:46 +00007646 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007647}
7648
7649static
7650PyObject *unicode_str(PyUnicodeObject *self)
7651{
Fred Drakee4315f52000-05-09 19:53:39 +00007652 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007653}
7654
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007655PyDoc_STRVAR(swapcase__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007656 "S.swapcase() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007657\n\
7658Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007659and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007660
7661static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007662unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007663{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007664 return fixup(self, fixswapcase);
7665}
7666
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007667PyDoc_STRVAR(translate__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007668 "S.translate(table) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007669\n\
7670Return a copy of the string S, where all characters have been mapped\n\
7671through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007672Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7673Unmapped characters are left untouched. Characters mapped to None\n\
7674are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007675
7676static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007677unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007678{
Tim Petersced69f82003-09-16 20:30:58 +00007679 return PyUnicode_TranslateCharmap(self->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007680 self->length,
7681 table,
7682 "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007683}
7684
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007685PyDoc_STRVAR(upper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007686 "S.upper() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007687\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007688Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007689
7690static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007691unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007692{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007693 return fixup(self, fixupper);
7694}
7695
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007696PyDoc_STRVAR(zfill__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007697 "S.zfill(width) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007698\n\
Georg Brandl98064072008-09-09 19:26:00 +00007699Pad a numeric string S with zeros on the left, to fill a field\n\
7700of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007701
7702static PyObject *
7703unicode_zfill(PyUnicodeObject *self, PyObject *args)
7704{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007705 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007706 PyUnicodeObject *u;
7707
Martin v. Löwis18e16552006-02-15 17:27:45 +00007708 Py_ssize_t width;
7709 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007710 return NULL;
7711
7712 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007713 if (PyUnicode_CheckExact(self)) {
7714 Py_INCREF(self);
7715 return (PyObject*) self;
7716 }
7717 else
7718 return PyUnicode_FromUnicode(
7719 PyUnicode_AS_UNICODE(self),
7720 PyUnicode_GET_SIZE(self)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007721 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007722 }
7723
7724 fill = width - self->length;
7725
7726 u = pad(self, fill, 0, '0');
7727
Walter Dörwald068325e2002-04-15 13:36:47 +00007728 if (u == NULL)
7729 return NULL;
7730
Guido van Rossumd57fd912000-03-10 22:53:23 +00007731 if (u->str[fill] == '+' || u->str[fill] == '-') {
7732 /* move sign to beginning of string */
7733 u->str[0] = u->str[fill];
7734 u->str[fill] = '0';
7735 }
7736
7737 return (PyObject*) u;
7738}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007739
#if 0
/* Compiled out: returns the file-level `numfree` counter as a Python int. */
static PyObject *
free_listsize(PyUnicodeObject *self)
{
    PyObject *count = PyInt_FromLong(numfree);
    return count;
}
#endif
7747
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007748PyDoc_STRVAR(startswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007749 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007750\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007751Return True if S starts with the specified prefix, False otherwise.\n\
7752With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007753With optional end, stop comparing S at that position.\n\
7754prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007755
7756static PyObject *
7757unicode_startswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007758 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007759{
Georg Brandl24250812006-06-09 18:45:48 +00007760 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007761 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007762 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007763 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007764 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007765
Jesus Cea44e81682011-04-20 16:39:15 +02007766 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007767 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007768 if (PyTuple_Check(subobj)) {
7769 Py_ssize_t i;
7770 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7771 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007772 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007773 if (substring == NULL)
7774 return NULL;
7775 result = tailmatch(self, substring, start, end, -1);
7776 Py_DECREF(substring);
7777 if (result) {
7778 Py_RETURN_TRUE;
7779 }
7780 }
7781 /* nothing matched */
7782 Py_RETURN_FALSE;
7783 }
7784 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottie3685f62011-04-26 05:12:51 +03007785 if (substring == NULL) {
7786 if (PyErr_ExceptionMatches(PyExc_TypeError))
7787 PyErr_Format(PyExc_TypeError, "startswith first arg must be str, "
7788 "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007789 return NULL;
Ezio Melottie3685f62011-04-26 05:12:51 +03007790 }
Georg Brandl24250812006-06-09 18:45:48 +00007791 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007792 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007793 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007794}
7795
7796
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007797PyDoc_STRVAR(endswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007798 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007799\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007800Return True if S ends with the specified suffix, False otherwise.\n\
7801With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007802With optional end, stop comparing S at that position.\n\
7803suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007804
7805static PyObject *
7806unicode_endswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007807 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007808{
Georg Brandl24250812006-06-09 18:45:48 +00007809 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007810 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007811 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007812 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007813 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007814
Jesus Cea44e81682011-04-20 16:39:15 +02007815 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007816 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007817 if (PyTuple_Check(subobj)) {
7818 Py_ssize_t i;
7819 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7820 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007821 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007822 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007823 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007824 result = tailmatch(self, substring, start, end, +1);
7825 Py_DECREF(substring);
7826 if (result) {
7827 Py_RETURN_TRUE;
7828 }
7829 }
7830 Py_RETURN_FALSE;
7831 }
7832 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottie3685f62011-04-26 05:12:51 +03007833 if (substring == NULL) {
7834 if (PyErr_ExceptionMatches(PyExc_TypeError))
7835 PyErr_Format(PyExc_TypeError, "endswith first arg must be str, "
7836 "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007837 return NULL;
Ezio Melottie3685f62011-04-26 05:12:51 +03007838 }
Georg Brandl24250812006-06-09 18:45:48 +00007839 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007840 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007841 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007842}
7843
7844
Eric Smitha9f7d622008-02-17 19:46:49 +00007845/* Implements do_string_format, which is unicode because of stringlib */
7846#include "stringlib/string_format.h"
7847
7848PyDoc_STRVAR(format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007849 "S.format(*args, **kwargs) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007850\n\
Eric Smith6c840852010-11-06 19:43:44 +00007851Return a formatted version of S, using substitutions from args and kwargs.\n\
7852The substitutions are identified by braces ('{' and '}').");
Eric Smitha9f7d622008-02-17 19:46:49 +00007853
Eric Smithdc13b792008-05-30 18:10:04 +00007854static PyObject *
7855unicode__format__(PyObject *self, PyObject *args)
7856{
7857 PyObject *format_spec;
7858 PyObject *result = NULL;
7859 PyObject *tmp = NULL;
7860
7861 /* If 2.x, convert format_spec to the same type as value */
7862 /* This is to allow things like u''.format('') */
7863 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7864 goto done;
7865 if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7866 PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007867 "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
Eric Smithdc13b792008-05-30 18:10:04 +00007868 goto done;
7869 }
7870 tmp = PyObject_Unicode(format_spec);
7871 if (tmp == NULL)
7872 goto done;
7873 format_spec = tmp;
7874
7875 result = _PyUnicode_FormatAdvanced(self,
7876 PyUnicode_AS_UNICODE(format_spec),
7877 PyUnicode_GET_SIZE(format_spec));
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007878 done:
Eric Smithdc13b792008-05-30 18:10:04 +00007879 Py_XDECREF(tmp);
7880 return result;
7881}
7882
Eric Smitha9f7d622008-02-17 19:46:49 +00007883PyDoc_STRVAR(p_format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007884 "S.__format__(format_spec) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007885\n\
Eric Smith6c840852010-11-06 19:43:44 +00007886Return a formatted version of S as described by format_spec.");
Eric Smitha9f7d622008-02-17 19:46:49 +00007887
Robert Schuppenies901c9972008-06-10 10:10:31 +00007888static PyObject *
7889unicode__sizeof__(PyUnicodeObject *v)
7890{
Robert Schuppenies9be2ec12008-07-10 15:24:04 +00007891 return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7892 sizeof(Py_UNICODE) * (v->length + 1));
Robert Schuppenies901c9972008-06-10 10:10:31 +00007893}
7894
7895PyDoc_STRVAR(sizeof__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007896 "S.__sizeof__() -> size of S in memory, in bytes\n\
Robert Schuppenies901c9972008-06-10 10:10:31 +00007897\n\
7898");
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007899
7900static PyObject *
7901unicode_getnewargs(PyUnicodeObject *v)
7902{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007903 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007904}
7905
7906
Guido van Rossumd57fd912000-03-10 22:53:23 +00007907static PyMethodDef unicode_methods[] = {
Benjamin Peterson332d7212009-09-18 21:14:55 +00007908 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007909 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7910 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007911 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007912 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7913 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7914 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7915 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7916 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7917 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7918 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00007919 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007920 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7921 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7922 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007923 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Benjamin Peterson332d7212009-09-18 21:14:55 +00007924 {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007925/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7926 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7927 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7928 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007929 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007930 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007931 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007932 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007933 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7934 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7935 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7936 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7937 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7938 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7939 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7940 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7941 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7942 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7943 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7944 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7945 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7946 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007947 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smitha9f7d622008-02-17 19:46:49 +00007948 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
7949 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
7950 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
7951 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Robert Schuppenies901c9972008-06-10 10:10:31 +00007952 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007953#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007954 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007955#endif
7956
7957#if 0
7958 /* This one is just used for debugging the implementation. */
Christian Heimes5b970ad2008-02-06 13:33:44 +00007959 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007960#endif
7961
Benjamin Peterson857ce152009-01-31 16:29:18 +00007962 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007963 {NULL, NULL}
7964};
7965
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007966static PyObject *
7967unicode_mod(PyObject *v, PyObject *w)
7968{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007969 if (!PyUnicode_Check(v)) {
7970 Py_INCREF(Py_NotImplemented);
7971 return Py_NotImplemented;
7972 }
7973 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007974}
7975
7976static PyNumberMethods unicode_as_number = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007977 0, /*nb_add*/
7978 0, /*nb_subtract*/
7979 0, /*nb_multiply*/
7980 0, /*nb_divide*/
7981 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007982};
7983
Guido van Rossumd57fd912000-03-10 22:53:23 +00007984static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007985 (lenfunc) unicode_length, /* sq_length */
7986 PyUnicode_Concat, /* sq_concat */
7987 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7988 (ssizeargfunc) unicode_getitem, /* sq_item */
7989 (ssizessizeargfunc) unicode_slice, /* sq_slice */
7990 0, /* sq_ass_item */
7991 0, /* sq_ass_slice */
7992 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007993};
7994
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007995static PyObject*
7996unicode_subscript(PyUnicodeObject* self, PyObject* item)
7997{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00007998 if (PyIndex_Check(item)) {
7999 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008000 if (i == -1 && PyErr_Occurred())
8001 return NULL;
8002 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008003 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008004 return unicode_getitem(self, i);
8005 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008006 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008007 Py_UNICODE* source_buf;
8008 Py_UNICODE* result_buf;
8009 PyObject* result;
8010
Serhiy Storchaka5e793212017-04-15 20:11:12 +03008011 if (_PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008012 return NULL;
8013 }
Serhiy Storchakae41390a2017-04-08 11:48:57 +03008014 slicelength = _PySlice_AdjustIndices(PyUnicode_GET_SIZE(self), &start,
8015 &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008016
8017 if (slicelength <= 0) {
8018 return PyUnicode_FromUnicode(NULL, 0);
Thomas Wouters3ccec682007-08-28 15:28:19 +00008019 } else if (start == 0 && step == 1 && slicelength == self->length &&
8020 PyUnicode_CheckExact(self)) {
8021 Py_INCREF(self);
8022 return (PyObject *)self;
8023 } else if (step == 1) {
8024 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008025 } else {
8026 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Neal Norwitz419fd492008-03-17 20:22:43 +00008027 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8028 sizeof(Py_UNICODE));
Benjamin Peterson857ce152009-01-31 16:29:18 +00008029
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008030 if (result_buf == NULL)
8031 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008032
8033 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8034 result_buf[i] = source_buf[cur];
8035 }
Tim Petersced69f82003-09-16 20:30:58 +00008036
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008037 result = PyUnicode_FromUnicode(result_buf, slicelength);
Neal Norwitz419fd492008-03-17 20:22:43 +00008038 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008039 return result;
8040 }
8041 } else {
8042 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8043 return NULL;
8044 }
8045}
8046
8047static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00008048 (lenfunc)unicode_length, /* mp_length */
8049 (binaryfunc)unicode_subscript, /* mp_subscript */
8050 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008051};
8052
Martin v. Löwis18e16552006-02-15 17:27:45 +00008053static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008054unicode_buffer_getreadbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008055 Py_ssize_t index,
8056 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008057{
8058 if (index != 0) {
8059 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008060 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008061 return -1;
8062 }
8063 *ptr = (void *) self->str;
8064 return PyUnicode_GET_DATA_SIZE(self);
8065}
8066
Martin v. Löwis18e16552006-02-15 17:27:45 +00008067static Py_ssize_t
8068unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008069 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008070{
8071 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008072 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008073 return -1;
8074}
8075
8076static int
8077unicode_buffer_getsegcount(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008078 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008079{
8080 if (lenp)
8081 *lenp = PyUnicode_GET_DATA_SIZE(self);
8082 return 1;
8083}
8084
Martin v. Löwiseb079f12006-02-16 14:32:27 +00008085static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008086unicode_buffer_getcharbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008087 Py_ssize_t index,
8088 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008089{
8090 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00008091
Guido van Rossumd57fd912000-03-10 22:53:23 +00008092 if (index != 0) {
8093 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008094 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008095 return -1;
8096 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008097 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008098 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008099 return -1;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008100 *ptr = (void *) PyString_AS_STRING(str);
8101 return PyString_GET_SIZE(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008102}
8103
8104/* Helpers for PyUnicode_Format() */
8105
8106static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008107getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008108{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008109 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008110 if (argidx < arglen) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008111 (*p_argidx)++;
8112 if (arglen < 0)
8113 return args;
8114 else
8115 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008116 }
8117 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008118 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008119 return NULL;
8120}
8121
/* Bit flags passed as the `flags` argument of the format helpers. */
#define F_LJUST 0x01
#define F_SIGN  0x02
#define F_BLANK 0x04
#define F_ALT   0x08
#define F_ZERO  0x10
Guido van Rossumd57fd912000-03-10 22:53:23 +00008127
Martin v. Löwis18e16552006-02-15 17:27:45 +00008128static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008129strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008130{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008131 register Py_ssize_t i;
8132 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008133 for (i = len - 1; i >= 0; i--)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008134 buffer[i] = (Py_UNICODE) charbuffer[i];
Guido van Rossumd57fd912000-03-10 22:53:23 +00008135
Guido van Rossumd57fd912000-03-10 22:53:23 +00008136 return len;
8137}
8138
Neal Norwitzfc76d632006-01-10 06:03:13 +00008139static int
Neal Norwitzfc76d632006-01-10 06:03:13 +00008140longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8141{
Tim Peters15231542006-02-16 01:08:01 +00008142 Py_ssize_t result;
8143
Neal Norwitzfc76d632006-01-10 06:03:13 +00008144 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008145 result = strtounicode(buffer, (char *)buffer);
8146 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008147}
8148
Guido van Rossum078151d2002-08-11 04:24:12 +00008149/* XXX To save some code duplication, formatfloat/long/int could have been
8150 shared with stringobject.c, converting from 8-bit to Unicode after the
8151 formatting is done. */
8152
Mark Dickinson18cfada2009-11-23 18:46:41 +00008153/* Returns a new reference to a PyUnicode object, or NULL on failure. */
8154
8155static PyObject *
8156formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008157{
Mark Dickinson18cfada2009-11-23 18:46:41 +00008158 char *p;
8159 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008160 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008161
Guido van Rossumd57fd912000-03-10 22:53:23 +00008162 x = PyFloat_AsDouble(v);
8163 if (x == -1.0 && PyErr_Occurred())
Mark Dickinson18cfada2009-11-23 18:46:41 +00008164 return NULL;
8165
Guido van Rossumd57fd912000-03-10 22:53:23 +00008166 if (prec < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008167 prec = 6;
Mark Dickinsond4814bf2009-03-29 16:24:29 +00008168
Mark Dickinson18cfada2009-11-23 18:46:41 +00008169 p = PyOS_double_to_string(x, type, prec,
8170 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
8171 if (p == NULL)
8172 return NULL;
8173 result = PyUnicode_FromStringAndSize(p, strlen(p));
8174 PyMem_Free(p);
8175 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008176}
8177
Tim Peters38fd5b62000-09-21 05:43:11 +00008178static PyObject*
8179formatlong(PyObject *val, int flags, int prec, int type)
8180{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008181 char *buf;
8182 int i, len;
8183 PyObject *str; /* temporary string object. */
8184 PyUnicodeObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008185
Benjamin Peterson857ce152009-01-31 16:29:18 +00008186 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8187 if (!str)
8188 return NULL;
8189 result = _PyUnicode_New(len);
8190 if (!result) {
8191 Py_DECREF(str);
8192 return NULL;
8193 }
8194 for (i = 0; i < len; i++)
8195 result->str[i] = buf[i];
8196 result->str[len] = 0;
8197 Py_DECREF(str);
8198 return (PyObject*)result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008199}
8200
Guido van Rossumd57fd912000-03-10 22:53:23 +00008201static int
8202formatint(Py_UNICODE *buf,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008203 size_t buflen,
8204 int flags,
8205 int prec,
8206 int type,
8207 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008208{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008209 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008210 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8211 * + 1 + 1
8212 * = 24
8213 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008214 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008215 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008216 long x;
8217
8218 x = PyInt_AsLong(v);
8219 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008220 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008221 if (x < 0 && type == 'u') {
8222 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008223 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008224 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8225 sign = "-";
8226 else
8227 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008228 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008229 prec = 1;
8230
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008231 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8232 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008233 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008234 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008235 PyErr_SetString(PyExc_OverflowError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008236 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008237 return -1;
8238 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008239
8240 if ((flags & F_ALT) &&
8241 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00008242 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008243 * of issues that cause pain:
8244 * - when 0 is being converted, the C standard leaves off
8245 * the '0x' or '0X', which is inconsistent with other
8246 * %#x/%#X conversions and inconsistent with Python's
8247 * hex() function
8248 * - there are platforms that violate the standard and
8249 * convert 0 with the '0x' or '0X'
8250 * (Metrowerks, Compaq Tru64)
8251 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008252 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008253 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008254 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008255 * We can achieve the desired consistency by inserting our
8256 * own '0x' or '0X' prefix, and substituting %x/%X in place
8257 * of %#x/%#X.
8258 *
8259 * Note that this is the same approach as used in
8260 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008261 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008262 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8263 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008264 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008265 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008266 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8267 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008268 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008269 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008270 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008271 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008272 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008273 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008274}
8275
8276static int
8277formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008278 size_t buflen,
8279 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008280{
Ezio Melotti32125152010-02-25 17:36:04 +00008281 PyObject *unistr;
8282 char *str;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008283 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008284 if (PyUnicode_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008285 if (PyUnicode_GET_SIZE(v) != 1)
8286 goto onError;
8287 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008288 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008289
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008290 else if (PyString_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008291 if (PyString_GET_SIZE(v) != 1)
8292 goto onError;
Ezio Melotti32125152010-02-25 17:36:04 +00008293 /* #7649: "u'%c' % char" should behave like "u'%s' % char" and fail
8294 with a UnicodeDecodeError if 'char' is not decodable with the
8295 default encoding (usually ASCII, but it might be something else) */
8296 str = PyString_AS_STRING(v);
8297 if ((unsigned char)str[0] > 0x7F) {
8298 /* the char is not ASCII; try to decode the string using the
8299 default encoding and return -1 to let the UnicodeDecodeError
8300 be raised if the string can't be decoded */
8301 unistr = PyUnicode_Decode(str, 1, NULL, "strict");
8302 if (unistr == NULL)
8303 return -1;
8304 buf[0] = PyUnicode_AS_UNICODE(unistr)[0];
8305 Py_DECREF(unistr);
8306 }
8307 else
8308 buf[0] = (Py_UNICODE)str[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008309 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008310
8311 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008312 /* Integer input truncated to a character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008313 long x;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008314 x = PyInt_AsLong(v);
8315 if (x == -1 && PyErr_Occurred())
8316 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008317#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008318 if (x < 0 || x > 0x10ffff) {
8319 PyErr_SetString(PyExc_OverflowError,
8320 "%c arg not in range(0x110000) "
8321 "(wide Python build)");
8322 return -1;
8323 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008324#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008325 if (x < 0 || x > 0xffff) {
8326 PyErr_SetString(PyExc_OverflowError,
8327 "%c arg not in range(0x10000) "
8328 "(narrow Python build)");
8329 return -1;
8330 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008331#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008332 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008333 }
8334 buf[1] = '\0';
8335 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008336
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008337 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008338 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008339 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008340 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008341}
8342
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the ints &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the macro behaves as a
   single primary expression wherever it is used (e.g. `sizeof FORMATBUFLEN`).
*/
#define FORMATBUFLEN ((size_t)120)
8352
Guido van Rossumd57fd912000-03-10 22:53:23 +00008353PyObject *PyUnicode_Format(PyObject *format,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008354 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008355{
8356 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008357 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008358 int args_owned = 0;
8359 PyUnicodeObject *result = NULL;
8360 PyObject *dict = NULL;
8361 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008362
Guido van Rossumd57fd912000-03-10 22:53:23 +00008363 if (format == NULL || args == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008364 PyErr_BadInternalCall();
8365 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008366 }
8367 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008368 if (uformat == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008369 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008370 fmt = PyUnicode_AS_UNICODE(uformat);
8371 fmtcnt = PyUnicode_GET_SIZE(uformat);
8372
8373 reslen = rescnt = fmtcnt + 100;
8374 result = _PyUnicode_New(reslen);
8375 if (result == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008376 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008377 res = PyUnicode_AS_UNICODE(result);
8378
8379 if (PyTuple_Check(args)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008380 arglen = PyTuple_Size(args);
8381 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008382 }
8383 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008384 arglen = -1;
8385 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008386 }
Benjamin Petersonda2c7eb2013-03-23 22:32:00 -05008387 if (Py_TYPE(args)->tp_as_mapping && Py_TYPE(args)->tp_as_mapping->mp_subscript &&
8388 !PyTuple_Check(args) && !PyObject_TypeCheck(args, &PyBaseString_Type))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008389 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008390
8391 while (--fmtcnt >= 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008392 if (*fmt != '%') {
8393 if (--rescnt < 0) {
8394 rescnt = fmtcnt + 100;
8395 reslen += rescnt;
8396 if (_PyUnicode_Resize(&result, reslen) < 0)
8397 goto onError;
8398 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8399 --rescnt;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008400 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008401 *res++ = *fmt++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008402 }
8403 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008404 /* Got a format specifier */
8405 int flags = 0;
8406 Py_ssize_t width = -1;
8407 int prec = -1;
8408 Py_UNICODE c = '\0';
8409 Py_UNICODE fill;
8410 int isnumok;
8411 PyObject *v = NULL;
8412 PyObject *temp = NULL;
8413 Py_UNICODE *pbuf;
8414 Py_UNICODE sign;
8415 Py_ssize_t len;
Mark Dickinson18cfada2009-11-23 18:46:41 +00008416 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008417
8418 fmt++;
8419 if (*fmt == '(') {
8420 Py_UNICODE *keystart;
8421 Py_ssize_t keylen;
8422 PyObject *key;
8423 int pcount = 1;
8424
8425 if (dict == NULL) {
8426 PyErr_SetString(PyExc_TypeError,
8427 "format requires a mapping");
8428 goto onError;
8429 }
8430 ++fmt;
8431 --fmtcnt;
8432 keystart = fmt;
8433 /* Skip over balanced parentheses */
8434 while (pcount > 0 && --fmtcnt >= 0) {
8435 if (*fmt == ')')
8436 --pcount;
8437 else if (*fmt == '(')
8438 ++pcount;
8439 fmt++;
8440 }
8441 keylen = fmt - keystart - 1;
8442 if (fmtcnt < 0 || pcount > 0) {
8443 PyErr_SetString(PyExc_ValueError,
8444 "incomplete format key");
8445 goto onError;
8446 }
8447#if 0
8448 /* keys are converted to strings using UTF-8 and
8449 then looked up since Python uses strings to hold
8450 variables names etc. in its namespaces and we
8451 wouldn't want to break common idioms. */
8452 key = PyUnicode_EncodeUTF8(keystart,
8453 keylen,
8454 NULL);
8455#else
8456 key = PyUnicode_FromUnicode(keystart, keylen);
8457#endif
8458 if (key == NULL)
8459 goto onError;
8460 if (args_owned) {
8461 Py_DECREF(args);
8462 args_owned = 0;
8463 }
8464 args = PyObject_GetItem(dict, key);
8465 Py_DECREF(key);
8466 if (args == NULL) {
8467 goto onError;
8468 }
8469 args_owned = 1;
8470 arglen = -1;
8471 argidx = -2;
8472 }
8473 while (--fmtcnt >= 0) {
8474 switch (c = *fmt++) {
8475 case '-': flags |= F_LJUST; continue;
8476 case '+': flags |= F_SIGN; continue;
8477 case ' ': flags |= F_BLANK; continue;
8478 case '#': flags |= F_ALT; continue;
8479 case '0': flags |= F_ZERO; continue;
8480 }
8481 break;
8482 }
8483 if (c == '*') {
8484 v = getnextarg(args, arglen, &argidx);
8485 if (v == NULL)
8486 goto onError;
8487 if (!PyInt_Check(v)) {
8488 PyErr_SetString(PyExc_TypeError,
8489 "* wants int");
8490 goto onError;
8491 }
Serhiy Storchaka74f49ab2013-01-19 12:55:39 +02008492 width = PyInt_AsSsize_t(v);
8493 if (width == -1 && PyErr_Occurred())
8494 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008495 if (width < 0) {
8496 flags |= F_LJUST;
8497 width = -width;
8498 }
8499 if (--fmtcnt >= 0)
8500 c = *fmt++;
8501 }
8502 else if (c >= '0' && c <= '9') {
8503 width = c - '0';
8504 while (--fmtcnt >= 0) {
8505 c = *fmt++;
8506 if (c < '0' || c > '9')
8507 break;
Mark Dickinson75d36002012-10-28 10:00:46 +00008508 if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008509 PyErr_SetString(PyExc_ValueError,
8510 "width too big");
8511 goto onError;
8512 }
8513 width = width*10 + (c - '0');
8514 }
8515 }
8516 if (c == '.') {
8517 prec = 0;
8518 if (--fmtcnt >= 0)
8519 c = *fmt++;
8520 if (c == '*') {
8521 v = getnextarg(args, arglen, &argidx);
8522 if (v == NULL)
8523 goto onError;
8524 if (!PyInt_Check(v)) {
8525 PyErr_SetString(PyExc_TypeError,
8526 "* wants int");
8527 goto onError;
8528 }
Serhiy Storchaka74f49ab2013-01-19 12:55:39 +02008529 prec = _PyInt_AsInt(v);
8530 if (prec == -1 && PyErr_Occurred())
8531 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008532 if (prec < 0)
8533 prec = 0;
8534 if (--fmtcnt >= 0)
8535 c = *fmt++;
8536 }
8537 else if (c >= '0' && c <= '9') {
8538 prec = c - '0';
8539 while (--fmtcnt >= 0) {
Stefan Krah0b9201f2010-07-19 18:06:46 +00008540 c = *fmt++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008541 if (c < '0' || c > '9')
8542 break;
Mark Dickinson75d36002012-10-28 10:00:46 +00008543 if (prec > (INT_MAX - ((int)c - '0')) / 10) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008544 PyErr_SetString(PyExc_ValueError,
8545 "prec too big");
8546 goto onError;
8547 }
8548 prec = prec*10 + (c - '0');
8549 }
8550 }
8551 } /* prec */
8552 if (fmtcnt >= 0) {
8553 if (c == 'h' || c == 'l' || c == 'L') {
8554 if (--fmtcnt >= 0)
8555 c = *fmt++;
8556 }
8557 }
8558 if (fmtcnt < 0) {
8559 PyErr_SetString(PyExc_ValueError,
8560 "incomplete format");
8561 goto onError;
8562 }
8563 if (c != '%') {
8564 v = getnextarg(args, arglen, &argidx);
8565 if (v == NULL)
8566 goto onError;
8567 }
8568 sign = 0;
8569 fill = ' ';
8570 switch (c) {
8571
8572 case '%':
8573 pbuf = formatbuf;
8574 /* presume that buffer length is at least 1 */
8575 pbuf[0] = '%';
8576 len = 1;
8577 break;
8578
8579 case 's':
8580 case 'r':
Victor Stinner95affc42010-03-22 12:24:37 +00008581 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008582 temp = v;
8583 Py_INCREF(temp);
8584 }
8585 else {
8586 PyObject *unicode;
8587 if (c == 's')
8588 temp = PyObject_Unicode(v);
8589 else
8590 temp = PyObject_Repr(v);
8591 if (temp == NULL)
8592 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008593 if (PyUnicode_Check(temp))
8594 /* nothing to do */;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008595 else if (PyString_Check(temp)) {
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008596 /* convert to string to Unicode */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008597 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8598 PyString_GET_SIZE(temp),
8599 NULL,
8600 "strict");
8601 Py_DECREF(temp);
8602 temp = unicode;
8603 if (temp == NULL)
8604 goto onError;
8605 }
8606 else {
8607 Py_DECREF(temp);
8608 PyErr_SetString(PyExc_TypeError,
8609 "%s argument has non-string str()");
8610 goto onError;
8611 }
8612 }
8613 pbuf = PyUnicode_AS_UNICODE(temp);
8614 len = PyUnicode_GET_SIZE(temp);
8615 if (prec >= 0 && len > prec)
8616 len = prec;
8617 break;
8618
8619 case 'i':
8620 case 'd':
8621 case 'u':
8622 case 'o':
8623 case 'x':
8624 case 'X':
8625 if (c == 'i')
8626 c = 'd';
8627 isnumok = 0;
8628 if (PyNumber_Check(v)) {
8629 PyObject *iobj=NULL;
8630
8631 if (PyInt_Check(v) || (PyLong_Check(v))) {
8632 iobj = v;
8633 Py_INCREF(iobj);
8634 }
8635 else {
8636 iobj = PyNumber_Int(v);
Serhiy Storchaka149d0802016-04-10 15:26:52 +03008637 if (iobj==NULL) {
8638 PyErr_Clear();
8639 iobj = PyNumber_Long(v);
8640 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008641 }
8642 if (iobj!=NULL) {
8643 if (PyInt_Check(iobj)) {
8644 isnumok = 1;
8645 pbuf = formatbuf;
8646 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8647 flags, prec, c, iobj);
8648 Py_DECREF(iobj);
8649 if (len < 0)
8650 goto onError;
8651 sign = 1;
8652 }
8653 else if (PyLong_Check(iobj)) {
8654 isnumok = 1;
8655 temp = formatlong(iobj, flags, prec, c);
8656 Py_DECREF(iobj);
8657 if (!temp)
8658 goto onError;
8659 pbuf = PyUnicode_AS_UNICODE(temp);
8660 len = PyUnicode_GET_SIZE(temp);
8661 sign = 1;
8662 }
8663 else {
8664 Py_DECREF(iobj);
8665 }
8666 }
8667 }
8668 if (!isnumok) {
8669 PyErr_Format(PyExc_TypeError,
8670 "%%%c format: a number is required, "
8671 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8672 goto onError;
8673 }
8674 if (flags & F_ZERO)
8675 fill = '0';
8676 break;
8677
8678 case 'e':
8679 case 'E':
8680 case 'f':
8681 case 'F':
8682 case 'g':
8683 case 'G':
Mark Dickinson18cfada2009-11-23 18:46:41 +00008684 temp = formatfloat(v, flags, prec, c);
8685 if (temp == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008686 goto onError;
Mark Dickinson18cfada2009-11-23 18:46:41 +00008687 pbuf = PyUnicode_AS_UNICODE(temp);
8688 len = PyUnicode_GET_SIZE(temp);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008689 sign = 1;
8690 if (flags & F_ZERO)
8691 fill = '0';
8692 break;
8693
8694 case 'c':
8695 pbuf = formatbuf;
8696 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8697 if (len < 0)
8698 goto onError;
8699 break;
8700
8701 default:
8702 PyErr_Format(PyExc_ValueError,
8703 "unsupported format character '%c' (0x%x) "
8704 "at index %zd",
8705 (31<=c && c<=126) ? (char)c : '?',
8706 (int)c,
8707 (Py_ssize_t)(fmt - 1 -
8708 PyUnicode_AS_UNICODE(uformat)));
Benjamin Peterson857ce152009-01-31 16:29:18 +00008709 goto onError;
8710 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008711 if (sign) {
8712 if (*pbuf == '-' || *pbuf == '+') {
8713 sign = *pbuf++;
8714 len--;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008715 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008716 else if (flags & F_SIGN)
8717 sign = '+';
8718 else if (flags & F_BLANK)
8719 sign = ' ';
8720 else
8721 sign = 0;
8722 }
8723 if (width < len)
8724 width = len;
8725 if (rescnt - (sign != 0) < width) {
8726 reslen -= rescnt;
8727 rescnt = width + fmtcnt + 100;
8728 reslen += rescnt;
8729 if (reslen < 0) {
8730 Py_XDECREF(temp);
8731 PyErr_NoMemory();
8732 goto onError;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008733 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008734 if (_PyUnicode_Resize(&result, reslen) < 0) {
8735 Py_XDECREF(temp);
8736 goto onError;
8737 }
8738 res = PyUnicode_AS_UNICODE(result)
8739 + reslen - rescnt;
8740 }
8741 if (sign) {
8742 if (fill != ' ')
8743 *res++ = sign;
8744 rescnt--;
8745 if (width > len)
8746 width--;
8747 }
8748 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8749 assert(pbuf[0] == '0');
8750 assert(pbuf[1] == c);
8751 if (fill != ' ') {
8752 *res++ = *pbuf++;
8753 *res++ = *pbuf++;
8754 }
8755 rescnt -= 2;
8756 width -= 2;
8757 if (width < 0)
8758 width = 0;
8759 len -= 2;
8760 }
8761 if (width > len && !(flags & F_LJUST)) {
8762 do {
8763 --rescnt;
8764 *res++ = fill;
8765 } while (--width > len);
8766 }
8767 if (fill == ' ') {
8768 if (sign)
8769 *res++ = sign;
8770 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8771 assert(pbuf[0] == '0');
8772 assert(pbuf[1] == c);
8773 *res++ = *pbuf++;
8774 *res++ = *pbuf++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008775 }
8776 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008777 Py_UNICODE_COPY(res, pbuf, len);
8778 res += len;
8779 rescnt -= len;
8780 while (--width >= len) {
8781 --rescnt;
8782 *res++ = ' ';
8783 }
8784 if (dict && (argidx < arglen) && c != '%') {
8785 PyErr_SetString(PyExc_TypeError,
8786 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008787 Py_XDECREF(temp);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008788 goto onError;
8789 }
8790 Py_XDECREF(temp);
8791 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008792 } /* until end */
8793 if (argidx < arglen && !dict) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008794 PyErr_SetString(PyExc_TypeError,
8795 "not all arguments converted during string formatting");
8796 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008797 }
8798
Thomas Woutersa96affe2006-03-12 00:29:36 +00008799 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008800 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008801 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008802 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008803 }
8804 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008805 return (PyObject *)result;
8806
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008807 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008808 Py_XDECREF(result);
8809 Py_DECREF(uformat);
8810 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008811 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008812 }
8813 return NULL;
8814}
8815
8816static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008817 (readbufferproc) unicode_buffer_getreadbuf,
8818 (writebufferproc) unicode_buffer_getwritebuf,
8819 (segcountproc) unicode_buffer_getsegcount,
8820 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008821};
8822
Jeremy Hylton938ace62002-07-17 16:30:39 +00008823static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008824unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8825
Tim Peters6d6c1a32001-08-02 04:15:00 +00008826static PyObject *
8827unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8828{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008829 PyObject *x = NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008830 static char *kwlist[] = {"string", "encoding", "errors", 0};
8831 char *encoding = NULL;
8832 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00008833
Benjamin Peterson857ce152009-01-31 16:29:18 +00008834 if (type != &PyUnicode_Type)
8835 return unicode_subtype_new(type, args, kwds);
8836 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008837 kwlist, &x, &encoding, &errors))
Benjamin Peterson857ce152009-01-31 16:29:18 +00008838 return NULL;
8839 if (x == NULL)
8840 return (PyObject *)_PyUnicode_New(0);
8841 if (encoding == NULL && errors == NULL)
8842 return PyObject_Unicode(x);
8843 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008844 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008845}
8846
Guido van Rossume023fe02001-08-30 03:12:59 +00008847static PyObject *
8848unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8849{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008850 PyUnicodeObject *tmp, *pnew;
8851 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008852
Benjamin Peterson857ce152009-01-31 16:29:18 +00008853 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8854 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8855 if (tmp == NULL)
8856 return NULL;
8857 assert(PyUnicode_Check(tmp));
8858 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8859 if (pnew == NULL) {
8860 Py_DECREF(tmp);
8861 return NULL;
8862 }
8863 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8864 if (pnew->str == NULL) {
8865 _Py_ForgetReference((PyObject *)pnew);
8866 PyObject_Del(pnew);
8867 Py_DECREF(tmp);
8868 return PyErr_NoMemory();
8869 }
8870 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8871 pnew->length = n;
8872 pnew->hash = tmp->hash;
8873 Py_DECREF(tmp);
8874 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008875}
8876
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008877PyDoc_STRVAR(unicode_doc,
Chris Jerdonekad4b0002012-10-07 20:37:54 -07008878 "unicode(object='') -> unicode object\n\
8879unicode(string[, encoding[, errors]]) -> unicode object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00008880\n\
8881Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008882encoding defaults to the current default string encoding.\n\
8883errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008884
Guido van Rossumd57fd912000-03-10 22:53:23 +00008885PyTypeObject PyUnicode_Type = {
Martin v. Löwis68192102007-07-21 06:55:02 +00008886 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00008887 "unicode", /* tp_name */
8888 sizeof(PyUnicodeObject), /* tp_size */
8889 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008890 /* Slots */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008891 (destructor)unicode_dealloc, /* tp_dealloc */
8892 0, /* tp_print */
8893 0, /* tp_getattr */
8894 0, /* tp_setattr */
8895 0, /* tp_compare */
8896 unicode_repr, /* tp_repr */
8897 &unicode_as_number, /* tp_as_number */
8898 &unicode_as_sequence, /* tp_as_sequence */
8899 &unicode_as_mapping, /* tp_as_mapping */
8900 (hashfunc) unicode_hash, /* tp_hash*/
8901 0, /* tp_call*/
8902 (reprfunc) unicode_str, /* tp_str */
8903 PyObject_GenericGetAttr, /* tp_getattro */
8904 0, /* tp_setattro */
8905 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008906 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008907 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008908 unicode_doc, /* tp_doc */
8909 0, /* tp_traverse */
8910 0, /* tp_clear */
8911 PyUnicode_RichCompare, /* tp_richcompare */
8912 0, /* tp_weaklistoffset */
8913 0, /* tp_iter */
8914 0, /* tp_iternext */
8915 unicode_methods, /* tp_methods */
8916 0, /* tp_members */
8917 0, /* tp_getset */
8918 &PyBaseString_Type, /* tp_base */
8919 0, /* tp_dict */
8920 0, /* tp_descr_get */
8921 0, /* tp_descr_set */
8922 0, /* tp_dictoffset */
8923 0, /* tp_init */
8924 0, /* tp_alloc */
8925 unicode_new, /* tp_new */
8926 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008927};
8928
8929/* Initialize the Unicode implementation */
8930
Thomas Wouters78890102000-07-22 19:25:51 +00008931void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008932{
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008933 /* XXX - move this array to unicodectype.c ? */
8934 Py_UNICODE linebreak[] = {
8935 0x000A, /* LINE FEED */
8936 0x000D, /* CARRIAGE RETURN */
8937 0x001C, /* FILE SEPARATOR */
8938 0x001D, /* GROUP SEPARATOR */
8939 0x001E, /* RECORD SEPARATOR */
8940 0x0085, /* NEXT LINE */
8941 0x2028, /* LINE SEPARATOR */
8942 0x2029, /* PARAGRAPH SEPARATOR */
8943 };
8944
Fred Drakee4315f52000-05-09 19:53:39 +00008945 /* Init the implementation */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02008946 if (!unicode_empty) {
8947 unicode_empty = _PyUnicode_New(0);
8948 if (!unicode_empty)
8949 return;
8950 }
Neal Norwitze1fdb322006-07-21 05:32:28 +00008951
Guido van Rossumcacfc072002-05-24 19:01:59 +00008952 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008953 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008954
8955 /* initialize the linebreak bloom filter */
8956 bloom_linebreak = make_bloom_mask(
8957 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8958 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00008959
8960 PyType_Ready(&EncodingMapType);
Benjamin Peterson6da3ed62012-10-30 23:21:10 -04008961
8962 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
8963 Py_FatalError("Can't initialize field name iterator type");
8964
8965 if (PyType_Ready(&PyFormatterIter_Type) < 0)
8966 Py_FatalError("Can't initialize formatter iter type");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008967}
8968
8969/* Finalize the Unicode implementation */
8970
Christian Heimes3b718a72008-02-14 12:47:33 +00008971int
8972PyUnicode_ClearFreeList(void)
8973{
8974 int freelist_size = numfree;
8975 PyUnicodeObject *u;
8976
8977 for (u = free_list; u != NULL;) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008978 PyUnicodeObject *v = u;
8979 u = *(PyUnicodeObject **)u;
8980 if (v->str)
8981 PyObject_DEL(v->str);
8982 Py_XDECREF(v->defenc);
8983 PyObject_Del(v);
8984 numfree--;
Christian Heimes3b718a72008-02-14 12:47:33 +00008985 }
8986 free_list = NULL;
8987 assert(numfree == 0);
8988 return freelist_size;
8989}
8990
Guido van Rossumd57fd912000-03-10 22:53:23 +00008991void
Thomas Wouters78890102000-07-22 19:25:51 +00008992_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008993{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008994 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008995
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02008996 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008997
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02008998 for (i = 0; i < 256; i++)
8999 Py_CLEAR(unicode_latin1[i]);
9000
Christian Heimes3b718a72008-02-14 12:47:33 +00009001 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009002}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009003
Anthony Baxterac6bd462006-04-13 02:06:09 +00009004#ifdef __cplusplus
9005}
9006#endif