blob: 08723ac9b868b11b99fff3ebb04067ac57f0e0a1 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson1c5d21d2009-01-31 22:33:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
/* Upper bound on the number of Unicode objects kept on the free list. */

#define PyUnicode_MAXFREELIST 1024

/* Threshold for the free list "stay alive" optimization.

   Objects parked on the free list keep their character buffer as long
   as their length is below this limit, which saves a malloc() for small
   Unicode objects when they are reused.

   Worst case this leaves PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage around.

   A limit of 0 effectively switches the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switches; little endian is assumed unless configured
   otherwise */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
Serhiy Storchakac59c85c2013-01-26 12:13:40 +020085NOTE: In the interpreter's initialization phase, some globals are currently
86 initialized dynamically as needed. In the process Unicode objects may
87 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000088
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Anthony Baxterac6bd462006-04-13 02:06:09 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Guido van Rossumd57fd912000-03-10 22:53:23 +000096/* Free list for Unicode objects */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +020097static PyUnicodeObject *free_list = NULL;
98static int numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000099
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000100/* The empty Unicode object is shared to improve performance. */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200101static PyUnicodeObject *unicode_empty = NULL;
102
103#define _Py_RETURN_UNICODE_EMPTY() \
104 do { \
105 if (unicode_empty != NULL) \
106 Py_INCREF(unicode_empty); \
107 else { \
108 unicode_empty = _PyUnicode_New(0); \
109 if (unicode_empty != NULL) \
110 Py_INCREF(unicode_empty); \
111 } \
112 return (PyObject *)unicode_empty; \
113 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000114
115/* Single character Unicode strings in the Latin-1 range are being
116 shared as well. */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200117static PyUnicodeObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000118
Fred Drakee4315f52000-05-09 19:53:39 +0000119/* Default encoding to use and assume when NULL is passed as encoding
120 parameter; it is initialized by _PyUnicode_Init().
121
122 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000123 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000124
125*/
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200126static char unicode_default_encoding[100 + 1] = "ascii";
Fred Drakee4315f52000-05-09 19:53:39 +0000127
/* Fast detection of the most frequent whitespace characters: one entry
   per 7-bit code point, 1 when the character counts as whitespace. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
158
/* Same for linebreaks: one entry per 7-bit code point, 1 when the
   character is a line break.  The table is only ever read (through
   BLOOM_LINEBREAK), so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
186
187
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000188Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000189PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000190{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000191#ifdef Py_UNICODE_WIDE
Benjamin Peterson857ce152009-01-31 16:29:18 +0000192 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000193#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000194 /* This is actually an illegal character, so it should
195 not be passed to unichr. */
196 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000197#endif
198}
199
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000200/* --- Bloom Filters ----------------------------------------------------- */
201
202/* stuff to implement simple "bloom filters" for Unicode characters.
203 to keep things simple, we use a single bitmask, using the least 5
204 bits from each unicode characters as the bit index. */
205
206/* the linebreak mask is set up by Unicode_Init below */
207
Antoine Pitrou10042922010-01-13 14:01:26 +0000208#if LONG_BIT >= 128
209#define BLOOM_WIDTH 128
210#elif LONG_BIT >= 64
211#define BLOOM_WIDTH 64
212#elif LONG_BIT >= 32
213#define BLOOM_WIDTH 32
214#else
215#error "LONG_BIT is smaller than 32"
216#endif
217
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000218#define BLOOM_MASK unsigned long
219
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200220static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000221
Antoine Pitrou10042922010-01-13 14:01:26 +0000222#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
223#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000224
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000225#define BLOOM_LINEBREAK(ch) \
226 ((ch) < 128U ? ascii_linebreak[(ch)] : \
227 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000228
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000229Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000230{
231 /* calculate simple bloom-style bitmask for a given unicode string */
232
Antoine Pitrou10042922010-01-13 14:01:26 +0000233 BLOOM_MASK mask;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000234 Py_ssize_t i;
235
236 mask = 0;
237 for (i = 0; i < len; i++)
Antoine Pitrou64672132010-01-13 07:55:48 +0000238 BLOOM_ADD(mask, ptr[i]);
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000239
240 return mask;
241}
242
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000243Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000244{
245 Py_ssize_t i;
246
247 for (i = 0; i < setlen; i++)
248 if (set[i] == chr)
249 return 1;
250
Fredrik Lundh77633512006-05-23 19:47:35 +0000251 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000252}
253
/* True when chr is a member of set[0..setlen): the bloom mask is checked
   first so that most non-members skip the linear search.  The expansion
   is parenthesized as a whole so it can be negated or combined with
   other operators safely.  Evaluates chr more than once. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
256
Guido van Rossumd57fd912000-03-10 22:53:23 +0000257/* --- Unicode Object ----------------------------------------------------- */
258
259static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000260int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000261 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262{
263 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000264
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000265 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 if (unicode->length == length)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000267 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000268
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000269 /* Resizing shared object (unicode_empty or single character
270 objects) in-place is not allowed. Use PyUnicode_Resize()
271 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000272
Benjamin Peterson857ce152009-01-31 16:29:18 +0000273 if (unicode == unicode_empty ||
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000274 (unicode->length == 1 &&
275 unicode->str[0] < 256U &&
276 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000278 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279 return -1;
280 }
281
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000282 /* We allocate one more byte to make sure the string is Ux0000 terminated.
283 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000284 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000285 it contains). */
286
Guido van Rossumd57fd912000-03-10 22:53:23 +0000287 oldstr = unicode->str;
Neal Norwitz419fd492008-03-17 20:22:43 +0000288 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000289 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000291 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000292 PyErr_NoMemory();
293 return -1;
294 }
295 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000296 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000298 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000300 if (unicode->defenc) {
Georg Brandld070cc52010-08-01 21:06:46 +0000301 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000302 }
303 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000304
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305 return 0;
306}
307
308/* We allocate one more byte to make sure the string is
Georg Brandle27d0442010-08-01 20:54:30 +0000309 Ux0000 terminated; some code relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000310
311 XXX This allocator could further be enhanced by assuring that the
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000312 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000313
314*/
315
316static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000317PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000318{
319 register PyUnicodeObject *unicode;
320
Andrew Dalkee0df7622006-05-27 11:04:36 +0000321 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000322 if (length == 0 && unicode_empty != NULL) {
323 Py_INCREF(unicode_empty);
324 return unicode_empty;
325 }
326
Neal Norwitze7d8be82008-07-31 17:17:14 +0000327 /* Ensure we won't overflow the size. */
328 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
329 return (PyUnicodeObject *)PyErr_NoMemory();
330 }
331
Guido van Rossumd57fd912000-03-10 22:53:23 +0000332 /* Unicode freelist & memory allocation */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000333 if (free_list) {
334 unicode = free_list;
335 free_list = *(PyUnicodeObject **)unicode;
336 numfree--;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000337 if (unicode->str) {
338 /* Keep-Alive optimization: we only upsize the buffer,
339 never downsize it. */
340 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000341 unicode_resize(unicode, length) < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000342 PyObject_DEL(unicode->str);
343 unicode->str = NULL;
344 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000345 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000346 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000347 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
348 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000349 }
350 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000351 }
352 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000353 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000354 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000355 if (unicode == NULL)
356 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000357 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
358 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000359 }
360
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000361 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000362 PyErr_NoMemory();
363 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000364 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000365 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000366 * the caller fails before initializing str -- unicode_resize()
367 * reads str[0], and the Keep-Alive optimization can keep memory
368 * allocated for str alive across a call to unicode_dealloc(unicode).
369 * We don't want unicode_resize to read uninitialized memory in
370 * that case.
371 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000372 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000373 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000374 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000375 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000376 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000377 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000378
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000379 onError:
Amaury Forgeot d'Arc06847b12008-07-31 23:39:05 +0000380 /* XXX UNREF/NEWREF interface should be more symmetrical */
381 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000382 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000383 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000384 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000385}
386
387static
Guido van Rossum9475a232001-10-05 20:51:39 +0000388void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000390 if (PyUnicode_CheckExact(unicode) &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000391 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000392 /* Keep-Alive optimization */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000393 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
394 PyObject_DEL(unicode->str);
395 unicode->str = NULL;
396 unicode->length = 0;
397 }
398 if (unicode->defenc) {
Georg Brandld070cc52010-08-01 21:06:46 +0000399 Py_CLEAR(unicode->defenc);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000400 }
401 /* Add to free list */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000402 *(PyUnicodeObject **)unicode = free_list;
403 free_list = unicode;
404 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000405 }
406 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000407 PyObject_DEL(unicode->str);
408 Py_XDECREF(unicode->defenc);
409 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000410 }
411}
412
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000413static
414int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000415{
416 register PyUnicodeObject *v;
417
418 /* Argument checks */
419 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000420 PyErr_BadInternalCall();
421 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000422 }
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000423 v = *unicode;
Christian Heimese93237d2007-12-19 02:37:44 +0000424 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000425 PyErr_BadInternalCall();
426 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000427 }
428
429 /* Resizing unicode_empty and single character objects is not
430 possible since these are being shared. We simply return a fresh
431 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000432 if (v->length != length &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000433 (v == unicode_empty || v->length == 1)) {
434 PyUnicodeObject *w = _PyUnicode_New(length);
435 if (w == NULL)
436 return -1;
437 Py_UNICODE_COPY(w->str, v->str,
438 length < v->length ? length : v->length);
439 Py_DECREF(*unicode);
440 *unicode = w;
441 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000442 }
443
444 /* Note that we don't have to modify *unicode for unshared Unicode
445 objects, since we can modify them in-place. */
446 return unicode_resize(v, length);
447}
448
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000449int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
450{
451 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
452}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000453
Guido van Rossumd57fd912000-03-10 22:53:23 +0000454PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000455 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000456{
457 PyUnicodeObject *unicode;
458
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000459 /* If the Unicode data is known at construction time, we can apply
460 some optimizations which share commonly used objects. */
461 if (u != NULL) {
462
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000463 /* Optimization for empty strings */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200464 if (size == 0)
465 _Py_RETURN_UNICODE_EMPTY();
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000466
467 /* Single character Unicode objects in the Latin-1 range are
468 shared when using this constructor */
469 if (size == 1 && *u < 256) {
470 unicode = unicode_latin1[*u];
471 if (!unicode) {
472 unicode = _PyUnicode_New(1);
473 if (!unicode)
474 return NULL;
475 unicode->str[0] = *u;
476 unicode_latin1[*u] = unicode;
477 }
478 Py_INCREF(unicode);
479 return (PyObject *)unicode;
480 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000481 }
Tim Petersced69f82003-09-16 20:30:58 +0000482
Guido van Rossumd57fd912000-03-10 22:53:23 +0000483 unicode = _PyUnicode_New(size);
484 if (!unicode)
485 return NULL;
486
487 /* Copy the Unicode data into the new object */
488 if (u != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000489 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000490
491 return (PyObject *)unicode;
492}
493
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000494PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
495{
496 PyUnicodeObject *unicode;
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000497
Benjamin Peterson857ce152009-01-31 16:29:18 +0000498 if (size < 0) {
499 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000500 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson857ce152009-01-31 16:29:18 +0000501 return NULL;
502 }
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000503
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000504 /* If the Unicode data is known at construction time, we can apply
505 some optimizations which share commonly used objects.
506 Also, this means the input must be UTF-8, so fall back to the
507 UTF-8 decoder at the end. */
508 if (u != NULL) {
509
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000510 /* Optimization for empty strings */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200511 if (size == 0)
512 _Py_RETURN_UNICODE_EMPTY();
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000513
514 /* Single characters are shared when using this constructor.
515 Restrict to ASCII, since the input must be UTF-8. */
516 if (size == 1 && Py_CHARMASK(*u) < 128) {
517 unicode = unicode_latin1[Py_CHARMASK(*u)];
518 if (!unicode) {
519 unicode = _PyUnicode_New(1);
520 if (!unicode)
521 return NULL;
522 unicode->str[0] = Py_CHARMASK(*u);
523 unicode_latin1[Py_CHARMASK(*u)] = unicode;
524 }
525 Py_INCREF(unicode);
526 return (PyObject *)unicode;
527 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000528
529 return PyUnicode_DecodeUTF8(u, size, NULL);
530 }
531
532 unicode = _PyUnicode_New(size);
533 if (!unicode)
534 return NULL;
535
536 return (PyObject *)unicode;
537}
538
539PyObject *PyUnicode_FromString(const char *u)
540{
541 size_t size = strlen(u);
542 if (size > PY_SSIZE_T_MAX) {
543 PyErr_SetString(PyExc_OverflowError, "input too long");
544 return NULL;
545 }
546
547 return PyUnicode_FromStringAndSize(u, size);
548}
549
Serhiy Storchakae822b032013-08-06 16:56:26 +0300550/* _Py_UNICODE_NEXT is a private macro used to retrieve the character pointed
551 * by 'ptr', possibly combining surrogate pairs on narrow builds.
552 * 'ptr' and 'end' must be Py_UNICODE*, with 'ptr' pointing at the character
553 * that should be returned and 'end' pointing to the end of the buffer.
554 * ('end' is used on narrow builds to detect a lone surrogate at the
555 * end of the buffer that should be returned unchanged.)
 * The ptr and end arguments should be side-effect free and ptr must be an lvalue.
557 * The type of the returned char is always Py_UCS4.
558 *
559 * Note: the macro advances ptr to next char, so it might have side-effects
560 * (especially if used with other macros).
561 */
562
/* helper macros used by _Py_UNICODE_NEXT; each evaluates ch twice */
#define _Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define _Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value. */
#define _Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)

/* Wide builds: every element is a full character.
   Narrow builds: a high surrogate immediately followed by a low
   surrogate is joined into one character and ptr advances by two.
   The pair is only joined when both halves lie before 'end'; this is
   checked before anything is read, so a high surrogate in the last
   position is returned unchanged and the element at 'end' is never
   examined. */
#ifdef Py_UNICODE_WIDE
#define _Py_UNICODE_NEXT(ptr, end) (*(ptr)++)
#else
#define _Py_UNICODE_NEXT(ptr, end)                                      \
    ((((ptr) + 1 < (end)) &&                                            \
      _Py_UNICODE_IS_HIGH_SURROGATE(*(ptr)) &&                          \
      _Py_UNICODE_IS_LOW_SURROGATE((ptr)[1])) ?                         \
     ((ptr) += 2, _Py_UNICODE_JOIN_SURROGATES((ptr)[-2], (ptr)[-1])) :  \
     (Py_UCS4)*(ptr)++)
#endif
580
Guido van Rossumd57fd912000-03-10 22:53:23 +0000581#ifdef HAVE_WCHAR_H
582
Mark Dickinson6b265f12009-03-18 16:07:26 +0000583#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
584# define CONVERT_WCHAR_TO_SURROGATES
585#endif
586
587#ifdef CONVERT_WCHAR_TO_SURROGATES
588
589/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
590 to convert from UTF32 to UTF16. */
591
592PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
593 Py_ssize_t size)
594{
595 PyUnicodeObject *unicode;
596 register Py_ssize_t i;
597 Py_ssize_t alloc;
598 const wchar_t *orig_w;
599
600 if (w == NULL) {
601 PyErr_BadInternalCall();
602 return NULL;
603 }
604
605 alloc = size;
606 orig_w = w;
607 for (i = size; i > 0; i--) {
608 if (*w > 0xFFFF)
609 alloc++;
610 w++;
611 }
612 w = orig_w;
613 unicode = _PyUnicode_New(alloc);
614 if (!unicode)
615 return NULL;
616
617 /* Copy the wchar_t data into the new object */
618 {
619 register Py_UNICODE *u;
620 u = PyUnicode_AS_UNICODE(unicode);
621 for (i = size; i > 0; i--) {
622 if (*w > 0xFFFF) {
623 wchar_t ordinal = *w++;
624 ordinal -= 0x10000;
625 *u++ = 0xD800 | (ordinal >> 10);
626 *u++ = 0xDC00 | (ordinal & 0x3FF);
627 }
628 else
629 *u++ = *w++;
630 }
631 }
632 return (PyObject *)unicode;
633}
634
635#else
636
Guido van Rossumd57fd912000-03-10 22:53:23 +0000637PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000638 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000639{
640 PyUnicodeObject *unicode;
641
642 if (w == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000643 PyErr_BadInternalCall();
644 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000645 }
646
647 unicode = _PyUnicode_New(size);
648 if (!unicode)
649 return NULL;
650
651 /* Copy the wchar_t data into the new object */
652#ifdef HAVE_USABLE_WCHAR_T
653 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000654#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000655 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000656 register Py_UNICODE *u;
657 register Py_ssize_t i;
658 u = PyUnicode_AS_UNICODE(unicode);
659 for (i = size; i > 0; i--)
660 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000661 }
662#endif
663
664 return (PyObject *)unicode;
665}
666
Mark Dickinson6b265f12009-03-18 16:07:26 +0000667#endif /* CONVERT_WCHAR_TO_SURROGATES */
668
669#undef CONVERT_WCHAR_TO_SURROGATES
670
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000671static void
672makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
673{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000674 *fmt++ = '%';
675 if (width) {
676 if (zeropad)
677 *fmt++ = '0';
678 fmt += sprintf(fmt, "%d", width);
679 }
680 if (precision)
681 fmt += sprintf(fmt, ".%d", precision);
682 if (longflag)
683 *fmt++ = 'l';
684 else if (size_tflag) {
685 char *f = PY_FORMAT_SIZE_T;
686 while (*f)
687 *fmt++ = *f++;
688 }
689 *fmt++ = c;
690 *fmt = '\0';
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000691}
692
/* Append the bytes of the NUL-terminated C string `string` to the output
   position `s`, one output element per byte (taken as unsigned char).
   Uses the variables `copy` and `s` of the enclosing function; `s` is
   left just past the last element written. */
#define appendstring(string)                    \
    do {                                        \
        copy = (string);                        \
        while (*copy) {                         \
            *s++ = (unsigned char)*copy;        \
            copy++;                             \
        }                                       \
    } while (0)
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000699
700PyObject *
701PyUnicode_FromFormatV(const char *format, va_list vargs)
702{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000703 va_list count;
704 Py_ssize_t callcount = 0;
705 PyObject **callresults = NULL;
706 PyObject **callresult = NULL;
707 Py_ssize_t n = 0;
708 int width = 0;
709 int precision = 0;
710 int zeropad;
711 const char* f;
712 Py_UNICODE *s;
713 PyObject *string;
714 /* used by sprintf */
715 char buffer[21];
716 /* use abuffer instead of buffer, if we need more space
717 * (which can happen if there's a format specifier with width). */
718 char *abuffer = NULL;
719 char *realbuffer;
720 Py_ssize_t abuffersize = 0;
721 char fmt[60]; /* should be enough for %0width.precisionld */
722 const char *copy;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000723
724#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson857ce152009-01-31 16:29:18 +0000725 Py_MEMCPY(count, vargs, sizeof(va_list));
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000726#else
727#ifdef __va_copy
Benjamin Peterson857ce152009-01-31 16:29:18 +0000728 __va_copy(count, vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000729#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000730 count = vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000731#endif
732#endif
Walter Dörwalded960ac2009-05-03 22:36:33 +0000733 /* step 1: count the number of %S/%R/%s format specifications
734 * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
735 * objects once during step 3 and put the result in an array) */
Benjamin Peterson857ce152009-01-31 16:29:18 +0000736 for (f = format; *f; f++) {
Walter Dörwalded960ac2009-05-03 22:36:33 +0000737 if (*f == '%') {
Serhiy Storchaka0e0282e2015-01-27 22:17:56 +0200738 f++;
739 while (*f && *f != '%' && !isalpha((unsigned)*f))
740 f++;
Serhiy Storchaka227526d2015-01-31 01:15:29 +0200741 if (!*f)
742 break;
Serhiy Storchaka0e0282e2015-01-27 22:17:56 +0200743 if (*f == 's' || *f=='S' || *f=='R')
Walter Dörwalded960ac2009-05-03 22:36:33 +0000744 ++callcount;
745 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000746 }
747 /* step 2: allocate memory for the results of
Walter Dörwalded960ac2009-05-03 22:36:33 +0000748 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson857ce152009-01-31 16:29:18 +0000749 if (callcount) {
750 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
751 if (!callresults) {
752 PyErr_NoMemory();
753 return NULL;
754 }
755 callresult = callresults;
756 }
757 /* step 3: figure out how large a buffer we need */
758 for (f = format; *f; f++) {
759 if (*f == '%') {
Serhiy Storchaka0e0282e2015-01-27 22:17:56 +0200760 const char* p = f++;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000761 width = 0;
762 while (isdigit((unsigned)*f))
763 width = (width*10) + *f++ - '0';
Serhiy Storchaka0e0282e2015-01-27 22:17:56 +0200764 precision = 0;
765 if (*f == '.') {
766 f++;
767 while (isdigit((unsigned)*f))
768 precision = (precision*10) + *f++ - '0';
769 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000770
Benjamin Peterson857ce152009-01-31 16:29:18 +0000771 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
772 * they don't affect the amount of space we reserve.
773 */
774 if ((*f == 'l' || *f == 'z') &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000775 (f[1] == 'd' || f[1] == 'u'))
776 ++f;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000777
Benjamin Peterson857ce152009-01-31 16:29:18 +0000778 switch (*f) {
779 case 'c':
Serhiy Storchakaba908c72013-06-23 20:22:09 +0300780 {
781 int ordinal = va_arg(count, int);
782#ifdef Py_UNICODE_WIDE
783 if (ordinal < 0 || ordinal > 0x10ffff) {
784 PyErr_SetString(PyExc_OverflowError,
785 "%c arg not in range(0x110000) "
786 "(wide Python build)");
787 goto fail;
788 }
789#else
790 if (ordinal < 0 || ordinal > 0xffff) {
791 PyErr_SetString(PyExc_OverflowError,
792 "%c arg not in range(0x10000) "
793 "(narrow Python build)");
794 goto fail;
795 }
796#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +0000797 /* fall through... */
Serhiy Storchakaba908c72013-06-23 20:22:09 +0300798 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000799 case '%':
800 n++;
801 break;
802 case 'd': case 'u': case 'i': case 'x':
803 (void) va_arg(count, int);
Serhiy Storchaka0e0282e2015-01-27 22:17:56 +0200804 if (width < precision)
805 width = precision;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000806 /* 20 bytes is enough to hold a 64-bit
807 integer. Decimal takes the most space.
808 This isn't enough for octal.
809 If a width is specified we need more
810 (which we allocate later). */
811 if (width < 20)
812 width = 20;
813 n += width;
814 if (abuffersize < width)
815 abuffersize = width;
816 break;
817 case 's':
818 {
819 /* UTF-8 */
Georg Brandlba68a992009-05-05 09:19:43 +0000820 const char *s = va_arg(count, const char*);
Walter Dörwalded960ac2009-05-03 22:36:33 +0000821 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
822 if (!str)
823 goto fail;
824 n += PyUnicode_GET_SIZE(str);
825 /* Remember the str and switch to the next slot */
826 *callresult++ = str;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000827 break;
828 }
829 case 'U':
830 {
831 PyObject *obj = va_arg(count, PyObject *);
832 assert(obj && PyUnicode_Check(obj));
833 n += PyUnicode_GET_SIZE(obj);
834 break;
835 }
836 case 'V':
837 {
838 PyObject *obj = va_arg(count, PyObject *);
839 const char *str = va_arg(count, const char *);
840 assert(obj || str);
841 assert(!obj || PyUnicode_Check(obj));
842 if (obj)
843 n += PyUnicode_GET_SIZE(obj);
844 else
845 n += strlen(str);
846 break;
847 }
848 case 'S':
849 {
850 PyObject *obj = va_arg(count, PyObject *);
851 PyObject *str;
852 assert(obj);
853 str = PyObject_Str(obj);
854 if (!str)
855 goto fail;
Victor Stinner2af8d2f2014-07-30 00:39:05 +0200856 n += PyString_GET_SIZE(str);
Benjamin Peterson857ce152009-01-31 16:29:18 +0000857 /* Remember the str and switch to the next slot */
858 *callresult++ = str;
859 break;
860 }
861 case 'R':
862 {
863 PyObject *obj = va_arg(count, PyObject *);
864 PyObject *repr;
865 assert(obj);
866 repr = PyObject_Repr(obj);
867 if (!repr)
868 goto fail;
869 n += PyUnicode_GET_SIZE(repr);
870 /* Remember the repr and switch to the next slot */
871 *callresult++ = repr;
872 break;
873 }
874 case 'p':
875 (void) va_arg(count, int);
876 /* maximum 64-bit pointer representation:
877 * 0xffffffffffffffff
878 * so 19 characters is enough.
879 * XXX I count 18 -- what's the extra for?
880 */
881 n += 19;
882 break;
883 default:
884 /* if we stumble upon an unknown
885 formatting code, copy the rest of
886 the format string to the output
887 string. (we cannot just skip the
888 code, since there's no way to know
889 what's in the argument list) */
890 n += strlen(p);
891 goto expand;
892 }
893 } else
894 n++;
895 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000896 expand:
Benjamin Peterson857ce152009-01-31 16:29:18 +0000897 if (abuffersize > 20) {
Serhiy Storchaka5ec0bbf2015-01-30 23:35:03 +0200898 /* add 1 for sprintf's trailing null byte */
899 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson857ce152009-01-31 16:29:18 +0000900 if (!abuffer) {
901 PyErr_NoMemory();
902 goto fail;
903 }
904 realbuffer = abuffer;
905 }
906 else
907 realbuffer = buffer;
908 /* step 4: fill the buffer */
909 /* Since we've analyzed how much space we need for the worst case,
910 we don't have to resize the string.
911 There can be no errors beyond this point. */
912 string = PyUnicode_FromUnicode(NULL, n);
913 if (!string)
914 goto fail;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000915
Benjamin Peterson857ce152009-01-31 16:29:18 +0000916 s = PyUnicode_AS_UNICODE(string);
917 callresult = callresults;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000918
Benjamin Peterson857ce152009-01-31 16:29:18 +0000919 for (f = format; *f; f++) {
920 if (*f == '%') {
921 const char* p = f++;
922 int longflag = 0;
923 int size_tflag = 0;
924 zeropad = (*f == '0');
925 /* parse the width.precision part */
926 width = 0;
927 while (isdigit((unsigned)*f))
928 width = (width*10) + *f++ - '0';
929 precision = 0;
930 if (*f == '.') {
931 f++;
932 while (isdigit((unsigned)*f))
933 precision = (precision*10) + *f++ - '0';
934 }
935 /* handle the long flag, but only for %ld and %lu.
936 others can be added when necessary. */
937 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
938 longflag = 1;
939 ++f;
940 }
941 /* handle the size_t flag. */
942 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
943 size_tflag = 1;
944 ++f;
945 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000946
Benjamin Peterson857ce152009-01-31 16:29:18 +0000947 switch (*f) {
948 case 'c':
949 *s++ = va_arg(vargs, int);
950 break;
951 case 'd':
952 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
953 if (longflag)
954 sprintf(realbuffer, fmt, va_arg(vargs, long));
955 else if (size_tflag)
956 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
957 else
958 sprintf(realbuffer, fmt, va_arg(vargs, int));
959 appendstring(realbuffer);
960 break;
961 case 'u':
962 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
963 if (longflag)
964 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
965 else if (size_tflag)
966 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
967 else
968 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
969 appendstring(realbuffer);
970 break;
971 case 'i':
972 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
973 sprintf(realbuffer, fmt, va_arg(vargs, int));
974 appendstring(realbuffer);
975 break;
976 case 'x':
977 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
978 sprintf(realbuffer, fmt, va_arg(vargs, int));
979 appendstring(realbuffer);
980 break;
981 case 's':
982 {
Walter Dörwalded960ac2009-05-03 22:36:33 +0000983 /* unused, since we already have the result */
984 (void) va_arg(vargs, char *);
985 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
986 PyUnicode_GET_SIZE(*callresult));
987 s += PyUnicode_GET_SIZE(*callresult);
988 /* We're done with the unicode()/repr() => forget it */
989 Py_DECREF(*callresult);
990 /* switch to next unicode()/repr() result */
991 ++callresult;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000992 break;
993 }
994 case 'U':
995 {
996 PyObject *obj = va_arg(vargs, PyObject *);
997 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
998 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
999 s += size;
1000 break;
1001 }
1002 case 'V':
1003 {
1004 PyObject *obj = va_arg(vargs, PyObject *);
1005 const char *str = va_arg(vargs, const char *);
1006 if (obj) {
1007 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1008 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1009 s += size;
1010 } else {
1011 appendstring(str);
1012 }
1013 break;
1014 }
1015 case 'S':
1016 case 'R':
1017 {
Victor Stinner2af8d2f2014-07-30 00:39:05 +02001018 const char *str = PyString_AS_STRING(*callresult);
Benjamin Peterson857ce152009-01-31 16:29:18 +00001019 /* unused, since we already have the result */
1020 (void) va_arg(vargs, PyObject *);
Victor Stinner2af8d2f2014-07-30 00:39:05 +02001021 appendstring(str);
Benjamin Peterson857ce152009-01-31 16:29:18 +00001022 /* We're done with the unicode()/repr() => forget it */
1023 Py_DECREF(*callresult);
1024 /* switch to next unicode()/repr() result */
1025 ++callresult;
1026 break;
1027 }
1028 case 'p':
1029 sprintf(buffer, "%p", va_arg(vargs, void*));
1030 /* %p is ill-defined: ensure leading 0x. */
1031 if (buffer[1] == 'X')
1032 buffer[1] = 'x';
1033 else if (buffer[1] != 'x') {
1034 memmove(buffer+2, buffer, strlen(buffer)+1);
1035 buffer[0] = '0';
1036 buffer[1] = 'x';
1037 }
1038 appendstring(buffer);
1039 break;
1040 case '%':
1041 *s++ = '%';
1042 break;
1043 default:
1044 appendstring(p);
1045 goto end;
1046 }
1047 } else
1048 *s++ = *f;
1049 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001050
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001051 end:
Benjamin Peterson857ce152009-01-31 16:29:18 +00001052 if (callresults)
1053 PyObject_Free(callresults);
1054 if (abuffer)
1055 PyObject_Free(abuffer);
1056 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1057 return string;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001058 fail:
Benjamin Peterson857ce152009-01-31 16:29:18 +00001059 if (callresults) {
1060 PyObject **callresult2 = callresults;
1061 while (callresult2 < callresult) {
1062 Py_DECREF(*callresult2);
1063 ++callresult2;
1064 }
1065 PyObject_Free(callresults);
1066 }
1067 if (abuffer)
1068 PyObject_Free(abuffer);
1069 return NULL;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001070}
1071
1072#undef appendstring
1073
1074PyObject *
1075PyUnicode_FromFormat(const char *format, ...)
1076{
Benjamin Peterson857ce152009-01-31 16:29:18 +00001077 PyObject* ret;
1078 va_list vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001079
1080#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson857ce152009-01-31 16:29:18 +00001081 va_start(vargs, format);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001082#else
Benjamin Peterson857ce152009-01-31 16:29:18 +00001083 va_start(vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001084#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00001085 ret = PyUnicode_FromFormatV(format, vargs);
1086 va_end(vargs);
1087 return ret;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001088}
1089
Martin v. Löwis18e16552006-02-15 17:27:45 +00001090Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001091 wchar_t *w,
1092 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001093{
1094 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001095 PyErr_BadInternalCall();
1096 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001097 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001098
1099 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001100 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001101 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001102
Guido van Rossumd57fd912000-03-10 22:53:23 +00001103#ifdef HAVE_USABLE_WCHAR_T
1104 memcpy(w, unicode->str, size * sizeof(wchar_t));
1105#else
1106 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001107 register Py_UNICODE *u;
1108 register Py_ssize_t i;
1109 u = PyUnicode_AS_UNICODE(unicode);
1110 for (i = size; i > 0; i--)
1111 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001112 }
1113#endif
1114
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001115 if (size > PyUnicode_GET_SIZE(unicode))
1116 return PyUnicode_GET_SIZE(unicode);
1117 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001118 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001119}
1120
1121#endif
1122
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001123PyObject *PyUnicode_FromOrdinal(int ordinal)
1124{
Hye-Shik Chang40574832004-04-06 07:24:51 +00001125 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001126
1127#ifdef Py_UNICODE_WIDE
1128 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001129 PyErr_SetString(PyExc_ValueError,
1130 "unichr() arg not in range(0x110000) "
1131 "(wide Python build)");
1132 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001133 }
1134#else
1135 if (ordinal < 0 || ordinal > 0xffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001136 PyErr_SetString(PyExc_ValueError,
1137 "unichr() arg not in range(0x10000) "
1138 "(narrow Python build)");
1139 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001140 }
1141#endif
1142
Hye-Shik Chang40574832004-04-06 07:24:51 +00001143 s[0] = (Py_UNICODE)ordinal;
1144 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001145}
1146
Guido van Rossumd57fd912000-03-10 22:53:23 +00001147PyObject *PyUnicode_FromObject(register PyObject *obj)
1148{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001149 /* XXX Perhaps we should make this API an alias of
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001150 PyObject_Unicode() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001151 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001152 Py_INCREF(obj);
1153 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001154 }
1155 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001156 /* For a Unicode subtype that's not a Unicode object,
1157 return a true Unicode object with the same data. */
1158 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1159 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001160 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001161 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1162}
1163
1164PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001165 const char *encoding,
1166 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001167{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001168 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001169 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001170 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001171
Guido van Rossumd57fd912000-03-10 22:53:23 +00001172 if (obj == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001173 PyErr_BadInternalCall();
1174 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001175 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001176
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001177#if 0
1178 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001179 that no encodings is given and then redirect to
1180 PyObject_Unicode() which then applies the additional logic for
1181 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001182
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001183 NOTE: This API should really only be used for object which
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001184 represent *encoded* Unicode !
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001185
1186 */
Benjamin Peterson857ce152009-01-31 16:29:18 +00001187 if (PyUnicode_Check(obj)) {
1188 if (encoding) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001189 PyErr_SetString(PyExc_TypeError,
1190 "decoding Unicode is not supported");
1191 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001192 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001193 return PyObject_Unicode(obj);
1194 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001195#else
1196 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001197 PyErr_SetString(PyExc_TypeError,
1198 "decoding Unicode is not supported");
1199 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001200 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001201#endif
1202
1203 /* Coerce object */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001204 if (PyString_Check(obj)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001205 s = PyString_AS_STRING(obj);
1206 len = PyString_GET_SIZE(obj);
Christian Heimes1a6387e2008-03-26 12:49:49 +00001207 }
Christian Heimes3497f942008-05-26 12:29:14 +00001208 else if (PyByteArray_Check(obj)) {
Christian Heimes1a6387e2008-03-26 12:49:49 +00001209 /* Python 2.x specific */
1210 PyErr_Format(PyExc_TypeError,
1211 "decoding bytearray is not supported");
1212 return NULL;
1213 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001214 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001215 /* Overwrite the error message with something more useful in
1216 case of a TypeError. */
1217 if (PyErr_ExceptionMatches(PyExc_TypeError))
1218 PyErr_Format(PyExc_TypeError,
1219 "coercing to Unicode: need string or buffer, "
1220 "%.80s found",
1221 Py_TYPE(obj)->tp_name);
1222 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001223 }
Tim Petersced69f82003-09-16 20:30:58 +00001224
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001225 /* Convert to Unicode */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02001226 if (len == 0)
1227 _Py_RETURN_UNICODE_EMPTY();
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001228
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02001229 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001230 return v;
1231
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001232 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001233 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001234}
1235
1236PyObject *PyUnicode_Decode(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001237 Py_ssize_t size,
1238 const char *encoding,
1239 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001240{
1241 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001242
1243 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001244 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001245
1246 /* Shortcuts for common default encodings */
1247 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001248 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001249 else if (strcmp(encoding, "latin-1") == 0)
1250 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001251#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1252 else if (strcmp(encoding, "mbcs") == 0)
1253 return PyUnicode_DecodeMBCS(s, size, errors);
1254#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001255 else if (strcmp(encoding, "ascii") == 0)
1256 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001257
1258 /* Decode via the codec registry */
1259 buffer = PyBuffer_FromMemory((void *)s, size);
1260 if (buffer == NULL)
1261 goto onError;
Serhiy Storchakac7797dc2015-05-31 20:21:00 +03001262 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001263 if (unicode == NULL)
1264 goto onError;
1265 if (!PyUnicode_Check(unicode)) {
1266 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001267 "decoder did not return an unicode object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001268 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001269 Py_DECREF(unicode);
1270 goto onError;
1271 }
1272 Py_DECREF(buffer);
1273 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001274
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001275 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001276 Py_XDECREF(buffer);
1277 return NULL;
1278}
1279
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001280PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1281 const char *encoding,
1282 const char *errors)
1283{
1284 PyObject *v;
1285
1286 if (!PyUnicode_Check(unicode)) {
1287 PyErr_BadArgument();
1288 goto onError;
1289 }
1290
1291 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001292 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001293
1294 /* Decode via the codec registry */
Serhiy Storchakac7797dc2015-05-31 20:21:00 +03001295 v = _PyCodec_DecodeText(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001296 if (v == NULL)
1297 goto onError;
1298 return v;
1299
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001300 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001301 return NULL;
1302}
1303
Guido van Rossumd57fd912000-03-10 22:53:23 +00001304PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001305 Py_ssize_t size,
1306 const char *encoding,
1307 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001308{
1309 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001310
Guido van Rossumd57fd912000-03-10 22:53:23 +00001311 unicode = PyUnicode_FromUnicode(s, size);
1312 if (unicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001313 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001314 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1315 Py_DECREF(unicode);
1316 return v;
1317}
1318
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001319PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1320 const char *encoding,
1321 const char *errors)
1322{
1323 PyObject *v;
1324
1325 if (!PyUnicode_Check(unicode)) {
1326 PyErr_BadArgument();
1327 goto onError;
1328 }
1329
1330 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001331 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001332
1333 /* Encode via the codec registry */
Serhiy Storchakac7797dc2015-05-31 20:21:00 +03001334 v = _PyCodec_EncodeText(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001335 if (v == NULL)
1336 goto onError;
1337 return v;
1338
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001339 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001340 return NULL;
1341}
1342
Guido van Rossumd57fd912000-03-10 22:53:23 +00001343PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1344 const char *encoding,
1345 const char *errors)
1346{
1347 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001348
Guido van Rossumd57fd912000-03-10 22:53:23 +00001349 if (!PyUnicode_Check(unicode)) {
1350 PyErr_BadArgument();
1351 goto onError;
1352 }
Fred Drakee4315f52000-05-09 19:53:39 +00001353
Tim Petersced69f82003-09-16 20:30:58 +00001354 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001355 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001356
1357 /* Shortcuts for common default encodings */
1358 if (errors == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001359 if (strcmp(encoding, "utf-8") == 0)
1360 return PyUnicode_AsUTF8String(unicode);
1361 else if (strcmp(encoding, "latin-1") == 0)
1362 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001363#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001364 else if (strcmp(encoding, "mbcs") == 0)
1365 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001366#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001367 else if (strcmp(encoding, "ascii") == 0)
1368 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001369 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001370
1371 /* Encode via the codec registry */
Serhiy Storchakac7797dc2015-05-31 20:21:00 +03001372 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001373 if (v == NULL)
1374 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001375 if (!PyString_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001376 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001377 "encoder did not return a string object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001378 Py_TYPE(v)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001379 Py_DECREF(v);
1380 goto onError;
1381 }
1382 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001383
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001384 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001385 return NULL;
1386}
1387
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001388PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001389 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001390{
1391 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1392
1393 if (v)
1394 return v;
1395 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1396 if (v && errors == NULL)
1397 ((PyUnicodeObject *)unicode)->defenc = v;
1398 return v;
1399}
1400
Guido van Rossumd57fd912000-03-10 22:53:23 +00001401Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1402{
1403 if (!PyUnicode_Check(unicode)) {
1404 PyErr_BadArgument();
1405 goto onError;
1406 }
1407 return PyUnicode_AS_UNICODE(unicode);
1408
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001409 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001410 return NULL;
1411}
1412
Martin v. Löwis18e16552006-02-15 17:27:45 +00001413Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001414{
1415 if (!PyUnicode_Check(unicode)) {
1416 PyErr_BadArgument();
1417 goto onError;
1418 }
1419 return PyUnicode_GET_SIZE(unicode);
1420
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001421 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001422 return -1;
1423}
1424
Thomas Wouters78890102000-07-22 19:25:51 +00001425const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001426{
1427 return unicode_default_encoding;
1428}
1429
1430int PyUnicode_SetDefaultEncoding(const char *encoding)
1431{
1432 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001433
Fred Drakee4315f52000-05-09 19:53:39 +00001434 /* Make sure the encoding is valid. As side effect, this also
1435 loads the encoding into the codec registry cache. */
1436 v = _PyCodec_Lookup(encoding);
1437 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001438 goto onError;
Fred Drakee4315f52000-05-09 19:53:39 +00001439 Py_DECREF(v);
1440 strncpy(unicode_default_encoding,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001441 encoding,
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02001442 sizeof(unicode_default_encoding) - 1);
Fred Drakee4315f52000-05-09 19:53:39 +00001443 return 0;
1444
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001445 onError:
Fred Drakee4315f52000-05-09 19:53:39 +00001446 return -1;
1447}
1448
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001449/* error handling callback helper:
1450 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001451 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001452 and adjust various state variables.
1453 return 0 on success, -1 on error
1454*/
1455
1456static
1457int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001458 const char *encoding, const char *reason,
1459 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1460 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1461 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001462{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001463 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001464
1465 PyObject *restuple = NULL;
1466 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001467 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1468 Py_ssize_t requiredsize;
1469 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001470 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001471 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001472 int res = -1;
1473
1474 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001475 *errorHandler = PyCodec_LookupError(errors);
1476 if (*errorHandler == NULL)
1477 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001478 }
1479
1480 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001481 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001482 encoding, input, insize, *startinpos, *endinpos, reason);
1483 if (*exceptionObject == NULL)
1484 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001485 }
1486 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001487 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1488 goto onError;
1489 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1490 goto onError;
1491 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1492 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001493 }
1494
1495 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1496 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001497 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001498 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00001499 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001500 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001501 }
1502 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001503 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001504 if (newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001505 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001506 if (newpos<0 || newpos>insize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001507 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1508 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001509 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001510
1511 /* need more space? (at least enough for what we
1512 have+the replacement+the rest of the string (starting
1513 at the new input position), so we won't have to check space
1514 when there are no errors in the rest of the string) */
1515 repptr = PyUnicode_AS_UNICODE(repunicode);
1516 repsize = PyUnicode_GET_SIZE(repunicode);
Benjamin Petersoned4c1302014-09-29 18:18:57 -04001517 requiredsize = *outpos;
1518 if (requiredsize > PY_SSIZE_T_MAX - repsize)
1519 goto overflow;
1520 requiredsize += repsize;
1521 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
1522 goto overflow;
1523 requiredsize += insize - newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001524 if (requiredsize > outsize) {
Benjamin Petersoned4c1302014-09-29 18:18:57 -04001525 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001526 requiredsize = 2*outsize;
1527 if (_PyUnicode_Resize(output, requiredsize) < 0)
1528 goto onError;
1529 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001530 }
1531 *endinpos = newpos;
1532 *inptr = input + newpos;
1533 Py_UNICODE_COPY(*outptr, repptr, repsize);
1534 *outptr += repsize;
1535 *outpos += repsize;
1536 /* we made it! */
1537 res = 0;
1538
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001539 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001540 Py_XDECREF(restuple);
1541 return res;
Benjamin Petersoned4c1302014-09-29 18:18:57 -04001542
1543 overflow:
1544 PyErr_SetString(PyExc_OverflowError,
1545 "decoded result is too long for a Python string");
1546 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001547}
1548
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64. */

/* Is c a base-64 character (A-Z, a-z, 0-9, '+' or '/')?
 *
 * The test is written with explicit ASCII ranges rather than isalnum():
 * isalnum() depends on the current locale (so it may accept bytes >= 0x80)
 * and its behaviour is undefined for arguments that are neither EOF nor
 * representable as unsigned char.  Note that c is evaluated several
 * times, so it must not have side effects. */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c) \
    ((c) <= 127 && (c) != '+')
1580
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by ASCII code (0..127) and is only ever read, so
 * it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
1614
/* ENCODE_DIRECT: may character c be written to the UTF-7 output as
 * itself?  Only non-NUL ASCII characters qualify.  Set D characters
 * (category 0) always do; Set O characters (category 1) only when
 * directO is true, and whitespace (category 2) only when directWS is
 * true.  RFC2152 leaves both of these choices to the application, which
 * is why they are parameters here.  c is evaluated several times. */

#define ENCODE_DIRECT(c, directO, directWS)              \
    ((c) > 0 && (c) < 128 &&                             \
     (utf7_category[(c)] == 0 ||                         \
      ((directO) && utf7_category[(c)] == 1) ||          \
      ((directWS) && utf7_category[(c)] == 2)))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001626
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001627PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001628 Py_ssize_t size,
1629 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001630{
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001631 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1632}
1633
Antoine Pitrou653dece2009-05-04 18:32:32 +00001634/* The decoder. The only state we preserve is our read position,
1635 * i.e. how many characters we have consumed. So if we end in the
1636 * middle of a shift sequence we have to back off the read position
1637 * and the output to the beginning of the sequence, otherwise we lose
1638 * all the shift state (seen bits, number of bits seen, high
1639 * surrogate). */
1640
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001641PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001642 Py_ssize_t size,
1643 const char *errors,
1644 Py_ssize_t *consumed)
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001645{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001646 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001647 Py_ssize_t startinpos;
1648 Py_ssize_t endinpos;
1649 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001650 const char *e;
1651 PyUnicodeObject *unicode;
1652 Py_UNICODE *p;
1653 const char *errmsg = "";
1654 int inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001655 Py_UNICODE *shiftOutStart;
1656 unsigned int base64bits = 0;
1657 unsigned long base64buffer = 0;
1658 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001659 PyObject *errorHandler = NULL;
1660 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001661
1662 unicode = _PyUnicode_New(size);
1663 if (!unicode)
1664 return NULL;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001665 if (size == 0) {
1666 if (consumed)
1667 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001668 return (PyObject *)unicode;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001669 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001670
1671 p = unicode->str;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001672 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001673 e = s + size;
1674
1675 while (s < e) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001676 Py_UNICODE ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001677
Antoine Pitrou653dece2009-05-04 18:32:32 +00001678 if (inShift) { /* in a base-64 section */
1679 if (IS_BASE64(ch)) { /* consume a base-64 character */
1680 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1681 base64bits += 6;
1682 s++;
1683 if (base64bits >= 16) {
1684 /* we have enough bits for a UTF-16 value */
1685 Py_UNICODE outCh = (Py_UNICODE)
1686 (base64buffer >> (base64bits-16));
1687 base64bits -= 16;
1688 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchakaf1056722013-10-19 20:37:49 +03001689 assert(outCh <= 0xffff);
Antoine Pitrou653dece2009-05-04 18:32:32 +00001690 if (surrogate) {
1691 /* expecting a second surrogate */
1692 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1693#ifdef Py_UNICODE_WIDE
1694 *p++ = (((surrogate & 0x3FF)<<10)
1695 | (outCh & 0x3FF)) + 0x10000;
1696#else
1697 *p++ = surrogate;
1698 *p++ = outCh;
1699#endif
1700 surrogate = 0;
Antoine Pitrou30402542011-11-15 01:49:40 +01001701 continue;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001702 }
1703 else {
Antoine Pitrou30402542011-11-15 01:49:40 +01001704 *p++ = surrogate;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001705 surrogate = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001706 }
1707 }
Antoine Pitrou30402542011-11-15 01:49:40 +01001708 if (outCh >= 0xD800 && outCh <= 0xDBFF) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001709 /* first surrogate */
1710 surrogate = outCh;
1711 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001712 else {
1713 *p++ = outCh;
1714 }
1715 }
1716 }
1717 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001718 inShift = 0;
1719 s++;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001720 if (surrogate) {
Antoine Pitrou30402542011-11-15 01:49:40 +01001721 *p++ = surrogate;
1722 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001723 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001724 if (base64bits > 0) { /* left-over bits */
1725 if (base64bits >= 6) {
1726 /* We've seen at least one base-64 character */
1727 errmsg = "partial character in shift sequence";
1728 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001729 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001730 else {
1731 /* Some bits remain; they should be zero */
1732 if (base64buffer != 0) {
1733 errmsg = "non-zero padding bits in shift sequence";
1734 goto utf7Error;
1735 }
1736 }
1737 }
1738 if (ch != '-') {
1739 /* '-' is absorbed; other terminating
1740 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001741 *p++ = ch;
1742 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001743 }
1744 }
1745 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001746 startinpos = s-starts;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001747 s++; /* consume '+' */
1748 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001749 s++;
1750 *p++ = '+';
Antoine Pitrou653dece2009-05-04 18:32:32 +00001751 }
1752 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001753 inShift = 1;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001754 shiftOutStart = p;
1755 base64bits = 0;
Serhiy Storchakaf1056722013-10-19 20:37:49 +03001756 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001757 }
1758 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001759 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001760 *p++ = ch;
1761 s++;
1762 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001763 else {
1764 startinpos = s-starts;
1765 s++;
1766 errmsg = "unexpected special character";
1767 goto utf7Error;
1768 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001769 continue;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001770utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001771 outpos = p-PyUnicode_AS_UNICODE(unicode);
1772 endinpos = s-starts;
1773 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001774 errors, &errorHandler,
1775 "utf7", errmsg,
1776 starts, size, &startinpos, &endinpos, &exc, &s,
1777 &unicode, &outpos, &p))
1778 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001779 }
1780
Antoine Pitrou653dece2009-05-04 18:32:32 +00001781 /* end of string */
1782
1783 if (inShift && !consumed) { /* in shift sequence, no more to follow */
1784 /* if we're in an inconsistent state, that's an error */
1785 if (surrogate ||
1786 (base64bits >= 6) ||
1787 (base64bits > 0 && base64buffer != 0)) {
1788 outpos = p-PyUnicode_AS_UNICODE(unicode);
1789 endinpos = size;
1790 if (unicode_decode_call_errorhandler(
1791 errors, &errorHandler,
1792 "utf7", "unterminated shift sequence",
1793 starts, size, &startinpos, &endinpos, &exc, &s,
1794 &unicode, &outpos, &p))
1795 goto onError;
1796 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001797 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001798
1799 /* return state */
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001800 if (consumed) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001801 if (inShift) {
1802 p = shiftOutStart; /* back off output */
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001803 *consumed = startinpos;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001804 }
1805 else {
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001806 *consumed = s-starts;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001807 }
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001808 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001809
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001810 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001811 goto onError;
1812
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001813 Py_XDECREF(errorHandler);
1814 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001815 return (PyObject *)unicode;
1816
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001817 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001818 Py_XDECREF(errorHandler);
1819 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001820 Py_DECREF(unicode);
1821 return NULL;
1822}
1823
1824
1825PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001826 Py_ssize_t size,
Antoine Pitrou653dece2009-05-04 18:32:32 +00001827 int base64SetO,
1828 int base64WhiteSpace,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001829 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001830{
1831 PyObject *v;
1832 /* It might be possible to tighten this worst case */
Alexandre Vassalottifd009162009-07-07 02:17:30 +00001833 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001834 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001835 Py_ssize_t i = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001836 unsigned int base64bits = 0;
1837 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001838 char * out;
1839 char * start;
1840
Alexandre Vassalottifd009162009-07-07 02:17:30 +00001841 if (allocated / 8 != size)
Neal Norwitze7d8be82008-07-31 17:17:14 +00001842 return PyErr_NoMemory();
1843
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001844 if (size == 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00001845 return PyString_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001846
Antoine Pitrou653dece2009-05-04 18:32:32 +00001847 v = PyString_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001848 if (v == NULL)
1849 return NULL;
1850
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001851 start = out = PyString_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001852 for (;i < size; ++i) {
1853 Py_UNICODE ch = s[i];
1854
Antoine Pitrou653dece2009-05-04 18:32:32 +00001855 if (inShift) {
1856 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1857 /* shifting out */
1858 if (base64bits) { /* output remaining bits */
1859 *out++ = TO_BASE64(base64buffer << (6-base64bits));
1860 base64buffer = 0;
1861 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001862 }
1863 inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001864 /* Characters not in the BASE64 set implicitly unshift the sequence
1865 so no '-' is required, except if the character is itself a '-' */
1866 if (IS_BASE64(ch) || ch == '-') {
1867 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001868 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001869 *out++ = (char) ch;
1870 }
1871 else {
1872 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00001873 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001874 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001875 else { /* not in a shift sequence */
1876 if (ch == '+') {
1877 *out++ = '+';
1878 *out++ = '-';
1879 }
1880 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1881 *out++ = (char) ch;
1882 }
1883 else {
1884 *out++ = '+';
1885 inShift = 1;
1886 goto encode_char;
1887 }
1888 }
1889 continue;
1890encode_char:
1891#ifdef Py_UNICODE_WIDE
1892 if (ch >= 0x10000) {
1893 /* code first surrogate */
1894 base64bits += 16;
1895 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
1896 while (base64bits >= 6) {
1897 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1898 base64bits -= 6;
1899 }
1900 /* prepare second surrogate */
1901 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
1902 }
1903#endif
1904 base64bits += 16;
1905 base64buffer = (base64buffer << 16) | ch;
1906 while (base64bits >= 6) {
1907 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1908 base64bits -= 6;
1909 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001910 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001911 if (base64bits)
1912 *out++= TO_BASE64(base64buffer << (6-base64bits) );
1913 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001914 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001915
Benjamin Petersonbea424a2010-04-03 00:57:33 +00001916 if (_PyString_Resize(&v, out - start))
1917 return NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001918 return v;
1919}
1920
/* The UTF-7 helper macros are not needed past this point. */
#undef ENCODE_DIRECT
#undef DECODE_DIRECT
#undef TO_BASE64
#undef FROM_BASE64
#undef IS_BASE64
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001926
/* --- UTF-8 Codec -------------------------------------------------------- */

/* Map UTF-8 encoded prefix byte to sequence length.  Zero means
   illegal prefix.  See RFC 3629 for details.

   The table is indexed by the value of the first byte of a sequence and
   is only ever read, so it is declared const. */
static
const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
1950
Guido van Rossumd57fd912000-03-10 22:53:23 +00001951PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001952 Py_ssize_t size,
1953 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001954{
Walter Dörwald69652032004-09-07 20:24:22 +00001955 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1956}
1957
1958PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001959 Py_ssize_t size,
1960 const char *errors,
1961 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001962{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001963 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001964 int n;
Ezio Melottie57e50c2010-06-05 17:51:07 +00001965 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001966 Py_ssize_t startinpos;
1967 Py_ssize_t endinpos;
1968 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001969 const char *e;
1970 PyUnicodeObject *unicode;
1971 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001972 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001973 PyObject *errorHandler = NULL;
1974 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001975
1976 /* Note: size will always be longer than the resulting Unicode
1977 character count */
1978 unicode = _PyUnicode_New(size);
1979 if (!unicode)
1980 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001981 if (size == 0) {
1982 if (consumed)
1983 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001984 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001985 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001986
1987 /* Unpack UTF-8 encoded data */
1988 p = unicode->str;
1989 e = s + size;
1990
1991 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001992 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001993
1994 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001995 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001996 s++;
1997 continue;
1998 }
1999
2000 n = utf8_code_length[ch];
2001
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002002 if (s + n > e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002003 if (consumed)
2004 break;
2005 else {
2006 errmsg = "unexpected end of data";
2007 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00002008 endinpos = startinpos+1;
2009 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2010 endinpos++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002011 goto utf8Error;
2012 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00002013 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002014
2015 switch (n) {
2016
2017 case 0:
Ezio Melottie57e50c2010-06-05 17:51:07 +00002018 errmsg = "invalid start byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002019 startinpos = s-starts;
2020 endinpos = startinpos+1;
2021 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002022
2023 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002024 errmsg = "internal error";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002025 startinpos = s-starts;
2026 endinpos = startinpos+1;
2027 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002028
2029 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002030 if ((s[1] & 0xc0) != 0x80) {
Ezio Melottie57e50c2010-06-05 17:51:07 +00002031 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002032 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00002033 endinpos = startinpos + 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002034 goto utf8Error;
2035 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002036 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melottie57e50c2010-06-05 17:51:07 +00002037 assert ((ch > 0x007F) && (ch <= 0x07FF));
2038 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002039 break;
2040
2041 case 3:
Ezio Melottie57e50c2010-06-05 17:51:07 +00002042 /* XXX: surrogates shouldn't be valid UTF-8!
2043 see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2044 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
2045 Uncomment the 2 lines below to make them invalid,
Serhiy Storchakae8c9e142015-01-18 11:42:50 +02002046 code points: d800-dfff; UTF-8: \xed\xa0\x80-\xed\xbf\xbf. */
Tim Petersced69f82003-09-16 20:30:58 +00002047 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melottie57e50c2010-06-05 17:51:07 +00002048 (s[2] & 0xc0) != 0x80 ||
2049 ((unsigned char)s[0] == 0xE0 &&
2050 (unsigned char)s[1] < 0xA0)/* ||
2051 ((unsigned char)s[0] == 0xED &&
2052 (unsigned char)s[1] > 0x9F)*/) {
2053 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002054 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00002055 endinpos = startinpos + 1;
2056
2057 /* if s[1] first two bits are 1 and 0, then the invalid
2058 continuation byte is s[2], so increment endinpos by 1,
2059 if not, s[1] is invalid and endinpos doesn't need to
2060 be incremented. */
2061 if ((s[1] & 0xC0) == 0x80)
2062 endinpos++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002063 goto utf8Error;
2064 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002065 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melottie57e50c2010-06-05 17:51:07 +00002066 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2067 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002068 break;
2069
2070 case 4:
2071 if ((s[1] & 0xc0) != 0x80 ||
2072 (s[2] & 0xc0) != 0x80 ||
Ezio Melottie57e50c2010-06-05 17:51:07 +00002073 (s[3] & 0xc0) != 0x80 ||
2074 ((unsigned char)s[0] == 0xF0 &&
2075 (unsigned char)s[1] < 0x90) ||
2076 ((unsigned char)s[0] == 0xF4 &&
2077 (unsigned char)s[1] > 0x8F)) {
2078 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002079 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00002080 endinpos = startinpos + 1;
2081 if ((s[1] & 0xC0) == 0x80) {
2082 endinpos++;
2083 if ((s[2] & 0xC0) == 0x80)
2084 endinpos++;
2085 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002086 goto utf8Error;
2087 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002088 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melottie57e50c2010-06-05 17:51:07 +00002089 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2090 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2091
Fredrik Lundh8f455852001-06-27 18:59:43 +00002092#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002093 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002094#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002095 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002096
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002097 /* translate from 10000..10FFFF to 0..FFFF */
2098 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002099
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002100 /* high surrogate = top 10 bits added to D800 */
2101 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002102
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002103 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002104 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002105#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002106 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002107 }
2108 s += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002109 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002110
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002111 utf8Error:
2112 outpos = p-PyUnicode_AS_UNICODE(unicode);
2113 if (unicode_decode_call_errorhandler(
2114 errors, &errorHandler,
2115 "utf8", errmsg,
2116 starts, size, &startinpos, &endinpos, &exc, &s,
2117 &unicode, &outpos, &p))
2118 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002119 }
Walter Dörwald69652032004-09-07 20:24:22 +00002120 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002121 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002122
2123 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002124 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002125 goto onError;
2126
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002127 Py_XDECREF(errorHandler);
2128 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002129 return (PyObject *)unicode;
2130
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002131 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002132 Py_XDECREF(errorHandler);
2133 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002134 Py_DECREF(unicode);
2135 return NULL;
2136}
2137
Tim Peters602f7402002-04-27 18:03:26 +00002138/* Allocation strategy: if the string is short, convert into a stack buffer
2139 and allocate exactly as much space needed at the end. Else allocate the
2140 maximum possible needed (4 result bytes per Unicode character), and return
2141 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002142*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002143PyObject *
2144PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002145 Py_ssize_t size,
2146 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002147{
Tim Peters602f7402002-04-27 18:03:26 +00002148#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002149
Martin v. Löwis18e16552006-02-15 17:27:45 +00002150 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00002151 PyObject *v; /* result string object */
2152 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002153 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002154 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002155 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002156
Tim Peters602f7402002-04-27 18:03:26 +00002157 assert(s != NULL);
2158 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002159
Tim Peters602f7402002-04-27 18:03:26 +00002160 if (size <= MAX_SHORT_UNICHARS) {
2161 /* Write into the stack buffer; nallocated can't overflow.
2162 * At the end, we'll allocate exactly as much heap space as it
2163 * turns out we need.
2164 */
2165 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2166 v = NULL; /* will allocate after we're done */
2167 p = stackbuf;
2168 }
2169 else {
2170 /* Overallocate on the heap, and give the excess back at the end. */
2171 nallocated = size * 4;
2172 if (nallocated / 4 != size) /* overflow! */
2173 return PyErr_NoMemory();
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002174 v = PyString_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00002175 if (v == NULL)
2176 return NULL;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002177 p = PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002178 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002179
Tim Peters602f7402002-04-27 18:03:26 +00002180 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002181 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002182
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002183 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002184 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002185 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002186
Guido van Rossumd57fd912000-03-10 22:53:23 +00002187 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002188 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002189 *p++ = (char)(0xc0 | (ch >> 6));
2190 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002191 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002192 else {
Tim Peters602f7402002-04-27 18:03:26 +00002193 /* Encode UCS2 Unicode ordinals */
2194 if (ch < 0x10000) {
2195 /* Special case: check for high surrogate */
2196 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2197 Py_UCS4 ch2 = s[i];
2198 /* Check for low surrogate and combine the two to
2199 form a UCS4 value */
2200 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002201 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002202 i++;
2203 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002204 }
Tim Peters602f7402002-04-27 18:03:26 +00002205 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002206 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002207 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002208 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2209 *p++ = (char)(0x80 | (ch & 0x3f));
2210 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00002211 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002212 encodeUCS4:
Tim Peters602f7402002-04-27 18:03:26 +00002213 /* Encode UCS4 Unicode ordinals */
2214 *p++ = (char)(0xf0 | (ch >> 18));
2215 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2216 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2217 *p++ = (char)(0x80 | (ch & 0x3f));
2218 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002219 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002220
Tim Peters602f7402002-04-27 18:03:26 +00002221 if (v == NULL) {
2222 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002223 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002224 assert(nneeded <= nallocated);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002225 v = PyString_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002226 }
2227 else {
Benjamin Peterson857ce152009-01-31 16:29:18 +00002228 /* Cut back to size actually needed. */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002229 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002230 assert(nneeded <= nallocated);
Benjamin Petersonbea424a2010-04-03 00:57:33 +00002231 if (_PyString_Resize(&v, nneeded))
2232 return NULL;
Tim Peters602f7402002-04-27 18:03:26 +00002233 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002234 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002235
Tim Peters602f7402002-04-27 18:03:26 +00002236#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002237}
2238
Guido van Rossumd57fd912000-03-10 22:53:23 +00002239PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2240{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002241 if (!PyUnicode_Check(unicode)) {
2242 PyErr_BadArgument();
2243 return NULL;
2244 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002245 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002246 PyUnicode_GET_SIZE(unicode),
2247 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002248}
2249
Walter Dörwald6e390802007-08-17 16:41:28 +00002250/* --- UTF-32 Codec ------------------------------------------------------- */
2251
2252PyObject *
2253PyUnicode_DecodeUTF32(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002254 Py_ssize_t size,
2255 const char *errors,
2256 int *byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002257{
2258 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2259}
2260
2261PyObject *
2262PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002263 Py_ssize_t size,
2264 const char *errors,
2265 int *byteorder,
2266 Py_ssize_t *consumed)
Walter Dörwald6e390802007-08-17 16:41:28 +00002267{
2268 const char *starts = s;
2269 Py_ssize_t startinpos;
2270 Py_ssize_t endinpos;
2271 Py_ssize_t outpos;
2272 PyUnicodeObject *unicode;
2273 Py_UNICODE *p;
2274#ifndef Py_UNICODE_WIDE
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002275 int pairs = 0;
Benjamin Peterson8e5effa2010-06-12 17:47:06 +00002276 const unsigned char *qq;
Walter Dörwald6e390802007-08-17 16:41:28 +00002277#else
2278 const int pairs = 0;
2279#endif
Benjamin Peterson8e5effa2010-06-12 17:47:06 +00002280 const unsigned char *q, *e;
Walter Dörwald6e390802007-08-17 16:41:28 +00002281 int bo = 0; /* assume native ordering by default */
2282 const char *errmsg = "";
Walter Dörwald20b40d32007-08-17 16:52:50 +00002283 /* Offsets from q for retrieving bytes in the right order. */
2284#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2285 int iorder[] = {0, 1, 2, 3};
2286#else
2287 int iorder[] = {3, 2, 1, 0};
2288#endif
Walter Dörwald9ab80a92007-08-17 16:58:43 +00002289 PyObject *errorHandler = NULL;
2290 PyObject *exc = NULL;
Chris Jerdonekad4b0002012-10-07 20:37:54 -07002291
Walter Dörwald6e390802007-08-17 16:41:28 +00002292 q = (unsigned char *)s;
2293 e = q + size;
2294
2295 if (byteorder)
2296 bo = *byteorder;
2297
2298 /* Check for BOM marks (U+FEFF) in the input and adjust current
2299 byte order setting accordingly. In native mode, the leading BOM
2300 mark is skipped, in all other modes, it is copied to the output
2301 stream as-is (giving a ZWNBSP character). */
2302 if (bo == 0) {
2303 if (size >= 4) {
2304 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002305 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002306#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002307 if (bom == 0x0000FEFF) {
2308 q += 4;
2309 bo = -1;
2310 }
2311 else if (bom == 0xFFFE0000) {
2312 q += 4;
2313 bo = 1;
2314 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002315#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002316 if (bom == 0x0000FEFF) {
2317 q += 4;
2318 bo = 1;
2319 }
2320 else if (bom == 0xFFFE0000) {
2321 q += 4;
2322 bo = -1;
2323 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002324#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002325 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002326 }
2327
2328 if (bo == -1) {
2329 /* force LE */
2330 iorder[0] = 0;
2331 iorder[1] = 1;
2332 iorder[2] = 2;
2333 iorder[3] = 3;
2334 }
2335 else if (bo == 1) {
2336 /* force BE */
2337 iorder[0] = 3;
2338 iorder[1] = 2;
2339 iorder[2] = 1;
2340 iorder[3] = 0;
2341 }
2342
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002343 /* On narrow builds we split characters outside the BMP into two
Serhiy Storchakae8c9e142015-01-18 11:42:50 +02002344 code points => count how much extra space we need. */
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002345#ifndef Py_UNICODE_WIDE
Serhiy Storchakac9631a12013-01-08 22:43:18 +02002346 for (qq = q; e - qq >= 4; qq += 4)
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002347 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2348 pairs++;
2349#endif
2350
2351 /* This might be one to much, because of a BOM */
2352 unicode = _PyUnicode_New((size+3)/4+pairs);
2353 if (!unicode)
2354 return NULL;
2355 if (size == 0)
2356 return (PyObject *)unicode;
2357
2358 /* Unpack UTF-32 encoded data */
2359 p = unicode->str;
2360
Walter Dörwald6e390802007-08-17 16:41:28 +00002361 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002362 Py_UCS4 ch;
2363 /* remaining bytes at the end? (size should be divisible by 4) */
2364 if (e-q<4) {
2365 if (consumed)
2366 break;
2367 errmsg = "truncated data";
2368 startinpos = ((const char *)q)-starts;
2369 endinpos = ((const char *)e)-starts;
2370 goto utf32Error;
2371 /* The remaining input chars are ignored if the callback
2372 chooses to skip the input */
2373 }
2374 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2375 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002376
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002377 if (ch >= 0x110000)
2378 {
Serhiy Storchakae8c9e142015-01-18 11:42:50 +02002379 errmsg = "code point not in range(0x110000)";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002380 startinpos = ((const char *)q)-starts;
2381 endinpos = startinpos+4;
2382 goto utf32Error;
2383 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002384#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002385 if (ch >= 0x10000)
2386 {
2387 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2388 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2389 }
2390 else
Walter Dörwald6e390802007-08-17 16:41:28 +00002391#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002392 *p++ = ch;
2393 q += 4;
2394 continue;
2395 utf32Error:
2396 outpos = p-PyUnicode_AS_UNICODE(unicode);
2397 if (unicode_decode_call_errorhandler(
2398 errors, &errorHandler,
2399 "utf32", errmsg,
Georg Brandle9741f32009-09-17 11:28:09 +00002400 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002401 &unicode, &outpos, &p))
2402 goto onError;
Walter Dörwald6e390802007-08-17 16:41:28 +00002403 }
2404
2405 if (byteorder)
2406 *byteorder = bo;
2407
2408 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002409 *consumed = (const char *)q-starts;
Walter Dörwald6e390802007-08-17 16:41:28 +00002410
2411 /* Adjust length */
2412 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2413 goto onError;
2414
2415 Py_XDECREF(errorHandler);
2416 Py_XDECREF(exc);
2417 return (PyObject *)unicode;
2418
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002419 onError:
Walter Dörwald6e390802007-08-17 16:41:28 +00002420 Py_DECREF(unicode);
2421 Py_XDECREF(errorHandler);
2422 Py_XDECREF(exc);
2423 return NULL;
2424}
2425
2426PyObject *
2427PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002428 Py_ssize_t size,
2429 const char *errors,
2430 int byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002431{
2432 PyObject *v;
2433 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002434 Py_ssize_t nsize, bytesize;
Walter Dörwald6e390802007-08-17 16:41:28 +00002435#ifndef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002436 Py_ssize_t i, pairs;
Walter Dörwald6e390802007-08-17 16:41:28 +00002437#else
2438 const int pairs = 0;
2439#endif
2440 /* Offsets from p for storing byte pairs in the right order. */
2441#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2442 int iorder[] = {0, 1, 2, 3};
2443#else
2444 int iorder[] = {3, 2, 1, 0};
2445#endif
2446
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002447#define STORECHAR(CH) \
2448 do { \
2449 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2450 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2451 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2452 p[iorder[0]] = (CH) & 0xff; \
2453 p += 4; \
Walter Dörwald6e390802007-08-17 16:41:28 +00002454 } while(0)
2455
Serhiy Storchakae8c9e142015-01-18 11:42:50 +02002456 /* In narrow builds we can output surrogate pairs as one code point,
Walter Dörwald6e390802007-08-17 16:41:28 +00002457 so we need less space. */
2458#ifndef Py_UNICODE_WIDE
2459 for (i = pairs = 0; i < size-1; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002460 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2461 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2462 pairs++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002463#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002464 nsize = (size - pairs + (byteorder == 0));
2465 bytesize = nsize * 4;
2466 if (bytesize / 4 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002467 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002468 v = PyString_FromStringAndSize(NULL, bytesize);
Walter Dörwald6e390802007-08-17 16:41:28 +00002469 if (v == NULL)
2470 return NULL;
2471
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002472 p = (unsigned char *)PyString_AS_STRING(v);
Walter Dörwald6e390802007-08-17 16:41:28 +00002473 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002474 STORECHAR(0xFEFF);
Walter Dörwald6e390802007-08-17 16:41:28 +00002475 if (size == 0)
2476 return v;
2477
2478 if (byteorder == -1) {
2479 /* force LE */
2480 iorder[0] = 0;
2481 iorder[1] = 1;
2482 iorder[2] = 2;
2483 iorder[3] = 3;
2484 }
2485 else if (byteorder == 1) {
2486 /* force BE */
2487 iorder[0] = 3;
2488 iorder[1] = 2;
2489 iorder[2] = 1;
2490 iorder[3] = 0;
2491 }
2492
2493 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002494 Py_UCS4 ch = *s++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002495#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002496 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2497 Py_UCS4 ch2 = *s;
2498 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2499 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2500 s++;
2501 size--;
2502 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00002503 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002504#endif
2505 STORECHAR(ch);
2506 }
2507 return v;
2508#undef STORECHAR
2509}
2510
2511PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2512{
2513 if (!PyUnicode_Check(unicode)) {
2514 PyErr_BadArgument();
2515 return NULL;
2516 }
2517 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002518 PyUnicode_GET_SIZE(unicode),
2519 NULL,
2520 0);
Walter Dörwald6e390802007-08-17 16:41:28 +00002521}
2522
Guido van Rossumd57fd912000-03-10 22:53:23 +00002523/* --- UTF-16 Codec ------------------------------------------------------- */
2524
Tim Peters772747b2001-08-09 22:21:55 +00002525PyObject *
2526PyUnicode_DecodeUTF16(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002527 Py_ssize_t size,
2528 const char *errors,
2529 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002530{
Walter Dörwald69652032004-09-07 20:24:22 +00002531 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2532}
2533
2534PyObject *
2535PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002536 Py_ssize_t size,
2537 const char *errors,
2538 int *byteorder,
2539 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002540{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002541 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002542 Py_ssize_t startinpos;
2543 Py_ssize_t endinpos;
2544 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002545 PyUnicodeObject *unicode;
2546 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002547 const unsigned char *q, *e;
2548 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002549 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002550 /* Offsets from q for retrieving byte pairs in the right order. */
2551#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2552 int ihi = 1, ilo = 0;
2553#else
2554 int ihi = 0, ilo = 1;
2555#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002556 PyObject *errorHandler = NULL;
2557 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002558
2559 /* Note: size will always be longer than the resulting Unicode
2560 character count */
2561 unicode = _PyUnicode_New(size);
2562 if (!unicode)
2563 return NULL;
2564 if (size == 0)
2565 return (PyObject *)unicode;
2566
2567 /* Unpack UTF-16 encoded data */
2568 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002569 q = (unsigned char *)s;
2570 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002571
2572 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002573 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002574
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002575 /* Check for BOM marks (U+FEFF) in the input and adjust current
2576 byte order setting accordingly. In native mode, the leading BOM
2577 mark is skipped, in all other modes, it is copied to the output
2578 stream as-is (giving a ZWNBSP character). */
2579 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002580 if (size >= 2) {
2581 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002582#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002583 if (bom == 0xFEFF) {
2584 q += 2;
2585 bo = -1;
2586 }
2587 else if (bom == 0xFFFE) {
2588 q += 2;
2589 bo = 1;
2590 }
Tim Petersced69f82003-09-16 20:30:58 +00002591#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002592 if (bom == 0xFEFF) {
2593 q += 2;
2594 bo = 1;
2595 }
2596 else if (bom == 0xFFFE) {
2597 q += 2;
2598 bo = -1;
2599 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002600#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002601 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002602 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002603
Tim Peters772747b2001-08-09 22:21:55 +00002604 if (bo == -1) {
2605 /* force LE */
2606 ihi = 1;
2607 ilo = 0;
2608 }
2609 else if (bo == 1) {
2610 /* force BE */
2611 ihi = 0;
2612 ilo = 1;
2613 }
2614
2615 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002616 Py_UNICODE ch;
2617 /* remaining bytes at the end? (size should be even) */
2618 if (e-q<2) {
2619 if (consumed)
2620 break;
2621 errmsg = "truncated data";
2622 startinpos = ((const char *)q)-starts;
2623 endinpos = ((const char *)e)-starts;
2624 goto utf16Error;
2625 /* The remaining input chars are ignored if the callback
2626 chooses to skip the input */
2627 }
2628 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002629
Benjamin Peterson857ce152009-01-31 16:29:18 +00002630 q += 2;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002631
2632 if (ch < 0xD800 || ch > 0xDFFF) {
2633 *p++ = ch;
2634 continue;
2635 }
2636
2637 /* UTF-16 code pair: */
Antoine Pitrou715a63b2012-07-21 00:52:06 +02002638 if (e - q < 2) {
Serhiy Storchakac4b82c02013-01-08 23:12:00 +02002639 q -= 2;
2640 if (consumed)
2641 break;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002642 errmsg = "unexpected end of data";
Serhiy Storchakac4b82c02013-01-08 23:12:00 +02002643 startinpos = ((const char *)q)-starts;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002644 endinpos = ((const char *)e)-starts;
2645 goto utf16Error;
2646 }
2647 if (0xD800 <= ch && ch <= 0xDBFF) {
2648 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2649 q += 2;
2650 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002651#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002652 *p++ = ch;
2653 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002654#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002655 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002656#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002657 continue;
2658 }
2659 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002660 errmsg = "illegal UTF-16 surrogate";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002661 startinpos = (((const char *)q)-4)-starts;
2662 endinpos = startinpos+2;
2663 goto utf16Error;
2664 }
2665
Benjamin Peterson857ce152009-01-31 16:29:18 +00002666 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002667 errmsg = "illegal encoding";
2668 startinpos = (((const char *)q)-2)-starts;
2669 endinpos = startinpos+2;
2670 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002671
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002672 utf16Error:
2673 outpos = p-PyUnicode_AS_UNICODE(unicode);
2674 if (unicode_decode_call_errorhandler(
2675 errors, &errorHandler,
2676 "utf16", errmsg,
2677 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2678 &unicode, &outpos, &p))
2679 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002680 }
2681
2682 if (byteorder)
2683 *byteorder = bo;
2684
Walter Dörwald69652032004-09-07 20:24:22 +00002685 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002686 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00002687
Guido van Rossumd57fd912000-03-10 22:53:23 +00002688 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002689 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002690 goto onError;
2691
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002692 Py_XDECREF(errorHandler);
2693 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002694 return (PyObject *)unicode;
2695
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002696 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002697 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002698 Py_XDECREF(errorHandler);
2699 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002700 return NULL;
2701}
2702
Tim Peters772747b2001-08-09 22:21:55 +00002703PyObject *
2704PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002705 Py_ssize_t size,
2706 const char *errors,
2707 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002708{
2709 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002710 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002711 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002712#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002713 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002714#else
2715 const int pairs = 0;
2716#endif
Tim Peters772747b2001-08-09 22:21:55 +00002717 /* Offsets from p for storing byte pairs in the right order. */
2718#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2719 int ihi = 1, ilo = 0;
2720#else
2721 int ihi = 0, ilo = 1;
2722#endif
2723
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002724#define STORECHAR(CH) \
2725 do { \
2726 p[ihi] = ((CH) >> 8) & 0xff; \
2727 p[ilo] = (CH) & 0xff; \
2728 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00002729 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002730
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002731#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002732 for (i = pairs = 0; i < size; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002733 if (s[i] >= 0x10000)
2734 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002735#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002736 /* 2 * (size + pairs + (byteorder == 0)) */
2737 if (size > PY_SSIZE_T_MAX ||
2738 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002739 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002740 nsize = size + pairs + (byteorder == 0);
2741 bytesize = nsize * 2;
2742 if (bytesize / 2 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002743 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002744 v = PyString_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002745 if (v == NULL)
2746 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002747
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002748 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002749 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002750 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002751 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002752 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002753
2754 if (byteorder == -1) {
2755 /* force LE */
2756 ihi = 1;
2757 ilo = 0;
2758 }
2759 else if (byteorder == 1) {
2760 /* force BE */
2761 ihi = 0;
2762 ilo = 1;
2763 }
2764
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002765 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002766 Py_UNICODE ch = *s++;
2767 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002768#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002769 if (ch >= 0x10000) {
2770 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2771 ch = 0xD800 | ((ch-0x10000) >> 10);
2772 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002773#endif
Tim Peters772747b2001-08-09 22:21:55 +00002774 STORECHAR(ch);
2775 if (ch2)
2776 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002777 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002778 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002779#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002780}
2781
2782PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2783{
2784 if (!PyUnicode_Check(unicode)) {
2785 PyErr_BadArgument();
2786 return NULL;
2787 }
2788 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002789 PyUnicode_GET_SIZE(unicode),
2790 NULL,
2791 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002792}
2793
2794/* --- Unicode Escape Codec ----------------------------------------------- */
2795
Fredrik Lundh06d12682001-01-24 07:59:11 +00002796static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002797
Guido van Rossumd57fd912000-03-10 22:53:23 +00002798PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002799 Py_ssize_t size,
2800 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002801{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002802 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002803 Py_ssize_t startinpos;
2804 Py_ssize_t endinpos;
2805 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002806 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002807 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002808 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002809 char* message;
2810 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002811 PyObject *errorHandler = NULL;
2812 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002813
Guido van Rossumd57fd912000-03-10 22:53:23 +00002814 /* Escaped strings will always be longer than the resulting
2815 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002816 length after conversion to the true value.
2817 (but if the error callback returns a long replacement string
2818 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002819 v = _PyUnicode_New(size);
2820 if (v == NULL)
2821 goto onError;
2822 if (size == 0)
2823 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002824
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002825 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002826 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002827
Guido van Rossumd57fd912000-03-10 22:53:23 +00002828 while (s < end) {
2829 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002830 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002831 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002832
2833 /* Non-escape characters are interpreted as Unicode ordinals */
2834 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002835 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002836 continue;
2837 }
2838
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002839 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002840 /* \ - Escapes */
2841 s++;
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002842 c = *s++;
2843 if (s > end)
2844 c = '\0'; /* Invalid after \ */
2845 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002846
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002847 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002848 case '\n': break;
2849 case '\\': *p++ = '\\'; break;
2850 case '\'': *p++ = '\''; break;
2851 case '\"': *p++ = '\"'; break;
2852 case 'b': *p++ = '\b'; break;
2853 case 'f': *p++ = '\014'; break; /* FF */
2854 case 't': *p++ = '\t'; break;
2855 case 'n': *p++ = '\n'; break;
2856 case 'r': *p++ = '\r'; break;
2857 case 'v': *p++ = '\013'; break; /* VT */
2858 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2859
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002860 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002861 case '0': case '1': case '2': case '3':
2862 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002863 x = s[-1] - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002864 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002865 x = (x<<3) + *s++ - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002866 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002867 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002868 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002869 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002870 break;
2871
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002872 /* hex escapes */
2873 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002874 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002875 digits = 2;
2876 message = "truncated \\xXX escape";
2877 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002878
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002879 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002880 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002881 digits = 4;
2882 message = "truncated \\uXXXX escape";
2883 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002884
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002885 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002886 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002887 digits = 8;
2888 message = "truncated \\UXXXXXXXX escape";
2889 hexescape:
2890 chr = 0;
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002891 if (end - s < digits) {
2892 /* count only hex digits */
2893 for (; s < end; ++s) {
2894 c = (unsigned char)*s;
2895 if (!Py_ISXDIGIT(c))
2896 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002897 }
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002898 goto error;
2899 }
2900 for (; digits--; ++s) {
2901 c = (unsigned char)*s;
2902 if (!Py_ISXDIGIT(c))
2903 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002904 chr = (chr<<4) & ~0xF;
2905 if (c >= '0' && c <= '9')
2906 chr += c - '0';
2907 else if (c >= 'a' && c <= 'f')
2908 chr += 10 + c - 'a';
2909 else
2910 chr += 10 + c - 'A';
2911 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002912 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002913 /* _decoding_error will have already written into the
2914 target buffer. */
2915 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002916 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002917 /* when we get here, chr is a 32-bit unicode character */
2918 if (chr <= 0xffff)
2919 /* UCS-2 character */
2920 *p++ = (Py_UNICODE) chr;
2921 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002922 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002923 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002924#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002925 *p++ = chr;
2926#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002927 chr -= 0x10000L;
2928 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002929 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002930#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002931 } else {
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002932 message = "illegal Unicode character";
2933 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002934 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002935 break;
2936
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002937 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00002938 case 'N':
2939 message = "malformed \\N character escape";
2940 if (ucnhash_CAPI == NULL) {
2941 /* load the unicode data module */
Larry Hastings402b73f2010-03-25 00:54:54 +00002942 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002943 if (ucnhash_CAPI == NULL)
2944 goto ucnhashError;
2945 }
2946 if (*s == '{') {
2947 const char *start = s+1;
2948 /* look for the closing brace */
2949 while (*s != '}' && s < end)
2950 s++;
2951 if (s > start && s < end && *s == '}') {
2952 /* found a name. look it up in the unicode database */
2953 message = "unknown Unicode character name";
2954 s++;
Serhiy Storchaka1d3acd42013-01-21 11:48:24 +02002955 if (s - start - 1 <= INT_MAX &&
2956 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002957 goto store;
2958 }
2959 }
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002960 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002961
2962 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002963 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002964 message = "\\ at end of string";
2965 s--;
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002966 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00002967 }
2968 else {
2969 *p++ = '\\';
2970 *p++ = (unsigned char)s[-1];
2971 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002972 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002973 }
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002974 continue;
2975
2976 error:
2977 endinpos = s-starts;
2978 outpos = p-PyUnicode_AS_UNICODE(v);
2979 if (unicode_decode_call_errorhandler(
2980 errors, &errorHandler,
2981 "unicodeescape", message,
2982 starts, size, &startinpos, &endinpos, &exc, &s,
2983 &v, &outpos, &p))
2984 goto onError;
2985 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002986 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002987 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002988 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002989 Py_XDECREF(errorHandler);
2990 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002991 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002992
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002993 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002994 PyErr_SetString(
2995 PyExc_UnicodeError,
2996 "\\N escapes not supported (can't load unicodedata module)"
2997 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002998 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002999 Py_XDECREF(errorHandler);
3000 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003001 return NULL;
3002
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003003 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003004 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003005 Py_XDECREF(errorHandler);
3006 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003007 return NULL;
3008}
3009
3010/* Return a Unicode-Escape string version of the Unicode object.
3011
3012 If quotes is true, the string is enclosed in u"" or u'' quotes as
3013 appropriate.
3014
3015*/
3016
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00003017Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003018 Py_ssize_t size,
3019 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00003020{
3021 /* like wcschr, but doesn't stop at NULL characters */
3022
3023 while (size-- > 0) {
3024 if (*s == ch)
3025 return s;
3026 s++;
3027 }
3028
3029 return NULL;
3030}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003031
Guido van Rossumd57fd912000-03-10 22:53:23 +00003032static
3033PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003034 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003035 int quotes)
3036{
3037 PyObject *repr;
3038 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003039
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003040 static const char *hexdigit = "0123456789abcdef";
Neal Norwitze7d8be82008-07-31 17:17:14 +00003041#ifdef Py_UNICODE_WIDE
3042 const Py_ssize_t expandsize = 10;
3043#else
3044 const Py_ssize_t expandsize = 6;
3045#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003046
Neal Norwitz17753ec2006-08-21 22:21:19 +00003047 /* XXX(nnorwitz): rather than over-allocating, it would be
3048 better to choose a different scheme. Perhaps scan the
3049 first N-chars of the string and allocate based on that size.
3050 */
3051 /* Initial allocation is based on the longest-possible unichr
3052 escape.
3053
3054 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3055 unichr, so in this case it's the longest unichr escape. In
3056 narrow (UTF-16) builds this is five chars per source unichr
3057 since there are two unichrs in the surrogate pair, so in narrow
3058 (UTF-16) builds it's not the longest unichr escape.
3059
3060 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3061 so in the narrow (UTF-16) build case it's the longest unichr
3062 escape.
3063 */
3064
Neal Norwitze7d8be82008-07-31 17:17:14 +00003065 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003066 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00003067
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003068 repr = PyString_FromStringAndSize(NULL,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003069 2
3070 + expandsize*size
3071 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003072 if (repr == NULL)
3073 return NULL;
3074
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003075 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003076
3077 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003078 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00003079 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00003080 !findchar(s, size, '"')) ? '"' : '\'';
3081 }
3082 while (size-- > 0) {
3083 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003084
Hye-Shik Chang835b2432005-12-17 04:38:31 +00003085 /* Escape quotes and backslashes */
3086 if ((quotes &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003087 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003088 *p++ = '\\';
3089 *p++ = (char) ch;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003090 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003091 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003092
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003093#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003094 /* Map 21-bit characters to '\U00xxxxxx' */
3095 else if (ch >= 0x10000) {
3096 *p++ = '\\';
3097 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003098 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
3099 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
3100 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
3101 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
3102 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
3103 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
3104 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003105 *p++ = hexdigit[ch & 0x0000000F];
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003106 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003107 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003108#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003109 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3110 else if (ch >= 0xD800 && ch < 0xDC00) {
3111 Py_UNICODE ch2;
3112 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003113
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003114 ch2 = *s++;
3115 size--;
Georg Brandle27d0442010-08-01 20:54:30 +00003116 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003117 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3118 *p++ = '\\';
3119 *p++ = 'U';
3120 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
3121 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
3122 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
3123 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
3124 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
3125 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
3126 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
3127 *p++ = hexdigit[ucs & 0x0000000F];
3128 continue;
3129 }
3130 /* Fall through: isolated surrogates are copied as-is */
3131 s--;
3132 size++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003133 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003134#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003135
Guido van Rossumd57fd912000-03-10 22:53:23 +00003136 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003137 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003138 *p++ = '\\';
3139 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003140 *p++ = hexdigit[(ch >> 12) & 0x000F];
3141 *p++ = hexdigit[(ch >> 8) & 0x000F];
3142 *p++ = hexdigit[(ch >> 4) & 0x000F];
3143 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003144 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003145
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003146 /* Map special whitespace to '\t', \n', '\r' */
3147 else if (ch == '\t') {
3148 *p++ = '\\';
3149 *p++ = 't';
3150 }
3151 else if (ch == '\n') {
3152 *p++ = '\\';
3153 *p++ = 'n';
3154 }
3155 else if (ch == '\r') {
3156 *p++ = '\\';
3157 *p++ = 'r';
3158 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003159
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003160 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003161 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003162 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003163 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003164 *p++ = hexdigit[(ch >> 4) & 0x000F];
3165 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003166 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003167
Guido van Rossumd57fd912000-03-10 22:53:23 +00003168 /* Copy everything else as-is */
3169 else
3170 *p++ = (char) ch;
3171 }
3172 if (quotes)
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003173 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003174
3175 *p = '\0';
Benjamin Petersonbea424a2010-04-03 00:57:33 +00003176 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
3177 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003178 return repr;
3179}
3180
3181PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003182 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003183{
3184 return unicodeescape_string(s, size, 0);
3185}
3186
3187PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3188{
3189 if (!PyUnicode_Check(unicode)) {
3190 PyErr_BadArgument();
3191 return NULL;
3192 }
3193 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003194 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003195}
3196
3197/* --- Raw Unicode Escape Codec ------------------------------------------- */
3198
3199PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003200 Py_ssize_t size,
3201 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003202{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003203 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003204 Py_ssize_t startinpos;
3205 Py_ssize_t endinpos;
3206 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003207 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003208 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003209 const char *end;
3210 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003211 PyObject *errorHandler = NULL;
3212 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003213
Guido van Rossumd57fd912000-03-10 22:53:23 +00003214 /* Escaped strings will always be longer than the resulting
3215 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003216 length after conversion to the true value. (But decoding error
3217 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003218 v = _PyUnicode_New(size);
3219 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003220 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003221 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003222 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003223 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003224 end = s + size;
3225 while (s < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003226 unsigned char c;
3227 Py_UCS4 x;
3228 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003229 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003230
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003231 /* Non-escape characters are interpreted as Unicode ordinals */
3232 if (*s != '\\') {
3233 *p++ = (unsigned char)*s++;
3234 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003235 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003236 startinpos = s-starts;
3237
3238 /* \u-escapes are only interpreted iff the number of leading
3239 backslashes if odd */
3240 bs = s;
3241 for (;s < end;) {
3242 if (*s != '\\')
3243 break;
3244 *p++ = (unsigned char)*s++;
3245 }
3246 if (((s - bs) & 1) == 0 ||
3247 s >= end ||
3248 (*s != 'u' && *s != 'U')) {
3249 continue;
3250 }
3251 p--;
3252 count = *s=='u' ? 4 : 8;
3253 s++;
3254
3255 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3256 outpos = p-PyUnicode_AS_UNICODE(v);
3257 for (x = 0, i = 0; i < count; ++i, ++s) {
3258 c = (unsigned char)*s;
3259 if (!isxdigit(c)) {
3260 endinpos = s-starts;
3261 if (unicode_decode_call_errorhandler(
3262 errors, &errorHandler,
3263 "rawunicodeescape", "truncated \\uXXXX",
3264 starts, size, &startinpos, &endinpos, &exc, &s,
3265 &v, &outpos, &p))
3266 goto onError;
3267 goto nextByte;
3268 }
3269 x = (x<<4) & ~0xF;
3270 if (c >= '0' && c <= '9')
3271 x += c - '0';
3272 else if (c >= 'a' && c <= 'f')
3273 x += 10 + c - 'a';
3274 else
3275 x += 10 + c - 'A';
3276 }
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003277 if (x <= 0xffff)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003278 /* UCS-2 character */
3279 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003280 else if (x <= 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003281 /* UCS-4 character. Either store directly, or as
3282 surrogate pair. */
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003283#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003284 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003285#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003286 x -= 0x10000L;
3287 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3288 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003289#endif
3290 } else {
3291 endinpos = s-starts;
3292 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003293 if (unicode_decode_call_errorhandler(
3294 errors, &errorHandler,
3295 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003296 starts, size, &startinpos, &endinpos, &exc, &s,
3297 &v, &outpos, &p))
3298 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003299 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003300 nextByte:
3301 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003302 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00003303 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003304 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003305 Py_XDECREF(errorHandler);
3306 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003307 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003308
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003309 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003310 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003311 Py_XDECREF(errorHandler);
3312 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003313 return NULL;
3314}
3315
3316PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003317 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003318{
3319 PyObject *repr;
3320 char *p;
3321 char *q;
3322
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003323 static const char *hexdigit = "0123456789abcdef";
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003324#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00003325 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003326#else
Neal Norwitze7d8be82008-07-31 17:17:14 +00003327 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003328#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00003329
Neal Norwitze7d8be82008-07-31 17:17:14 +00003330 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003331 return PyErr_NoMemory();
Benjamin Peterson857ce152009-01-31 16:29:18 +00003332
Neal Norwitze7d8be82008-07-31 17:17:14 +00003333 repr = PyString_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003334 if (repr == NULL)
3335 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003336 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003337 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003338
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003339 p = q = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003340 while (size-- > 0) {
3341 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003342#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003343 /* Map 32-bit characters to '\Uxxxxxxxx' */
3344 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003345 *p++ = '\\';
3346 *p++ = 'U';
3347 *p++ = hexdigit[(ch >> 28) & 0xf];
3348 *p++ = hexdigit[(ch >> 24) & 0xf];
3349 *p++ = hexdigit[(ch >> 20) & 0xf];
3350 *p++ = hexdigit[(ch >> 16) & 0xf];
3351 *p++ = hexdigit[(ch >> 12) & 0xf];
3352 *p++ = hexdigit[(ch >> 8) & 0xf];
3353 *p++ = hexdigit[(ch >> 4) & 0xf];
3354 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003355 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003356 else
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003357#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003358 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3359 if (ch >= 0xD800 && ch < 0xDC00) {
3360 Py_UNICODE ch2;
3361 Py_UCS4 ucs;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003362
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003363 ch2 = *s++;
3364 size--;
Georg Brandle27d0442010-08-01 20:54:30 +00003365 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003366 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3367 *p++ = '\\';
3368 *p++ = 'U';
3369 *p++ = hexdigit[(ucs >> 28) & 0xf];
3370 *p++ = hexdigit[(ucs >> 24) & 0xf];
3371 *p++ = hexdigit[(ucs >> 20) & 0xf];
3372 *p++ = hexdigit[(ucs >> 16) & 0xf];
3373 *p++ = hexdigit[(ucs >> 12) & 0xf];
3374 *p++ = hexdigit[(ucs >> 8) & 0xf];
3375 *p++ = hexdigit[(ucs >> 4) & 0xf];
3376 *p++ = hexdigit[ucs & 0xf];
3377 continue;
3378 }
3379 /* Fall through: isolated surrogates are copied as-is */
3380 s--;
3381 size++;
3382 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003383#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003384 /* Map 16-bit characters to '\uxxxx' */
3385 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003386 *p++ = '\\';
3387 *p++ = 'u';
3388 *p++ = hexdigit[(ch >> 12) & 0xf];
3389 *p++ = hexdigit[(ch >> 8) & 0xf];
3390 *p++ = hexdigit[(ch >> 4) & 0xf];
3391 *p++ = hexdigit[ch & 15];
3392 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003393 /* Copy everything else as-is */
3394 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003395 *p++ = (char) ch;
3396 }
3397 *p = '\0';
Benjamin Petersonbea424a2010-04-03 00:57:33 +00003398 if (_PyString_Resize(&repr, p - q))
3399 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003400 return repr;
3401}
3402
3403PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3404{
3405 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003406 PyErr_BadArgument();
3407 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003408 }
3409 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003410 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003411}
3412
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003413/* --- Unicode Internal Codec ------------------------------------------- */
3414
3415PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003416 Py_ssize_t size,
3417 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003418{
3419 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003420 Py_ssize_t startinpos;
3421 Py_ssize_t endinpos;
3422 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003423 PyUnicodeObject *v;
3424 Py_UNICODE *p;
3425 const char *end;
3426 const char *reason;
3427 PyObject *errorHandler = NULL;
3428 PyObject *exc = NULL;
3429
Neal Norwitzd43069c2006-01-08 01:12:10 +00003430#ifdef Py_UNICODE_WIDE
3431 Py_UNICODE unimax = PyUnicode_GetMax();
3432#endif
3433
Armin Rigo7ccbca92006-10-04 12:17:45 +00003434 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003435 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3436 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003437 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003438 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003439 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003440 p = PyUnicode_AS_UNICODE(v);
3441 end = s + size;
3442
3443 while (s < end) {
Serhiy Storchakad5327d92013-02-07 16:23:11 +02003444 if (end-s < Py_UNICODE_SIZE) {
3445 endinpos = end-starts;
3446 reason = "truncated input";
3447 goto error;
3448 }
Neal Norwitz1004a532006-05-15 07:17:23 +00003449 memcpy(p, s, sizeof(Py_UNICODE));
Serhiy Storchakad5327d92013-02-07 16:23:11 +02003450#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003451 /* We have to sanity check the raw data, otherwise doom looms for
3452 some malformed UCS-4 data. */
Serhiy Storchakad5327d92013-02-07 16:23:11 +02003453 if (*p > unimax || *p < 0) {
3454 endinpos = s - starts + Py_UNICODE_SIZE;
3455 reason = "illegal code point (> 0x10FFFF)";
3456 goto error;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003457 }
Serhiy Storchakad5327d92013-02-07 16:23:11 +02003458#endif
3459 p++;
3460 s += Py_UNICODE_SIZE;
3461 continue;
3462
3463 error:
3464 startinpos = s - starts;
3465 outpos = p - PyUnicode_AS_UNICODE(v);
3466 if (unicode_decode_call_errorhandler(
3467 errors, &errorHandler,
3468 "unicode_internal", reason,
3469 starts, size, &startinpos, &endinpos, &exc, &s,
3470 &v, &outpos, &p)) {
3471 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003472 }
3473 }
3474
Martin v. Löwis412fb672006-04-13 06:34:32 +00003475 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003476 goto onError;
3477 Py_XDECREF(errorHandler);
3478 Py_XDECREF(exc);
3479 return (PyObject *)v;
3480
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003481 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003482 Py_XDECREF(v);
3483 Py_XDECREF(errorHandler);
3484 Py_XDECREF(exc);
3485 return NULL;
3486}
3487
Guido van Rossumd57fd912000-03-10 22:53:23 +00003488/* --- Latin-1 Codec ------------------------------------------------------ */
3489
3490PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003491 Py_ssize_t size,
3492 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003493{
3494 PyUnicodeObject *v;
3495 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003496
Guido van Rossumd57fd912000-03-10 22:53:23 +00003497 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003498 if (size == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003499 Py_UNICODE r = *(unsigned char*)s;
3500 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003501 }
3502
Guido van Rossumd57fd912000-03-10 22:53:23 +00003503 v = _PyUnicode_New(size);
3504 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003505 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003506 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003507 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003508 p = PyUnicode_AS_UNICODE(v);
3509 while (size-- > 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003510 *p++ = (unsigned char)*s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003511 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003512
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003513 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003514 Py_XDECREF(v);
3515 return NULL;
3516}
3517
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003518/* create or adjust a UnicodeEncodeError */
3519static void make_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003520 const char *encoding,
3521 const Py_UNICODE *unicode, Py_ssize_t size,
3522 Py_ssize_t startpos, Py_ssize_t endpos,
3523 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003524{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003525 if (*exceptionObject == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003526 *exceptionObject = PyUnicodeEncodeError_Create(
3527 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003528 }
3529 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003530 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3531 goto onError;
3532 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3533 goto onError;
3534 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3535 goto onError;
3536 return;
3537 onError:
Serhiy Storchaka98a97222014-02-09 13:14:04 +02003538 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003539 }
3540}
3541
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003542/* raises a UnicodeEncodeError */
3543static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003544 const char *encoding,
3545 const Py_UNICODE *unicode, Py_ssize_t size,
3546 Py_ssize_t startpos, Py_ssize_t endpos,
3547 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003548{
3549 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003550 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003551 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003552 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003553}
3554
3555/* error handling callback helper:
3556 build arguments, call the callback and check the arguments,
3557 put the result into newpos and return the replacement string, which
3558 has to be freed by the caller */
3559static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003560 PyObject **errorHandler,
3561 const char *encoding, const char *reason,
3562 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3563 Py_ssize_t startpos, Py_ssize_t endpos,
3564 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003565{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003566 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003567
3568 PyObject *restuple;
3569 PyObject *resunicode;
3570
3571 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003572 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003573 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003574 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003575 }
3576
3577 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003578 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003579 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003580 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003581
3582 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003583 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003584 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003585 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003586 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00003587 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003588 Py_DECREF(restuple);
3589 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003590 }
3591 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003592 &resunicode, newpos)) {
3593 Py_DECREF(restuple);
3594 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003595 }
3596 if (*newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003597 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003598 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003599 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3600 Py_DECREF(restuple);
3601 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003602 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003603 Py_INCREF(resunicode);
3604 Py_DECREF(restuple);
3605 return resunicode;
3606}
3607
3608static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003609 Py_ssize_t size,
3610 const char *errors,
3611 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003612{
3613 /* output object */
3614 PyObject *res;
3615 /* pointers to the beginning and end+1 of input */
3616 const Py_UNICODE *startp = p;
3617 const Py_UNICODE *endp = p + size;
3618 /* pointer to the beginning of the unencodable characters */
3619 /* const Py_UNICODE *badp = NULL; */
3620 /* pointer into the output */
3621 char *str;
3622 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003623 Py_ssize_t respos = 0;
3624 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00003625 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3626 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003627 PyObject *errorHandler = NULL;
3628 PyObject *exc = NULL;
3629 /* the following variable is used for caching string comparisons
3630 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3631 int known_errorHandler = -1;
3632
3633 /* allocate enough for a simple encoding without
3634 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003635 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003636 if (res == NULL)
3637 goto onError;
3638 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003639 return res;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003640 str = PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003641 ressize = size;
3642
3643 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003644 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003645
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003646 /* can we encode this? */
3647 if (c<limit) {
3648 /* no overflow check, because we know that the space is enough */
3649 *str++ = (char)c;
3650 ++p;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003651 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003652 else {
3653 Py_ssize_t unicodepos = p-startp;
3654 Py_ssize_t requiredsize;
3655 PyObject *repunicode;
3656 Py_ssize_t repsize;
3657 Py_ssize_t newpos;
3658 Py_ssize_t respos;
3659 Py_UNICODE *uni2;
3660 /* startpos for collecting unencodable chars */
3661 const Py_UNICODE *collstart = p;
3662 const Py_UNICODE *collend = p;
3663 /* find all unecodable characters */
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003664 while ((collend < endp) && ((*collend) >= limit))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003665 ++collend;
3666 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3667 if (known_errorHandler==-1) {
3668 if ((errors==NULL) || (!strcmp(errors, "strict")))
3669 known_errorHandler = 1;
3670 else if (!strcmp(errors, "replace"))
3671 known_errorHandler = 2;
3672 else if (!strcmp(errors, "ignore"))
3673 known_errorHandler = 3;
3674 else if (!strcmp(errors, "xmlcharrefreplace"))
3675 known_errorHandler = 4;
3676 else
3677 known_errorHandler = 0;
3678 }
3679 switch (known_errorHandler) {
3680 case 1: /* strict */
3681 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3682 goto onError;
3683 case 2: /* replace */
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003684 while (collstart++ < collend)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003685 *str++ = '?'; /* fall through */
3686 case 3: /* ignore */
3687 p = collend;
3688 break;
3689 case 4: /* xmlcharrefreplace */
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003690 respos = str - PyString_AS_STRING(res);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003691 /* determine replacement size (temporarily (mis)uses p) */
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003692 requiredsize = respos;
3693 for (p = collstart; p < collend;) {
Serhiy Storchakae822b032013-08-06 16:56:26 +03003694 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003695 Py_ssize_t incr;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003696 if (ch < 10)
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003697 incr = 2+1+1;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003698 else if (ch < 100)
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003699 incr = 2+2+1;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003700 else if (ch < 1000)
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003701 incr = 2+3+1;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003702 else if (ch < 10000)
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003703 incr = 2+4+1;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003704 else if (ch < 100000)
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003705 incr = 2+5+1;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003706 else if (ch < 1000000)
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003707 incr = 2+6+1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003708 else
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003709 incr = 2+7+1;
3710 if (requiredsize > PY_SSIZE_T_MAX - incr)
3711 goto overflow;
3712 requiredsize += incr;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003713 }
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003714 if (requiredsize > PY_SSIZE_T_MAX - (endp - collend))
3715 goto overflow;
3716 requiredsize += endp - collend;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003717 if (requiredsize > ressize) {
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003718 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003719 requiredsize = 2*ressize;
3720 if (_PyString_Resize(&res, requiredsize))
3721 goto onError;
3722 str = PyString_AS_STRING(res) + respos;
3723 ressize = requiredsize;
3724 }
3725 /* generate replacement (temporarily (mis)uses p) */
Serhiy Storchakae822b032013-08-06 16:56:26 +03003726 for (p = collstart; p < collend;) {
3727 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
3728 str += sprintf(str, "&#%d;", (int)ch);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003729 }
3730 p = collend;
3731 break;
3732 default:
3733 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3734 encoding, reason, startp, size, &exc,
3735 collstart-startp, collend-startp, &newpos);
3736 if (repunicode == NULL)
3737 goto onError;
3738 /* need more space? (at least enough for what we have+the
3739 replacement+the rest of the string, so we won't have to
3740 check space for encodable characters) */
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003741 respos = str - PyString_AS_STRING(res);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003742 repsize = PyUnicode_GET_SIZE(repunicode);
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003743 if (respos > PY_SSIZE_T_MAX - repsize)
3744 goto overflow;
3745 requiredsize = respos + repsize;
3746 if (requiredsize > PY_SSIZE_T_MAX - (endp - collend))
3747 goto overflow;
3748 requiredsize += endp - collend;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003749 if (requiredsize > ressize) {
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003750 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003751 requiredsize = 2*ressize;
3752 if (_PyString_Resize(&res, requiredsize)) {
3753 Py_DECREF(repunicode);
3754 goto onError;
3755 }
3756 str = PyString_AS_STRING(res) + respos;
3757 ressize = requiredsize;
3758 }
3759 /* check if there is anything unencodable in the replacement
3760 and copy it to the output */
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003761 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2, ++str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003762 c = *uni2;
3763 if (c >= limit) {
3764 raise_encode_exception(&exc, encoding, startp, size,
3765 unicodepos, unicodepos+1, reason);
3766 Py_DECREF(repunicode);
3767 goto onError;
3768 }
3769 *str = (char)c;
3770 }
3771 p = startp + newpos;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003772 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00003773 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00003774 }
3775 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003776 /* Resize if we allocated to much */
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003777 respos = str - PyString_AS_STRING(res);
3778 if (respos < ressize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003779 /* If this falls res will be NULL */
3780 _PyString_Resize(&res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003781 Py_XDECREF(errorHandler);
3782 Py_XDECREF(exc);
3783 return res;
3784
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003785 overflow:
3786 PyErr_SetString(PyExc_OverflowError,
3787 "encoded result is too long for a Python string");
3788
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003789 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003790 Py_XDECREF(res);
3791 Py_XDECREF(errorHandler);
3792 Py_XDECREF(exc);
3793 return NULL;
3794}
3795
Guido van Rossumd57fd912000-03-10 22:53:23 +00003796PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003797 Py_ssize_t size,
3798 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003799{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003800 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003801}
3802
3803PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3804{
3805 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003806 PyErr_BadArgument();
3807 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003808 }
3809 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003810 PyUnicode_GET_SIZE(unicode),
3811 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003812}
3813
3814/* --- 7-bit ASCII Codec -------------------------------------------------- */
3815
Guido van Rossumd57fd912000-03-10 22:53:23 +00003816PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003817 Py_ssize_t size,
3818 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003819{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003820 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003821 PyUnicodeObject *v;
3822 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003823 Py_ssize_t startinpos;
3824 Py_ssize_t endinpos;
3825 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003826 const char *e;
3827 PyObject *errorHandler = NULL;
3828 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003829
Guido van Rossumd57fd912000-03-10 22:53:23 +00003830 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003831 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003832 Py_UNICODE r = *(unsigned char*)s;
3833 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003834 }
Tim Petersced69f82003-09-16 20:30:58 +00003835
Guido van Rossumd57fd912000-03-10 22:53:23 +00003836 v = _PyUnicode_New(size);
3837 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003838 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003839 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003840 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003841 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003842 e = s + size;
3843 while (s < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003844 register unsigned char c = (unsigned char)*s;
3845 if (c < 128) {
3846 *p++ = c;
3847 ++s;
3848 }
3849 else {
3850 startinpos = s-starts;
3851 endinpos = startinpos + 1;
3852 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3853 if (unicode_decode_call_errorhandler(
3854 errors, &errorHandler,
3855 "ascii", "ordinal not in range(128)",
3856 starts, size, &startinpos, &endinpos, &exc, &s,
3857 &v, &outpos, &p))
3858 goto onError;
3859 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003860 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003861 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003862 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3863 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003864 Py_XDECREF(errorHandler);
3865 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003866 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003867
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003868 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003869 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003870 Py_XDECREF(errorHandler);
3871 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003872 return NULL;
3873}
3874
Guido van Rossumd57fd912000-03-10 22:53:23 +00003875PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003876 Py_ssize_t size,
3877 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003878{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003879 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003880}
3881
3882PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3883{
3884 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003885 PyErr_BadArgument();
3886 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003887 }
3888 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003889 PyUnicode_GET_SIZE(unicode),
3890 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003891}
3892
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003893#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003894
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003895/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003896
Hirokazu Yamamoto52a34922009-03-21 10:32:52 +00003897#if SIZEOF_INT < SIZEOF_SIZE_T
Martin v. Löwisd8251432006-06-14 05:21:04 +00003898#define NEED_RETRY
3899#endif
3900
3901/* XXX This code is limited to "true" double-byte encodings, as
3902 a) it assumes an incomplete character consists of a single byte, and
3903 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003904 encodings, see IsDBCSLeadByteEx documentation. */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003905
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead byte,
   0 otherwise.

   A byte that IsDBCSLeadByte() accepts is only counted when one of these
   holds for the character CharPrev() reports before it: there is none
   (CharPrev returned the same pointer), its first byte is not itself a
   lead byte, or it is exactly two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *target = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*target))
        return 0;

    before = CharPrev(s, target);
    if (before == target)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return (target - before == 2);
}
3916
3917/*
3918 * Decode MBCS string into unicode object. If 'final' is set, converts
3919 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3920 */
3921static int decode_mbcs(PyUnicodeObject **v,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003922 const char *s, /* MBCS string */
3923 int size, /* sizeof MBCS string */
3924 int final)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003925{
3926 Py_UNICODE *p;
3927 Py_ssize_t n = 0;
3928 int usize = 0;
3929
3930 assert(size >= 0);
3931
3932 /* Skip trailing lead-byte unless 'final' is set */
3933 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003934 --size;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003935
3936 /* First get the size of the result */
3937 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003938 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3939 if (usize == 0) {
3940 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3941 return -1;
3942 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003943 }
3944
3945 if (*v == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003946 /* Create unicode object */
3947 *v = _PyUnicode_New(usize);
3948 if (*v == NULL)
3949 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003950 }
3951 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003952 /* Extend unicode object */
3953 n = PyUnicode_GET_SIZE(*v);
3954 if (_PyUnicode_Resize(v, n + usize) < 0)
3955 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003956 }
3957
3958 /* Do the conversion */
3959 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003960 p = PyUnicode_AS_UNICODE(*v) + n;
3961 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3962 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3963 return -1;
3964 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003965 }
3966
3967 return size;
3968}
3969
3970PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003971 Py_ssize_t size,
3972 const char *errors,
3973 Py_ssize_t *consumed)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003974{
3975 PyUnicodeObject *v = NULL;
3976 int done;
3977
3978 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003979 *consumed = 0;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003980
3981#ifdef NEED_RETRY
3982 retry:
3983 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003984 done = decode_mbcs(&v, s, INT_MAX, 0);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003985 else
3986#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003987 done = decode_mbcs(&v, s, (int)size, !consumed);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003988
3989 if (done < 0) {
3990 Py_XDECREF(v);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003991 return NULL;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003992 }
3993
3994 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003995 *consumed += done;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003996
3997#ifdef NEED_RETRY
3998 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003999 s += done;
4000 size -= done;
4001 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004002 }
4003#endif
4004
4005 return (PyObject *)v;
4006}
4007
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004008PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004009 Py_ssize_t size,
4010 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004011{
Martin v. Löwisd8251432006-06-14 05:21:04 +00004012 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4013}
4014
4015/*
4016 * Convert unicode into string object (MBCS).
4017 * Returns 0 if succeed, -1 otherwise.
4018 */
4019static int encode_mbcs(PyObject **repr,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004020 const Py_UNICODE *p, /* unicode */
4021 int size) /* size of unicode */
Martin v. Löwisd8251432006-06-14 05:21:04 +00004022{
4023 int mbcssize = 0;
4024 Py_ssize_t n = 0;
4025
4026 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004027
4028 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00004029 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004030 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4031 if (mbcssize == 0) {
4032 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4033 return -1;
4034 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004035 }
4036
Martin v. Löwisd8251432006-06-14 05:21:04 +00004037 if (*repr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004038 /* Create string object */
4039 *repr = PyString_FromStringAndSize(NULL, mbcssize);
4040 if (*repr == NULL)
4041 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004042 }
4043 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004044 /* Extend string object */
4045 n = PyString_Size(*repr);
4046 if (_PyString_Resize(repr, n + mbcssize) < 0)
4047 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004048 }
4049
4050 /* Do the conversion */
4051 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004052 char *s = PyString_AS_STRING(*repr) + n;
4053 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4054 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4055 return -1;
4056 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00004057 }
4058
4059 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004060}
4061
4062PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004063 Py_ssize_t size,
4064 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004065{
Martin v. Löwisd8251432006-06-14 05:21:04 +00004066 PyObject *repr = NULL;
4067 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004068
Martin v. Löwisd8251432006-06-14 05:21:04 +00004069#ifdef NEED_RETRY
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004070 retry:
Martin v. Löwisd8251432006-06-14 05:21:04 +00004071 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004072 ret = encode_mbcs(&repr, p, INT_MAX);
Martin v. Löwisd8251432006-06-14 05:21:04 +00004073 else
4074#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004075 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004076
Martin v. Löwisd8251432006-06-14 05:21:04 +00004077 if (ret < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004078 Py_XDECREF(repr);
4079 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004080 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00004081
4082#ifdef NEED_RETRY
4083 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004084 p += INT_MAX;
4085 size -= INT_MAX;
4086 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004087 }
4088#endif
4089
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004090 return repr;
4091}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004092
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004093PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4094{
4095 if (!PyUnicode_Check(unicode)) {
4096 PyErr_BadArgument();
4097 return NULL;
4098 }
4099 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004100 PyUnicode_GET_SIZE(unicode),
4101 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004102}
4103
Martin v. Löwisd8251432006-06-14 05:21:04 +00004104#undef NEED_RETRY
4105
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004106#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004107
Guido van Rossumd57fd912000-03-10 22:53:23 +00004108/* --- Character Mapping Codec -------------------------------------------- */
4109
Guido van Rossumd57fd912000-03-10 22:53:23 +00004110PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004111 Py_ssize_t size,
4112 PyObject *mapping,
4113 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004114{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004115 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004116 Py_ssize_t startinpos;
4117 Py_ssize_t endinpos;
4118 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004119 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004120 PyUnicodeObject *v;
4121 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004122 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004123 PyObject *errorHandler = NULL;
4124 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004125 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004126 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004127
Guido van Rossumd57fd912000-03-10 22:53:23 +00004128 /* Default to Latin-1 */
4129 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004130 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004131
4132 v = _PyUnicode_New(size);
4133 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004134 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004135 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004136 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004137 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004138 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004139 if (PyUnicode_CheckExact(mapping)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004140 mapstring = PyUnicode_AS_UNICODE(mapping);
4141 maplen = PyUnicode_GET_SIZE(mapping);
4142 while (s < e) {
4143 unsigned char ch = *s;
4144 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004145
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004146 if (ch < maplen)
4147 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004148
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004149 if (x == 0xfffe) {
4150 /* undefined mapping */
4151 outpos = p-PyUnicode_AS_UNICODE(v);
4152 startinpos = s-starts;
4153 endinpos = startinpos+1;
4154 if (unicode_decode_call_errorhandler(
4155 errors, &errorHandler,
4156 "charmap", "character maps to <undefined>",
4157 starts, size, &startinpos, &endinpos, &exc, &s,
4158 &v, &outpos, &p)) {
4159 goto onError;
4160 }
4161 continue;
4162 }
4163 *p++ = x;
4164 ++s;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004165 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004166 }
4167 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004168 while (s < e) {
4169 unsigned char ch = *s;
4170 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004171
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004172 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4173 w = PyInt_FromLong((long)ch);
4174 if (w == NULL)
4175 goto onError;
4176 x = PyObject_GetItem(mapping, w);
4177 Py_DECREF(w);
4178 if (x == NULL) {
4179 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4180 /* No mapping found means: mapping is undefined. */
4181 PyErr_Clear();
Serhiy Storchaka95997452013-01-15 14:42:59 +02004182 goto Undefined;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004183 } else
4184 goto onError;
4185 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004186
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004187 /* Apply mapping */
Serhiy Storchaka95997452013-01-15 14:42:59 +02004188 if (x == Py_None)
4189 goto Undefined;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004190 if (PyInt_Check(x)) {
4191 long value = PyInt_AS_LONG(x);
Serhiy Storchaka95997452013-01-15 14:42:59 +02004192 if (value == 0xFFFE)
4193 goto Undefined;
Antoine Pitroue3ae3212012-11-17 21:14:58 +01004194 if (value < 0 || value > 0x10FFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004195 PyErr_SetString(PyExc_TypeError,
Antoine Pitroue3ae3212012-11-17 21:14:58 +01004196 "character mapping must be in range(0x110000)");
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004197 Py_DECREF(x);
4198 goto onError;
4199 }
Antoine Pitroue3ae3212012-11-17 21:14:58 +01004200
4201#ifndef Py_UNICODE_WIDE
4202 if (value > 0xFFFF) {
4203 /* see the code for 1-n mapping below */
4204 if (extrachars < 2) {
4205 /* resize first */
4206 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4207 Py_ssize_t needed = 10 - extrachars;
4208 extrachars += needed;
4209 /* XXX overflow detection missing */
4210 if (_PyUnicode_Resize(&v,
4211 PyUnicode_GET_SIZE(v) + needed) < 0) {
4212 Py_DECREF(x);
4213 goto onError;
4214 }
4215 p = PyUnicode_AS_UNICODE(v) + oldpos;
4216 }
4217 value -= 0x10000;
4218 *p++ = 0xD800 | (value >> 10);
4219 *p++ = 0xDC00 | (value & 0x3FF);
4220 extrachars -= 2;
4221 }
4222 else
4223#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004224 *p++ = (Py_UNICODE)value;
4225 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004226 else if (PyUnicode_Check(x)) {
4227 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004228
Serhiy Storchaka95997452013-01-15 14:42:59 +02004229 if (targetsize == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004230 /* 1-1 mapping */
Serhiy Storchaka95997452013-01-15 14:42:59 +02004231 Py_UNICODE value = *PyUnicode_AS_UNICODE(x);
4232 if (value == 0xFFFE)
4233 goto Undefined;
4234 *p++ = value;
4235 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004236 else if (targetsize > 1) {
4237 /* 1-n mapping */
4238 if (targetsize > extrachars) {
4239 /* resize first */
4240 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4241 Py_ssize_t needed = (targetsize - extrachars) + \
4242 (targetsize << 2);
4243 extrachars += needed;
4244 /* XXX overflow detection missing */
4245 if (_PyUnicode_Resize(&v,
4246 PyUnicode_GET_SIZE(v) + needed) < 0) {
4247 Py_DECREF(x);
4248 goto onError;
4249 }
4250 p = PyUnicode_AS_UNICODE(v) + oldpos;
4251 }
4252 Py_UNICODE_COPY(p,
4253 PyUnicode_AS_UNICODE(x),
4254 targetsize);
4255 p += targetsize;
4256 extrachars -= targetsize;
4257 }
4258 /* 1-0 mapping: skip the character */
4259 }
4260 else {
4261 /* wrong return value */
4262 PyErr_SetString(PyExc_TypeError,
4263 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004264 Py_DECREF(x);
4265 goto onError;
4266 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004267 Py_DECREF(x);
4268 ++s;
Serhiy Storchaka95997452013-01-15 14:42:59 +02004269 continue;
4270Undefined:
4271 /* undefined mapping */
4272 Py_XDECREF(x);
4273 outpos = p-PyUnicode_AS_UNICODE(v);
4274 startinpos = s-starts;
4275 endinpos = startinpos+1;
4276 if (unicode_decode_call_errorhandler(
4277 errors, &errorHandler,
4278 "charmap", "character maps to <undefined>",
4279 starts, size, &startinpos, &endinpos, &exc, &s,
4280 &v, &outpos, &p)) {
4281 goto onError;
4282 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004283 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004284 }
4285 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004286 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4287 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004288 Py_XDECREF(errorHandler);
4289 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004290 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004291
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004292 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004293 Py_XDECREF(errorHandler);
4294 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004295 Py_XDECREF(v);
4296 return NULL;
4297}
4298
Martin v. Löwis3f767792006-06-04 19:36:28 +00004299/* Charmap encoding: the lookup table */
4300
4301struct encoding_map{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004302 PyObject_HEAD
4303 unsigned char level1[32];
4304 int count2, count3;
4305 unsigned char level23[1];
Martin v. Löwis3f767792006-06-04 19:36:28 +00004306};
4307
4308static PyObject*
4309encoding_map_size(PyObject *obj, PyObject* args)
4310{
4311 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004312 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Martin v. Löwis3f767792006-06-04 19:36:28 +00004313 128*map->count3);
4314}
4315
4316static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004317 {"size", encoding_map_size, METH_NOARGS,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004318 PyDoc_STR("Return the size (in bytes) of this object") },
4319 { 0 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004320};
4321
4322static void
4323encoding_map_dealloc(PyObject* o)
4324{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004325 PyObject_FREE(o);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004326}
4327
4328static PyTypeObject EncodingMapType = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004329 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004330 "EncodingMap", /*tp_name*/
4331 sizeof(struct encoding_map), /*tp_basicsize*/
4332 0, /*tp_itemsize*/
4333 /* methods */
4334 encoding_map_dealloc, /*tp_dealloc*/
4335 0, /*tp_print*/
4336 0, /*tp_getattr*/
4337 0, /*tp_setattr*/
4338 0, /*tp_compare*/
4339 0, /*tp_repr*/
4340 0, /*tp_as_number*/
4341 0, /*tp_as_sequence*/
4342 0, /*tp_as_mapping*/
4343 0, /*tp_hash*/
4344 0, /*tp_call*/
4345 0, /*tp_str*/
4346 0, /*tp_getattro*/
4347 0, /*tp_setattro*/
4348 0, /*tp_as_buffer*/
4349 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4350 0, /*tp_doc*/
4351 0, /*tp_traverse*/
4352 0, /*tp_clear*/
4353 0, /*tp_richcompare*/
4354 0, /*tp_weaklistoffset*/
4355 0, /*tp_iter*/
4356 0, /*tp_iternext*/
4357 encoding_map_methods, /*tp_methods*/
4358 0, /*tp_members*/
4359 0, /*tp_getset*/
4360 0, /*tp_base*/
4361 0, /*tp_dict*/
4362 0, /*tp_descr_get*/
4363 0, /*tp_descr_set*/
4364 0, /*tp_dictoffset*/
4365 0, /*tp_init*/
4366 0, /*tp_alloc*/
4367 0, /*tp_new*/
4368 0, /*tp_free*/
4369 0, /*tp_is_gc*/
Martin v. Löwis3f767792006-06-04 19:36:28 +00004370};
4371
4372PyObject*
4373PyUnicode_BuildEncodingMap(PyObject* string)
4374{
4375 Py_UNICODE *decode;
4376 PyObject *result;
4377 struct encoding_map *mresult;
4378 int i;
4379 int need_dict = 0;
4380 unsigned char level1[32];
4381 unsigned char level2[512];
4382 unsigned char *mlevel1, *mlevel2, *mlevel3;
4383 int count2 = 0, count3 = 0;
4384
4385 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4386 PyErr_BadArgument();
4387 return NULL;
4388 }
4389 decode = PyUnicode_AS_UNICODE(string);
4390 memset(level1, 0xFF, sizeof level1);
4391 memset(level2, 0xFF, sizeof level2);
4392
4393 /* If there isn't a one-to-one mapping of NULL to \0,
4394 or if there are non-BMP characters, we need to use
4395 a mapping dictionary. */
4396 if (decode[0] != 0)
4397 need_dict = 1;
4398 for (i = 1; i < 256; i++) {
4399 int l1, l2;
4400 if (decode[i] == 0
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004401#ifdef Py_UNICODE_WIDE
Martin v. Löwis3f767792006-06-04 19:36:28 +00004402 || decode[i] > 0xFFFF
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004403#endif
4404 ) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004405 need_dict = 1;
4406 break;
4407 }
4408 if (decode[i] == 0xFFFE)
4409 /* unmapped character */
4410 continue;
4411 l1 = decode[i] >> 11;
4412 l2 = decode[i] >> 7;
4413 if (level1[l1] == 0xFF)
4414 level1[l1] = count2++;
4415 if (level2[l2] == 0xFF)
Benjamin Peterson857ce152009-01-31 16:29:18 +00004416 level2[l2] = count3++;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004417 }
4418
4419 if (count2 >= 0xFF || count3 >= 0xFF)
4420 need_dict = 1;
4421
4422 if (need_dict) {
4423 PyObject *result = PyDict_New();
4424 PyObject *key, *value;
4425 if (!result)
4426 return NULL;
4427 for (i = 0; i < 256; i++) {
Brett Cannona7f13ee2010-05-04 01:16:51 +00004428 value = NULL;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004429 key = PyInt_FromLong(decode[i]);
4430 value = PyInt_FromLong(i);
4431 if (!key || !value)
4432 goto failed1;
4433 if (PyDict_SetItem(result, key, value) == -1)
4434 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00004435 Py_DECREF(key);
4436 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004437 }
4438 return result;
4439 failed1:
4440 Py_XDECREF(key);
4441 Py_XDECREF(value);
4442 Py_DECREF(result);
4443 return NULL;
4444 }
4445
4446 /* Create a three-level trie */
4447 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4448 16*count2 + 128*count3 - 1);
4449 if (!result)
4450 return PyErr_NoMemory();
4451 PyObject_Init(result, &EncodingMapType);
4452 mresult = (struct encoding_map*)result;
4453 mresult->count2 = count2;
4454 mresult->count3 = count3;
4455 mlevel1 = mresult->level1;
4456 mlevel2 = mresult->level23;
4457 mlevel3 = mresult->level23 + 16*count2;
4458 memcpy(mlevel1, level1, 32);
4459 memset(mlevel2, 0xFF, 16*count2);
4460 memset(mlevel3, 0, 128*count3);
4461 count3 = 0;
4462 for (i = 1; i < 256; i++) {
4463 int o1, o2, o3, i2, i3;
4464 if (decode[i] == 0xFFFE)
4465 /* unmapped character */
4466 continue;
4467 o1 = decode[i]>>11;
4468 o2 = (decode[i]>>7) & 0xF;
4469 i2 = 16*mlevel1[o1] + o2;
4470 if (mlevel2[i2] == 0xFF)
4471 mlevel2[i2] = count3++;
4472 o3 = decode[i] & 0x7F;
4473 i3 = 128*mlevel2[i2] + o3;
4474 mlevel3[i3] = i;
4475 }
4476 return result;
4477}
4478
4479static int
4480encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4481{
4482 struct encoding_map *map = (struct encoding_map*)mapping;
4483 int l1 = c>>11;
4484 int l2 = (c>>7) & 0xF;
4485 int l3 = c & 0x7F;
4486 int i;
4487
4488#ifdef Py_UNICODE_WIDE
4489 if (c > 0xFFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004490 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004491 }
4492#endif
4493 if (c == 0)
4494 return 0;
4495 /* level 1*/
4496 i = map->level1[l1];
4497 if (i == 0xFF) {
4498 return -1;
4499 }
4500 /* level 2*/
4501 i = map->level23[16*i+l2];
4502 if (i == 0xFF) {
4503 return -1;
4504 }
4505 /* level 3 */
4506 i = map->level23[16*map->count2 + 128*i + l3];
4507 if (i == 0) {
4508 return -1;
4509 }
4510 return i;
4511}
4512
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004513/* Lookup the character ch in the mapping. If the character
4514 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004515 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004516static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004517{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004518 PyObject *w = PyInt_FromLong((long)c);
4519 PyObject *x;
4520
4521 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004522 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004523 x = PyObject_GetItem(mapping, w);
4524 Py_DECREF(w);
4525 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004526 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4527 /* No mapping found means: mapping is undefined. */
4528 PyErr_Clear();
4529 x = Py_None;
4530 Py_INCREF(x);
4531 return x;
4532 } else
4533 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004534 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004535 else if (x == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004536 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004537 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004538 long value = PyInt_AS_LONG(x);
4539 if (value < 0 || value > 255) {
4540 PyErr_SetString(PyExc_TypeError,
4541 "character mapping must be in range(256)");
4542 Py_DECREF(x);
4543 return NULL;
4544 }
4545 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004546 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004547 else if (PyString_Check(x))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004548 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004549 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004550 /* wrong return value */
4551 PyErr_SetString(PyExc_TypeError,
4552 "character mapping must return integer, None or str");
4553 Py_DECREF(x);
4554 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004555 }
4556}
4557
Martin v. Löwis3f767792006-06-04 19:36:28 +00004558static int
4559charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4560{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004561 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4562 /* exponentially overallocate to minimize reallocations */
4563 if (requiredsize < 2*outsize)
4564 requiredsize = 2*outsize;
4565 if (_PyString_Resize(outobj, requiredsize)) {
4566 return 0;
4567 }
4568 return 1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004569}
4570
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* an error occurred (lookup or resize failed) */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004574/* lookup the character, put the result in the output string and adjust
4575 various state variables. Reallocate the output string if not enough
4576 space is available. Return a new reference to the object that
4577 was put in the output buffer, or Py_None, if the mapping was undefined
4578 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004579 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004580static
Martin v. Löwis3f767792006-06-04 19:36:28 +00004581charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004582 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004583{
Martin v. Löwis3f767792006-06-04 19:36:28 +00004584 PyObject *rep;
4585 char *outstart;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004586 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004587
Christian Heimese93237d2007-12-19 02:37:44 +00004588 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004589 int res = encoding_map_lookup(c, mapping);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004590 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004591 if (res == -1)
4592 return enc_FAILED;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004593 if (outsize<requiredsize)
4594 if (!charmapencode_resize(outobj, outpos, requiredsize))
4595 return enc_EXCEPTION;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004596 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004597 outstart[(*outpos)++] = (char)res;
4598 return enc_SUCCESS;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004599 }
4600
4601 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004602 if (rep==NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004603 return enc_EXCEPTION;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004604 else if (rep==Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004605 Py_DECREF(rep);
4606 return enc_FAILED;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004607 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004608 if (PyInt_Check(rep)) {
4609 Py_ssize_t requiredsize = *outpos+1;
4610 if (outsize<requiredsize)
4611 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4612 Py_DECREF(rep);
4613 return enc_EXCEPTION;
4614 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004615 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004616 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004617 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004618 else {
4619 const char *repchars = PyString_AS_STRING(rep);
4620 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4621 Py_ssize_t requiredsize = *outpos+repsize;
4622 if (outsize<requiredsize)
4623 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4624 Py_DECREF(rep);
4625 return enc_EXCEPTION;
4626 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004627 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004628 memcpy(outstart + *outpos, repchars, repsize);
4629 *outpos += repsize;
4630 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004631 }
Georg Brandl9f167602006-06-04 21:46:16 +00004632 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004633 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004634}
4635
4636/* handle an error in PyUnicode_EncodeCharmap
4637 Return 0 on success, -1 on error */
4638static
4639int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004640 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004641 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004642 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004643 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004644{
4645 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004646 Py_ssize_t repsize;
4647 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004648 Py_UNICODE *uni2;
4649 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004650 Py_ssize_t collstartpos = *inpos;
4651 Py_ssize_t collendpos = *inpos+1;
4652 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004653 char *encoding = "charmap";
4654 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00004655 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004656
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004657 /* find all unencodable characters */
4658 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004659 PyObject *rep;
Christian Heimese93237d2007-12-19 02:37:44 +00004660 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004661 int res = encoding_map_lookup(p[collendpos], mapping);
4662 if (res != -1)
4663 break;
4664 ++collendpos;
4665 continue;
4666 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004667
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004668 rep = charmapencode_lookup(p[collendpos], mapping);
4669 if (rep==NULL)
4670 return -1;
4671 else if (rep!=Py_None) {
4672 Py_DECREF(rep);
4673 break;
4674 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004675 Py_DECREF(rep);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004676 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004677 }
4678 /* cache callback name lookup
4679 * (if not done yet, i.e. it's the first error) */
4680 if (*known_errorHandler==-1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004681 if ((errors==NULL) || (!strcmp(errors, "strict")))
4682 *known_errorHandler = 1;
4683 else if (!strcmp(errors, "replace"))
4684 *known_errorHandler = 2;
4685 else if (!strcmp(errors, "ignore"))
4686 *known_errorHandler = 3;
4687 else if (!strcmp(errors, "xmlcharrefreplace"))
4688 *known_errorHandler = 4;
4689 else
4690 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004691 }
4692 switch (*known_errorHandler) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004693 case 1: /* strict */
4694 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4695 return -1;
4696 case 2: /* replace */
4697 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004698 x = charmapencode_output('?', mapping, res, respos);
4699 if (x==enc_EXCEPTION) {
4700 return -1;
4701 }
4702 else if (x==enc_FAILED) {
4703 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4704 return -1;
4705 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004706 }
4707 /* fall through */
4708 case 3: /* ignore */
4709 *inpos = collendpos;
4710 break;
4711 case 4: /* xmlcharrefreplace */
Serhiy Storchakae822b032013-08-06 16:56:26 +03004712 /* generate replacement */
4713 for (collpos = collstartpos; collpos < collendpos;) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004714 char buffer[2+29+1+1];
4715 char *cp;
Serhiy Storchakae822b032013-08-06 16:56:26 +03004716 Py_UCS4 ch = p[collpos++];
4717#ifndef Py_UNICODE_WIDE
4718 if ((0xD800 <= ch && ch <= 0xDBFF) &&
4719 (collpos < collendpos) &&
4720 (0xDC00 <= p[collpos] && p[collpos] <= 0xDFFF)) {
4721 ch = ((((ch & 0x03FF) << 10) |
4722 ((Py_UCS4)p[collpos++] & 0x03FF)) + 0x10000);
4723 }
4724#endif
4725 sprintf(buffer, "&#%d;", (int)ch);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004726 for (cp = buffer; *cp; ++cp) {
4727 x = charmapencode_output(*cp, mapping, res, respos);
4728 if (x==enc_EXCEPTION)
4729 return -1;
4730 else if (x==enc_FAILED) {
4731 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4732 return -1;
4733 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004734 }
4735 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004736 *inpos = collendpos;
4737 break;
4738 default:
4739 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004740 encoding, reason, p, size, exceptionObject,
4741 collstartpos, collendpos, &newpos);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004742 if (repunicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004743 return -1;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004744 /* generate replacement */
4745 repsize = PyUnicode_GET_SIZE(repunicode);
4746 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004747 x = charmapencode_output(*uni2, mapping, res, respos);
4748 if (x==enc_EXCEPTION) {
4749 return -1;
4750 }
4751 else if (x==enc_FAILED) {
4752 Py_DECREF(repunicode);
4753 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4754 return -1;
4755 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004756 }
4757 *inpos = newpos;
4758 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004759 }
4760 return 0;
4761}
4762
Guido van Rossumd57fd912000-03-10 22:53:23 +00004763PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004764 Py_ssize_t size,
4765 PyObject *mapping,
4766 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004767{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004768 /* output object */
4769 PyObject *res = NULL;
4770 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004771 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004772 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004773 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004774 PyObject *errorHandler = NULL;
4775 PyObject *exc = NULL;
4776 /* the following variable is used for caching string comparisons
4777 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4778 * 3=ignore, 4=xmlcharrefreplace */
4779 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004780
4781 /* Default to Latin-1 */
4782 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004783 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004784
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004785 /* allocate enough for a simple encoding without
4786 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004787 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004788 if (res == NULL)
4789 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004790 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004791 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004792
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004793 while (inpos<size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004794 /* try to encode it */
4795 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4796 if (x==enc_EXCEPTION) /* error */
4797 goto onError;
4798 if (x==enc_FAILED) { /* unencodable character */
4799 if (charmap_encoding_error(p, size, &inpos, mapping,
4800 &exc,
4801 &known_errorHandler, &errorHandler, errors,
4802 &res, &respos)) {
4803 goto onError;
4804 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004805 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004806 else
4807 /* done with this character => adjust input position */
4808 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004809 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004810
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004811 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004812 if (respos<PyString_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004813 if (_PyString_Resize(&res, respos))
4814 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004815 }
4816 Py_XDECREF(exc);
4817 Py_XDECREF(errorHandler);
4818 return res;
4819
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004820 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004821 Py_XDECREF(res);
4822 Py_XDECREF(exc);
4823 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004824 return NULL;
4825}
4826
4827PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004828 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004829{
4830 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004831 PyErr_BadArgument();
4832 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004833 }
4834 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004835 PyUnicode_GET_SIZE(unicode),
4836 mapping,
4837 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004838}
4839
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004840/* create or adjust a UnicodeTranslateError */
4841static void make_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004842 const Py_UNICODE *unicode, Py_ssize_t size,
4843 Py_ssize_t startpos, Py_ssize_t endpos,
4844 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004845{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004846 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004847 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004848 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004849 }
4850 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004851 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4852 goto onError;
4853 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4854 goto onError;
4855 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4856 goto onError;
4857 return;
4858 onError:
Serhiy Storchaka98a97222014-02-09 13:14:04 +02004859 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004860 }
4861}
4862
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004863/* raises a UnicodeTranslateError */
4864static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004865 const Py_UNICODE *unicode, Py_ssize_t size,
4866 Py_ssize_t startpos, Py_ssize_t endpos,
4867 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004868{
4869 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004870 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004871 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004872 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004873}
4874
4875/* error handling callback helper:
4876 build arguments, call the callback and check the arguments,
4877 put the result into newpos and return the replacement string, which
4878 has to be freed by the caller */
4879static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004880 PyObject **errorHandler,
4881 const char *reason,
4882 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4883 Py_ssize_t startpos, Py_ssize_t endpos,
4884 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004885{
Martin v. Löwis412fb672006-04-13 06:34:32 +00004886 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004887
Martin v. Löwis412fb672006-04-13 06:34:32 +00004888 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004889 PyObject *restuple;
4890 PyObject *resunicode;
4891
4892 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004893 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004894 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004895 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004896 }
4897
4898 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004899 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004900 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004901 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004902
4903 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004904 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004905 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004906 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004907 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00004908 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004909 Py_DECREF(restuple);
4910 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004911 }
4912 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004913 &resunicode, &i_newpos)) {
4914 Py_DECREF(restuple);
4915 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004916 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004917 if (i_newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004918 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004919 else
4920 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004921 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004922 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4923 Py_DECREF(restuple);
4924 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004925 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004926 Py_INCREF(resunicode);
4927 Py_DECREF(restuple);
4928 return resunicode;
4929}
4930
4931/* Lookup the character ch in the mapping and put the result in result,
4932 which must be decrefed by the caller.
4933 Return 0 on success, -1 on error */
4934static
4935int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4936{
4937 PyObject *w = PyInt_FromLong((long)c);
4938 PyObject *x;
4939
4940 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004941 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004942 x = PyObject_GetItem(mapping, w);
4943 Py_DECREF(w);
4944 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004945 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4946 /* No mapping found means: use 1:1 mapping. */
4947 PyErr_Clear();
4948 *result = NULL;
4949 return 0;
4950 } else
4951 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004952 }
4953 else if (x == Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004954 *result = x;
4955 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004956 }
4957 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004958 long value = PyInt_AS_LONG(x);
4959 long max = PyUnicode_GetMax();
4960 if (value < 0 || value > max) {
4961 PyErr_Format(PyExc_TypeError,
4962 "character mapping must be in range(0x%lx)", max+1);
4963 Py_DECREF(x);
4964 return -1;
4965 }
4966 *result = x;
4967 return 0;
4968 }
4969 else if (PyUnicode_Check(x)) {
4970 *result = x;
4971 return 0;
4972 }
4973 else {
4974 /* wrong return value */
4975 PyErr_SetString(PyExc_TypeError,
4976 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004977 Py_DECREF(x);
4978 return -1;
4979 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004980}
4981/* ensure that *outobj is at least requiredsize characters long,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004982 if not reallocate and adjust various state variables.
4983 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004984static
Walter Dörwald4894c302003-10-24 14:25:28 +00004985int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004986 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004987{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004988 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004989 if (requiredsize > oldsize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004990 /* remember old output position */
4991 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4992 /* exponentially overallocate to minimize reallocations */
4993 if (requiredsize < 2 * oldsize)
4994 requiredsize = 2 * oldsize;
4995 if (PyUnicode_Resize(outobj, requiredsize) < 0)
4996 return -1;
4997 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004998 }
4999 return 0;
5000}
5001/* lookup the character, put the result in the output string and adjust
5002 various state variables. Return a new reference to the object that
5003 was put in the output buffer in *result, or Py_None, if the mapping was
5004 undefined (in which case no character was written).
5005 The called must decref result.
5006 Return 0 on success, -1 on error. */
5007static
Walter Dörwald4894c302003-10-24 14:25:28 +00005008int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005009 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5010 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005011{
Walter Dörwald4894c302003-10-24 14:25:28 +00005012 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005013 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005014 if (*res==NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005015 /* not found => default to 1:1 mapping */
5016 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005017 }
5018 else if (*res==Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005019 ;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005020 else if (PyInt_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005021 /* no overflow check, because we know that the space is enough */
5022 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005023 }
5024 else if (PyUnicode_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005025 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5026 if (repsize==1) {
5027 /* no overflow check, because we know that the space is enough */
5028 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5029 }
5030 else if (repsize!=0) {
5031 /* more than one character */
5032 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5033 (insize - (curinp-startinp)) +
5034 repsize - 1;
5035 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5036 return -1;
5037 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5038 *outp += repsize;
5039 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005040 }
5041 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005042 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005043 return 0;
5044}
5045
5046PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005047 Py_ssize_t size,
5048 PyObject *mapping,
5049 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005050{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005051 /* output object */
5052 PyObject *res = NULL;
5053 /* pointers to the beginning and end+1 of input */
5054 const Py_UNICODE *startp = p;
5055 const Py_UNICODE *endp = p + size;
5056 /* pointer into the output */
5057 Py_UNICODE *str;
5058 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005059 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005060 char *reason = "character maps to <undefined>";
5061 PyObject *errorHandler = NULL;
5062 PyObject *exc = NULL;
5063 /* the following variable is used for caching string comparisons
5064 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5065 * 3=ignore, 4=xmlcharrefreplace */
5066 int known_errorHandler = -1;
5067
Guido van Rossumd57fd912000-03-10 22:53:23 +00005068 if (mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005069 PyErr_BadArgument();
5070 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005071 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005072
5073 /* allocate enough for a simple 1:1 translation without
5074 replacements, if we need more, we'll resize */
5075 res = PyUnicode_FromUnicode(NULL, size);
5076 if (res == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005077 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005078 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005079 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005080 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005081
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005082 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005083 /* try to encode it */
5084 PyObject *x = NULL;
5085 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5086 Py_XDECREF(x);
5087 goto onError;
5088 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005089 Py_XDECREF(x);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005090 if (x!=Py_None) /* it worked => adjust input pointer */
5091 ++p;
5092 else { /* untranslatable character */
5093 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5094 Py_ssize_t repsize;
5095 Py_ssize_t newpos;
5096 Py_UNICODE *uni2;
5097 /* startpos for collecting untranslatable chars */
5098 const Py_UNICODE *collstart = p;
5099 const Py_UNICODE *collend = p+1;
5100 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005101
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005102 /* find all untranslatable characters */
5103 while (collend < endp) {
5104 if (charmaptranslate_lookup(*collend, mapping, &x))
5105 goto onError;
5106 Py_XDECREF(x);
5107 if (x!=Py_None)
5108 break;
5109 ++collend;
5110 }
5111 /* cache callback name lookup
5112 * (if not done yet, i.e. it's the first error) */
5113 if (known_errorHandler==-1) {
5114 if ((errors==NULL) || (!strcmp(errors, "strict")))
5115 known_errorHandler = 1;
5116 else if (!strcmp(errors, "replace"))
5117 known_errorHandler = 2;
5118 else if (!strcmp(errors, "ignore"))
5119 known_errorHandler = 3;
5120 else if (!strcmp(errors, "xmlcharrefreplace"))
5121 known_errorHandler = 4;
5122 else
5123 known_errorHandler = 0;
5124 }
5125 switch (known_errorHandler) {
5126 case 1: /* strict */
5127 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson857ce152009-01-31 16:29:18 +00005128 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005129 case 2: /* replace */
5130 /* No need to check for space, this is a 1:1 replacement */
5131 for (coll = collstart; coll<collend; ++coll)
5132 *str++ = '?';
5133 /* fall through */
5134 case 3: /* ignore */
5135 p = collend;
5136 break;
5137 case 4: /* xmlcharrefreplace */
5138 /* generate replacement (temporarily (mis)uses p) */
Serhiy Storchakae822b032013-08-06 16:56:26 +03005139 for (p = collstart; p < collend;) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005140 char buffer[2+29+1+1];
5141 char *cp;
Serhiy Storchakae822b032013-08-06 16:56:26 +03005142 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
5143 sprintf(buffer, "&#%d;", (int)ch);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005144 if (charmaptranslate_makespace(&res, &str,
5145 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5146 goto onError;
5147 for (cp = buffer; *cp; ++cp)
5148 *str++ = *cp;
5149 }
5150 p = collend;
5151 break;
5152 default:
5153 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5154 reason, startp, size, &exc,
5155 collstart-startp, collend-startp, &newpos);
5156 if (repunicode == NULL)
5157 goto onError;
5158 /* generate replacement */
5159 repsize = PyUnicode_GET_SIZE(repunicode);
5160 if (charmaptranslate_makespace(&res, &str,
5161 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5162 Py_DECREF(repunicode);
5163 goto onError;
5164 }
5165 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5166 *str++ = *uni2;
5167 p = startp + newpos;
5168 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00005169 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005170 }
5171 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005172 /* Resize if we allocated to much */
5173 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005174 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005175 if (PyUnicode_Resize(&res, respos) < 0)
5176 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005177 }
5178 Py_XDECREF(exc);
5179 Py_XDECREF(errorHandler);
5180 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005181
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005182 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005183 Py_XDECREF(res);
5184 Py_XDECREF(exc);
5185 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005186 return NULL;
5187}
5188
5189PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005190 PyObject *mapping,
5191 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005192{
5193 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005194
Guido van Rossumd57fd912000-03-10 22:53:23 +00005195 str = PyUnicode_FromObject(str);
5196 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005197 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005198 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005199 PyUnicode_GET_SIZE(str),
5200 mapping,
5201 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005202 Py_DECREF(str);
5203 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005204
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005205 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005206 Py_XDECREF(str);
5207 return NULL;
5208}
Tim Petersced69f82003-09-16 20:30:58 +00005209
Guido van Rossum9e896b32000-04-05 20:11:21 +00005210/* --- Decimal Encoder ---------------------------------------------------- */
5211
5212int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005213 Py_ssize_t length,
5214 char *output,
5215 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005216{
5217 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005218 PyObject *errorHandler = NULL;
5219 PyObject *exc = NULL;
5220 const char *encoding = "decimal";
5221 const char *reason = "invalid decimal Unicode string";
5222 /* the following variable is used for caching string comparisons
5223 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5224 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005225
5226 if (output == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005227 PyErr_BadArgument();
5228 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005229 }
5230
5231 p = s;
5232 end = s + length;
5233 while (p < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005234 register Py_UNICODE ch = *p;
5235 int decimal;
5236 PyObject *repunicode;
5237 Py_ssize_t repsize;
5238 Py_ssize_t newpos;
5239 Py_UNICODE *uni2;
5240 Py_UNICODE *collstart;
5241 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005242
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005243 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005244 *output++ = ' ';
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005245 ++p;
5246 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005247 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005248 decimal = Py_UNICODE_TODECIMAL(ch);
5249 if (decimal >= 0) {
5250 *output++ = '0' + decimal;
5251 ++p;
5252 continue;
5253 }
5254 if (0 < ch && ch < 256) {
5255 *output++ = (char)ch;
5256 ++p;
5257 continue;
5258 }
5259 /* All other characters are considered unencodable */
5260 collstart = p;
Victor Stinner975134e2011-11-22 01:54:19 +01005261 for (collend = p+1; collend < end; collend++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005262 if ((0 < *collend && *collend < 256) ||
Victor Stinner975134e2011-11-22 01:54:19 +01005263 Py_UNICODE_ISSPACE(*collend) ||
5264 0 <= Py_UNICODE_TODECIMAL(*collend))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005265 break;
5266 }
5267 /* cache callback name lookup
5268 * (if not done yet, i.e. it's the first error) */
5269 if (known_errorHandler==-1) {
5270 if ((errors==NULL) || (!strcmp(errors, "strict")))
5271 known_errorHandler = 1;
5272 else if (!strcmp(errors, "replace"))
5273 known_errorHandler = 2;
5274 else if (!strcmp(errors, "ignore"))
5275 known_errorHandler = 3;
5276 else if (!strcmp(errors, "xmlcharrefreplace"))
5277 known_errorHandler = 4;
5278 else
5279 known_errorHandler = 0;
5280 }
5281 switch (known_errorHandler) {
5282 case 1: /* strict */
5283 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5284 goto onError;
5285 case 2: /* replace */
5286 for (p = collstart; p < collend; ++p)
5287 *output++ = '?';
5288 /* fall through */
5289 case 3: /* ignore */
5290 p = collend;
5291 break;
5292 case 4: /* xmlcharrefreplace */
5293 /* generate replacement (temporarily (mis)uses p) */
Serhiy Storchakae822b032013-08-06 16:56:26 +03005294 for (p = collstart; p < collend;) {
5295 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
5296 output += sprintf(output, "&#%d;", ch);
5297 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005298 p = collend;
5299 break;
5300 default:
5301 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5302 encoding, reason, s, length, &exc,
5303 collstart-s, collend-s, &newpos);
5304 if (repunicode == NULL)
5305 goto onError;
5306 /* generate replacement */
5307 repsize = PyUnicode_GET_SIZE(repunicode);
5308 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5309 Py_UNICODE ch = *uni2;
5310 if (Py_UNICODE_ISSPACE(ch))
5311 *output++ = ' ';
5312 else {
5313 decimal = Py_UNICODE_TODECIMAL(ch);
5314 if (decimal >= 0)
5315 *output++ = '0' + decimal;
5316 else if (0 < ch && ch < 256)
5317 *output++ = (char)ch;
5318 else {
5319 Py_DECREF(repunicode);
5320 raise_encode_exception(&exc, encoding,
5321 s, length, collstart-s, collend-s, reason);
5322 goto onError;
5323 }
5324 }
5325 }
5326 p = s + newpos;
5327 Py_DECREF(repunicode);
5328 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005329 }
5330 /* 0-terminate the output string */
5331 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005332 Py_XDECREF(exc);
5333 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005334 return 0;
5335
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005336 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005337 Py_XDECREF(exc);
5338 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005339 return -1;
5340}
5341
Guido van Rossumd57fd912000-03-10 22:53:23 +00005342/* --- Helpers ------------------------------------------------------------ */
5343
Eric Smitha9f7d622008-02-17 19:46:49 +00005344#include "stringlib/unicodedefs.h"
Fredrik Lundha50d2012006-05-26 17:04:58 +00005345#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005346
5347#include "stringlib/count.h"
5348#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005349#include "stringlib/partition.h"
Antoine Pitrou64672132010-01-13 07:55:48 +00005350#include "stringlib/split.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005351
/* Helper macro to fix up start/end slice values against a sequence of
   length `len`.  `start` and `end` must be modifiable lvalues and are
   updated in place:
     - an `end` beyond `len` is clamped to `len`;
     - a negative `end` or `start` is taken relative to `len` and then
       clamped to 0 if it is still negative.
   `start` is not clamped from above, so it may end up greater than `end`.
   Wrapped in do { } while (0) so that it behaves as a single statement
   (safe inside an unbraced if/else); follow it with a semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Fredrik Lundhc8162812006-05-26 19:33:03 +00005366
Martin v. Löwis18e16552006-02-15 17:27:45 +00005367Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005368 PyObject *substr,
5369 Py_ssize_t start,
5370 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005371{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005372 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005373 PyUnicodeObject* str_obj;
5374 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005375
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005376 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5377 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005378 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005379 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5380 if (!sub_obj) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005381 Py_DECREF(str_obj);
5382 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005383 }
Tim Petersced69f82003-09-16 20:30:58 +00005384
Antoine Pitrou64672132010-01-13 07:55:48 +00005385 ADJUST_INDICES(start, end, str_obj->length);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005386 result = stringlib_count(
Antoine Pitrou64672132010-01-13 07:55:48 +00005387 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
5388 PY_SSIZE_T_MAX
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005389 );
5390
5391 Py_DECREF(sub_obj);
5392 Py_DECREF(str_obj);
5393
Guido van Rossumd57fd912000-03-10 22:53:23 +00005394 return result;
5395}
5396
Martin v. Löwis18e16552006-02-15 17:27:45 +00005397Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005398 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005399 Py_ssize_t start,
5400 Py_ssize_t end,
5401 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005402{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005403 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005404
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005405 str = PyUnicode_FromObject(str);
5406 if (!str)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005407 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005408 sub = PyUnicode_FromObject(sub);
5409 if (!sub) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005410 Py_DECREF(str);
5411 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005412 }
Tim Petersced69f82003-09-16 20:30:58 +00005413
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005414 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005415 result = stringlib_find_slice(
5416 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5417 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5418 start, end
5419 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005420 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005421 result = stringlib_rfind_slice(
5422 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5423 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5424 start, end
5425 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005426
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005427 Py_DECREF(str);
5428 Py_DECREF(sub);
5429
Guido van Rossumd57fd912000-03-10 22:53:23 +00005430 return result;
5431}
5432
Tim Petersced69f82003-09-16 20:30:58 +00005433static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005434int tailmatch(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005435 PyUnicodeObject *substring,
5436 Py_ssize_t start,
5437 Py_ssize_t end,
5438 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005439{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005440 if (substring->length == 0)
5441 return 1;
5442
Antoine Pitrou64672132010-01-13 07:55:48 +00005443 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005444 end -= substring->length;
5445 if (end < start)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005446 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005447
5448 if (direction > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005449 if (Py_UNICODE_MATCH(self, end, substring))
5450 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005451 } else {
5452 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005453 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005454 }
5455
5456 return 0;
5457}
5458
Martin v. Löwis18e16552006-02-15 17:27:45 +00005459Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005460 PyObject *substr,
5461 Py_ssize_t start,
5462 Py_ssize_t end,
5463 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005464{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005465 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005466
Guido van Rossumd57fd912000-03-10 22:53:23 +00005467 str = PyUnicode_FromObject(str);
5468 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005469 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005470 substr = PyUnicode_FromObject(substr);
5471 if (substr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005472 Py_DECREF(str);
5473 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005474 }
Tim Petersced69f82003-09-16 20:30:58 +00005475
Guido van Rossumd57fd912000-03-10 22:53:23 +00005476 result = tailmatch((PyUnicodeObject *)str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005477 (PyUnicodeObject *)substr,
5478 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005479 Py_DECREF(str);
5480 Py_DECREF(substr);
5481 return result;
5482}
5483
Guido van Rossumd57fd912000-03-10 22:53:23 +00005484/* Apply fixfct filter to the Unicode object self and return a
5485 reference to the modified object */
5486
Tim Petersced69f82003-09-16 20:30:58 +00005487static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005488PyObject *fixup(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005489 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005490{
5491
5492 PyUnicodeObject *u;
5493
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005494 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005495 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005496 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005497
5498 Py_UNICODE_COPY(u->str, self->str, self->length);
5499
Tim Peters7a29bd52001-09-12 03:03:31 +00005500 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005501 /* fixfct should return TRUE if it modified the buffer. If
5502 FALSE, return a reference to the original buffer instead
5503 (to save space, not time) */
5504 Py_INCREF(self);
5505 Py_DECREF(u);
5506 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005507 }
5508 return (PyObject*) u;
5509}
5510
Tim Petersced69f82003-09-16 20:30:58 +00005511static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005512int fixupper(PyUnicodeObject *self)
5513{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005514 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005515 Py_UNICODE *s = self->str;
5516 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005517
Guido van Rossumd57fd912000-03-10 22:53:23 +00005518 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005519 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005520
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005521 ch = Py_UNICODE_TOUPPER(*s);
5522 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005523 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005524 *s = ch;
5525 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005526 s++;
5527 }
5528
5529 return status;
5530}
5531
Tim Petersced69f82003-09-16 20:30:58 +00005532static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005533int fixlower(PyUnicodeObject *self)
5534{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005535 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005536 Py_UNICODE *s = self->str;
5537 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005538
Guido van Rossumd57fd912000-03-10 22:53:23 +00005539 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005540 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005541
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005542 ch = Py_UNICODE_TOLOWER(*s);
5543 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005544 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005545 *s = ch;
5546 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005547 s++;
5548 }
5549
5550 return status;
5551}
5552
Tim Petersced69f82003-09-16 20:30:58 +00005553static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005554int fixswapcase(PyUnicodeObject *self)
5555{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005556 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005557 Py_UNICODE *s = self->str;
5558 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005559
Guido van Rossumd57fd912000-03-10 22:53:23 +00005560 while (len-- > 0) {
5561 if (Py_UNICODE_ISUPPER(*s)) {
5562 *s = Py_UNICODE_TOLOWER(*s);
5563 status = 1;
5564 } else if (Py_UNICODE_ISLOWER(*s)) {
5565 *s = Py_UNICODE_TOUPPER(*s);
5566 status = 1;
5567 }
5568 s++;
5569 }
5570
5571 return status;
5572}
5573
Tim Petersced69f82003-09-16 20:30:58 +00005574static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005575int fixcapitalize(PyUnicodeObject *self)
5576{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005577 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005578 Py_UNICODE *s = self->str;
5579 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005580
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005581 if (len == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005582 return 0;
Ezio Melotti15d6b652011-08-15 09:22:24 +03005583 if (!Py_UNICODE_ISUPPER(*s)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005584 *s = Py_UNICODE_TOUPPER(*s);
5585 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005586 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005587 s++;
5588 while (--len > 0) {
Ezio Melotti15d6b652011-08-15 09:22:24 +03005589 if (!Py_UNICODE_ISLOWER(*s)) {
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005590 *s = Py_UNICODE_TOLOWER(*s);
5591 status = 1;
5592 }
5593 s++;
5594 }
5595 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005596}
5597
5598static
5599int fixtitle(PyUnicodeObject *self)
5600{
5601 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5602 register Py_UNICODE *e;
5603 int previous_is_cased;
5604
5605 /* Shortcut for single character strings */
5606 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005607 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5608 if (*p != ch) {
5609 *p = ch;
5610 return 1;
5611 }
5612 else
5613 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005614 }
Tim Petersced69f82003-09-16 20:30:58 +00005615
Guido van Rossumd57fd912000-03-10 22:53:23 +00005616 e = p + PyUnicode_GET_SIZE(self);
5617 previous_is_cased = 0;
5618 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005619 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005620
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005621 if (previous_is_cased)
5622 *p = Py_UNICODE_TOLOWER(ch);
5623 else
5624 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005625
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005626 if (Py_UNICODE_ISLOWER(ch) ||
5627 Py_UNICODE_ISUPPER(ch) ||
5628 Py_UNICODE_ISTITLE(ch))
5629 previous_is_cased = 1;
5630 else
5631 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005632 }
5633 return 1;
5634}
5635
Tim Peters8ce9f162004-08-27 01:49:32 +00005636PyObject *
5637PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005638{
Tim Peters8ce9f162004-08-27 01:49:32 +00005639 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005640 const Py_UNICODE blank = ' ';
5641 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00005642 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005643 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00005644 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5645 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005646 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5647 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005648 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005649 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005650 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005651
Benjamin Peterson7a91bf82014-02-15 13:02:52 -05005652 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00005653 if (fseq == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005654 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005655 }
5656
Tim Peters91879ab2004-08-27 22:35:44 +00005657 /* Grrrr. A codec may be invoked to convert str objects to
5658 * Unicode, and so it's possible to call back into Python code
5659 * during PyUnicode_FromObject(), and so it's possible for a sick
5660 * codec to change the size of fseq (if seq is a list). Therefore
5661 * we have to keep refetching the size -- can't assume seqlen
5662 * is invariant.
5663 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005664 seqlen = PySequence_Fast_GET_SIZE(fseq);
5665 /* If empty sequence, return u"". */
5666 if (seqlen == 0) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005667 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5668 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00005669 }
5670 /* If singleton sequence with an exact Unicode, return that. */
5671 if (seqlen == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005672 item = PySequence_Fast_GET_ITEM(fseq, 0);
5673 if (PyUnicode_CheckExact(item)) {
5674 Py_INCREF(item);
5675 res = (PyUnicodeObject *)item;
5676 goto Done;
5677 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005678 }
5679
Tim Peters05eba1f2004-08-27 21:32:02 +00005680 /* At least two items to join, or one that isn't exact Unicode. */
5681 if (seqlen > 1) {
5682 /* Set up sep and seplen -- they're needed. */
Benjamin Peterson857ce152009-01-31 16:29:18 +00005683 if (separator == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005684 sep = &blank;
5685 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005686 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005687 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005688 internal_separator = PyUnicode_FromObject(separator);
5689 if (internal_separator == NULL)
5690 goto onError;
5691 sep = PyUnicode_AS_UNICODE(internal_separator);
5692 seplen = PyUnicode_GET_SIZE(internal_separator);
5693 /* In case PyUnicode_FromObject() mutated seq. */
5694 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005695 }
5696 }
5697
5698 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005699 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005700 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005701 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005702 res_p = PyUnicode_AS_UNICODE(res);
5703 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005704
Tim Peters05eba1f2004-08-27 21:32:02 +00005705 for (i = 0; i < seqlen; ++i) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005706 Py_ssize_t itemlen;
5707 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005708
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005709 item = PySequence_Fast_GET_ITEM(fseq, i);
5710 /* Convert item to Unicode. */
5711 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5712 PyErr_Format(PyExc_TypeError,
5713 "sequence item %zd: expected string or Unicode,"
5714 " %.80s found",
5715 i, Py_TYPE(item)->tp_name);
5716 goto onError;
5717 }
5718 item = PyUnicode_FromObject(item);
5719 if (item == NULL)
5720 goto onError;
5721 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005722
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005723 /* In case PyUnicode_FromObject() mutated seq. */
5724 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters91879ab2004-08-27 22:35:44 +00005725
Tim Peters8ce9f162004-08-27 01:49:32 +00005726 /* Make sure we have enough space for the separator and the item. */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005727 itemlen = PyUnicode_GET_SIZE(item);
5728 new_res_used = res_used + itemlen;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005729 if (new_res_used < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005730 goto Overflow;
5731 if (i < seqlen - 1) {
5732 new_res_used += seplen;
5733 if (new_res_used < 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00005734 goto Overflow;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005735 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005736 if (new_res_used > res_alloc) {
5737 /* double allocated size until it's big enough */
5738 do {
5739 res_alloc += res_alloc;
5740 if (res_alloc <= 0)
5741 goto Overflow;
5742 } while (new_res_used > res_alloc);
5743 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5744 Py_DECREF(item);
5745 goto onError;
5746 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005747 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005748 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005749
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005750 /* Copy item, and maybe the separator. */
5751 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5752 res_p += itemlen;
5753 if (i < seqlen - 1) {
5754 Py_UNICODE_COPY(res_p, sep, seplen);
5755 res_p += seplen;
5756 }
5757 Py_DECREF(item);
5758 res_used = new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005759 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005760
Tim Peters05eba1f2004-08-27 21:32:02 +00005761 /* Shrink res to match the used area; this probably can't fail,
5762 * but it's cheap to check.
5763 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005764 if (_PyUnicode_Resize(&res, res_used) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005765 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005766
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005767 Done:
Tim Peters8ce9f162004-08-27 01:49:32 +00005768 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005769 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005770 return (PyObject *)res;
5771
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005772 Overflow:
Tim Peters8ce9f162004-08-27 01:49:32 +00005773 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00005774 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005775 Py_DECREF(item);
5776 /* fall through */
5777
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005778 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005779 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005780 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005781 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005782 return NULL;
5783}
5784
Tim Petersced69f82003-09-16 20:30:58 +00005785static
5786PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005787 Py_ssize_t left,
5788 Py_ssize_t right,
5789 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005790{
5791 PyUnicodeObject *u;
5792
5793 if (left < 0)
5794 left = 0;
5795 if (right < 0)
5796 right = 0;
5797
Tim Peters7a29bd52001-09-12 03:03:31 +00005798 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005799 Py_INCREF(self);
5800 return self;
5801 }
5802
Neal Norwitze7d8be82008-07-31 17:17:14 +00005803 if (left > PY_SSIZE_T_MAX - self->length ||
5804 right > PY_SSIZE_T_MAX - (left + self->length)) {
5805 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5806 return NULL;
5807 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005808 u = _PyUnicode_New(left + self->length + right);
5809 if (u) {
5810 if (left)
5811 Py_UNICODE_FILL(u->str, fill, left);
5812 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5813 if (right)
5814 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5815 }
5816
5817 return u;
5818}
5819
Antoine Pitrou64672132010-01-13 07:55:48 +00005820PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005821{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005822 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005823
5824 string = PyUnicode_FromObject(string);
5825 if (string == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005826 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005827
Antoine Pitrou64672132010-01-13 07:55:48 +00005828 list = stringlib_splitlines(
5829 (PyObject*) string, PyUnicode_AS_UNICODE(string),
5830 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005831
5832 Py_DECREF(string);
5833 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005834}
5835
Tim Petersced69f82003-09-16 20:30:58 +00005836static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005837PyObject *split(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005838 PyUnicodeObject *substring,
5839 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005840{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005841 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005842 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005843
Guido van Rossumd57fd912000-03-10 22:53:23 +00005844 if (substring == NULL)
Antoine Pitrou64672132010-01-13 07:55:48 +00005845 return stringlib_split_whitespace(
5846 (PyObject*) self, self->str, self->length, maxcount
5847 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005848
Antoine Pitrou64672132010-01-13 07:55:48 +00005849 return stringlib_split(
5850 (PyObject*) self, self->str, self->length,
5851 substring->str, substring->length,
5852 maxcount
5853 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005854}
5855
Tim Petersced69f82003-09-16 20:30:58 +00005856static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005857PyObject *rsplit(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005858 PyUnicodeObject *substring,
5859 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005860{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005861 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005862 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005863
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005864 if (substring == NULL)
Antoine Pitrou64672132010-01-13 07:55:48 +00005865 return stringlib_rsplit_whitespace(
5866 (PyObject*) self, self->str, self->length, maxcount
5867 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005868
Antoine Pitrou64672132010-01-13 07:55:48 +00005869 return stringlib_rsplit(
5870 (PyObject*) self, self->str, self->length,
5871 substring->str, substring->length,
5872 maxcount
5873 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005874}
5875
5876static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005877PyObject *replace(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005878 PyUnicodeObject *str1,
5879 PyUnicodeObject *str2,
5880 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005881{
5882 PyUnicodeObject *u;
5883
5884 if (maxcount < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005885 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrou64672132010-01-13 07:55:48 +00005886 else if (maxcount == 0 || self->length == 0)
5887 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005888
Fredrik Lundh347ee272006-05-24 16:35:18 +00005889 if (str1->length == str2->length) {
Antoine Pitrou5c767c22010-01-13 08:55:20 +00005890 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005891 /* same length */
Antoine Pitrou64672132010-01-13 07:55:48 +00005892 if (str1->length == 0)
5893 goto nothing;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005894 if (str1->length == 1) {
5895 /* replace characters */
5896 Py_UNICODE u1, u2;
5897 if (!findchar(self->str, self->length, str1->str[0]))
5898 goto nothing;
5899 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5900 if (!u)
5901 return NULL;
5902 Py_UNICODE_COPY(u->str, self->str, self->length);
5903 u1 = str1->str[0];
5904 u2 = str2->str[0];
5905 for (i = 0; i < u->length; i++)
5906 if (u->str[i] == u1) {
5907 if (--maxcount < 0)
5908 break;
5909 u->str[i] = u2;
5910 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005911 } else {
Antoine Pitrou64672132010-01-13 07:55:48 +00005912 i = stringlib_find(
5913 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00005914 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005915 if (i < 0)
5916 goto nothing;
5917 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5918 if (!u)
5919 return NULL;
5920 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrou64672132010-01-13 07:55:48 +00005921
5922 /* change everything in-place, starting with this one */
5923 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5924 i += str1->length;
5925
5926 while ( --maxcount > 0) {
5927 i = stringlib_find(self->str+i, self->length-i,
5928 str1->str, str1->length,
5929 i);
5930 if (i == -1)
5931 break;
5932 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5933 i += str1->length;
5934 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005935 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005936 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005937
Brett Cannona7f13ee2010-05-04 01:16:51 +00005938 Py_ssize_t n, i, j;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005939 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005940 Py_UNICODE *p;
5941
5942 /* replace strings */
Antoine Pitrou64672132010-01-13 07:55:48 +00005943 n = stringlib_count(self->str, self->length, str1->str, str1->length,
5944 maxcount);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005945 if (n == 0)
5946 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005947 /* new_size = self->length + n * (str2->length - str1->length)); */
5948 delta = (str2->length - str1->length);
5949 if (delta == 0) {
5950 new_size = self->length;
5951 } else {
5952 product = n * (str2->length - str1->length);
5953 if ((product / (str2->length - str1->length)) != n) {
5954 PyErr_SetString(PyExc_OverflowError,
5955 "replace string is too long");
5956 return NULL;
5957 }
5958 new_size = self->length + product;
5959 if (new_size < 0) {
5960 PyErr_SetString(PyExc_OverflowError,
5961 "replace string is too long");
5962 return NULL;
5963 }
5964 }
5965 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005966 if (!u)
5967 return NULL;
5968 i = 0;
5969 p = u->str;
5970 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005971 while (n-- > 0) {
5972 /* look for next match */
Antoine Pitrou64672132010-01-13 07:55:48 +00005973 j = stringlib_find(self->str+i, self->length-i,
5974 str1->str, str1->length,
5975 i);
5976 if (j == -1)
5977 break;
5978 else if (j > i) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005979 /* copy unchanged part [i:j] */
5980 Py_UNICODE_COPY(p, self->str+i, j-i);
5981 p += j - i;
5982 }
5983 /* copy substitution string */
5984 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005985 Py_UNICODE_COPY(p, str2->str, str2->length);
5986 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005987 }
5988 i = j + str1->length;
5989 }
5990 if (i < self->length)
5991 /* copy tail [i:] */
5992 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005993 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005994 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00005995 while (n > 0) {
5996 Py_UNICODE_COPY(p, str2->str, str2->length);
5997 p += str2->length;
5998 if (--n <= 0)
5999 break;
6000 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006001 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00006002 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006003 }
6004 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006005 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00006006
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006007 nothing:
Fredrik Lundh347ee272006-05-24 16:35:18 +00006008 /* nothing to replace; return original string (when possible) */
6009 if (PyUnicode_CheckExact(self)) {
6010 Py_INCREF(self);
6011 return (PyObject *) self;
6012 }
6013 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006014}
6015
6016/* --- Unicode Object Methods --------------------------------------------- */
6017
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006018PyDoc_STRVAR(title__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006019 "S.title() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020\n\
6021Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006022characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023
6024static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006025unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006026{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006027 return fixup(self, fixtitle);
6028}
6029
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006030PyDoc_STRVAR(capitalize__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006031 "S.capitalize() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006032\n\
6033Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumaran5261b102010-07-05 12:04:07 +00006034have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006035
6036static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006037unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006038{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006039 return fixup(self, fixcapitalize);
6040}
6041
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled code: split self on whitespace, run fixup(..., fixcapitalize)
   on every word, and join the words again via PyUnicode_Join(NULL, ...).
   Returns NULL if splitting or any capitalization step fails. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list slot */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                            fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
6079
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006080/* Argument converter. Coerces to a single unicode character */
6081
6082static int
6083convert_uc(PyObject *obj, void *addr)
6084{
Benjamin Peterson857ce152009-01-31 16:29:18 +00006085 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6086 PyObject *uniobj;
6087 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006088
Benjamin Peterson857ce152009-01-31 16:29:18 +00006089 uniobj = PyUnicode_FromObject(obj);
6090 if (uniobj == NULL) {
6091 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006092 "The fill character cannot be converted to Unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00006093 return 0;
6094 }
6095 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6096 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006097 "The fill character must be exactly one character long");
Benjamin Peterson857ce152009-01-31 16:29:18 +00006098 Py_DECREF(uniobj);
6099 return 0;
6100 }
6101 unistr = PyUnicode_AS_UNICODE(uniobj);
6102 *fillcharloc = unistr[0];
6103 Py_DECREF(uniobj);
6104 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006105}
6106
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006107PyDoc_STRVAR(center__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006108 "S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006109\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006110Return S centered in a Unicode string of length width. Padding is\n\
6111done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006112
6113static PyObject *
6114unicode_center(PyUnicodeObject *self, PyObject *args)
6115{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006116 Py_ssize_t marg, left;
6117 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006118 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006119
Thomas Woutersde017742006-02-16 19:34:37 +00006120 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006121 return NULL;
6122
Tim Peters7a29bd52001-09-12 03:03:31 +00006123 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006124 Py_INCREF(self);
6125 return (PyObject*) self;
6126 }
6127
6128 marg = width - self->length;
6129 left = marg / 2 + (marg & width & 1);
6130
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006131 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006132}
6133
Marc-André Lemburge5034372000-08-08 08:04:29 +00006134#if 0
6135
6136/* This code should go into some future Unicode collation support
6137 module. The basic comparison should compare ordinals on a naive
Georg Brandl18187e22009-06-06 18:21:58 +00006138 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006139
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006140/* speedy UTF-16 code point order comparison */
6141/* gleaned from: */
6142/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6143
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006144static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006145{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006146 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006147 0, 0, 0, 0, 0, 0, 0, 0,
6148 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006149 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006150};
6151
Guido van Rossumd57fd912000-03-10 22:53:23 +00006152static int
6153unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6154{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006155 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006156
Guido van Rossumd57fd912000-03-10 22:53:23 +00006157 Py_UNICODE *s1 = str1->str;
6158 Py_UNICODE *s2 = str2->str;
6159
6160 len1 = str1->length;
6161 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006162
Guido van Rossumd57fd912000-03-10 22:53:23 +00006163 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006164 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006165
6166 c1 = *s1++;
6167 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006168
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006169 if (c1 > (1<<11) * 26)
6170 c1 += utf16Fixup[c1>>11];
6171 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006172 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006173 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006174
6175 if (c1 != c2)
6176 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006177
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006178 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006179 }
6180
6181 return (len1 < len2) ? -1 : (len1 != len2);
6182}
6183
Marc-André Lemburge5034372000-08-08 08:04:29 +00006184#else
6185
6186static int
6187unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6188{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006189 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006190
6191 Py_UNICODE *s1 = str1->str;
6192 Py_UNICODE *s2 = str2->str;
6193
6194 len1 = str1->length;
6195 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006196
Marc-André Lemburge5034372000-08-08 08:04:29 +00006197 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006198 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006199
Fredrik Lundh45714e92001-06-26 16:39:36 +00006200 c1 = *s1++;
6201 c2 = *s2++;
6202
6203 if (c1 != c2)
6204 return (c1 < c2) ? -1 : 1;
6205
Marc-André Lemburge5034372000-08-08 08:04:29 +00006206 len1--; len2--;
6207 }
6208
6209 return (len1 < len2) ? -1 : (len1 != len2);
6210}
6211
6212#endif
6213
Guido van Rossumd57fd912000-03-10 22:53:23 +00006214int PyUnicode_Compare(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006215 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006216{
6217 PyUnicodeObject *u = NULL, *v = NULL;
6218 int result;
6219
6220 /* Coerce the two arguments */
6221 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6222 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006223 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006224 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6225 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006226 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006227
Thomas Wouters7e474022000-07-16 12:04:32 +00006228 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006229 if (v == u) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006230 Py_DECREF(u);
6231 Py_DECREF(v);
6232 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006233 }
6234
6235 result = unicode_compare(u, v);
6236
6237 Py_DECREF(u);
6238 Py_DECREF(v);
6239 return result;
6240
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006241 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006242 Py_XDECREF(u);
6243 Py_XDECREF(v);
6244 return -1;
6245}
6246
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006247PyObject *PyUnicode_RichCompare(PyObject *left,
6248 PyObject *right,
6249 int op)
6250{
6251 int result;
6252
6253 result = PyUnicode_Compare(left, right);
6254 if (result == -1 && PyErr_Occurred())
6255 goto onError;
6256
6257 /* Convert the return value to a Boolean */
6258 switch (op) {
6259 case Py_EQ:
6260 result = (result == 0);
6261 break;
6262 case Py_NE:
6263 result = (result != 0);
6264 break;
6265 case Py_LE:
6266 result = (result <= 0);
6267 break;
6268 case Py_GE:
6269 result = (result >= 0);
6270 break;
6271 case Py_LT:
6272 result = (result == -1);
6273 break;
6274 case Py_GT:
6275 result = (result == 1);
6276 break;
6277 }
6278 return PyBool_FromLong(result);
6279
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006280 onError:
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006281
6282 /* Standard case
6283
6284 Type errors mean that PyUnicode_FromObject() could not convert
6285 one of the arguments (usually the right hand side) to Unicode,
6286 ie. we can't handle the comparison request. However, it is
6287 possible that the other object knows a comparison method, which
6288 is why we return Py_NotImplemented to give the other object a
6289 chance.
6290
6291 */
6292 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6293 PyErr_Clear();
6294 Py_INCREF(Py_NotImplemented);
6295 return Py_NotImplemented;
6296 }
6297 if (op != Py_EQ && op != Py_NE)
6298 return NULL;
6299
6300 /* Equality comparison.
6301
6302 This is a special case: we silence any PyExc_UnicodeDecodeError
6303 and instead turn it into a PyErr_UnicodeWarning.
6304
6305 */
6306 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6307 return NULL;
6308 PyErr_Clear();
Benjamin Peterson857ce152009-01-31 16:29:18 +00006309 if (PyErr_Warn(PyExc_UnicodeWarning,
6310 (op == Py_EQ) ?
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006311 "Unicode equal comparison "
6312 "failed to convert both arguments to Unicode - "
6313 "interpreting them as being unequal" :
6314 "Unicode unequal comparison "
6315 "failed to convert both arguments to Unicode - "
6316 "interpreting them as being unequal"
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006317 ) < 0)
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006318 return NULL;
6319 result = (op == Py_NE);
6320 return PyBool_FromLong(result);
6321}
6322
Guido van Rossum403d68b2000-03-13 15:55:09 +00006323int PyUnicode_Contains(PyObject *container,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006324 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006325{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006326 PyObject *str, *sub;
6327 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006328
6329 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006330 sub = PyUnicode_FromObject(element);
6331 if (!sub) {
Fredrik Lundh833bf942006-05-23 10:12:21 +00006332 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006333 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00006334
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006335 str = PyUnicode_FromObject(container);
6336 if (!str) {
6337 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00006338 return -1;
6339 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00006340
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006341 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00006342
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006343 Py_DECREF(str);
6344 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00006345
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006346 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006347}
6348
Guido van Rossumd57fd912000-03-10 22:53:23 +00006349/* Concat to string or Unicode object giving a new Unicode object. */
6350
6351PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006352 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006353{
6354 PyUnicodeObject *u = NULL, *v = NULL, *w;
6355
6356 /* Coerce the two arguments */
6357 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6358 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006359 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006360 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6361 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006362 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006363
6364 /* Shortcuts */
6365 if (v == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006366 Py_DECREF(v);
6367 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006368 }
6369 if (u == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006370 Py_DECREF(u);
6371 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006372 }
6373
6374 /* Concat the two Unicode strings */
6375 w = _PyUnicode_New(u->length + v->length);
6376 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006377 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006378 Py_UNICODE_COPY(w->str, u->str, u->length);
6379 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6380
6381 Py_DECREF(u);
6382 Py_DECREF(v);
6383 return (PyObject *)w;
6384
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006385 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006386 Py_XDECREF(u);
6387 Py_XDECREF(v);
6388 return NULL;
6389}
6390
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006391PyDoc_STRVAR(count__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006392 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006393\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00006394Return the number of non-overlapping occurrences of substring sub in\n\
6395Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006396interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006397
6398static PyObject *
6399unicode_count(PyUnicodeObject *self, PyObject *args)
6400{
6401 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006402 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006403 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006404 PyObject *result;
6405
Jesus Cea44e81682011-04-20 16:39:15 +02006406 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
6407 &start, &end))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006408 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006409
Antoine Pitrou64672132010-01-13 07:55:48 +00006410 ADJUST_INDICES(start, end, self->length);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006411 result = PyInt_FromSsize_t(
6412 stringlib_count(self->str + start, end - start,
Antoine Pitrou64672132010-01-13 07:55:48 +00006413 substring->str, substring->length,
6414 PY_SSIZE_T_MAX)
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006415 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006416
6417 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006418
Guido van Rossumd57fd912000-03-10 22:53:23 +00006419 return result;
6420}
6421
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006422PyDoc_STRVAR(encode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006423 "S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006424\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006425Encodes S using the codec registered for encoding. encoding defaults\n\
6426to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006427handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006428a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6429'xmlcharrefreplace' as well as any other name registered with\n\
6430codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006431
6432static PyObject *
Benjamin Peterson332d7212009-09-18 21:14:55 +00006433unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006434{
Benjamin Peterson332d7212009-09-18 21:14:55 +00006435 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00006436 char *encoding = NULL;
6437 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006438 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006439
Benjamin Peterson332d7212009-09-18 21:14:55 +00006440 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
6441 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006442 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006443 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006444 if (v == NULL)
6445 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006446 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006447 PyErr_Format(PyExc_TypeError,
6448 "encoder did not return a string/unicode object "
6449 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006450 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006451 Py_DECREF(v);
6452 return NULL;
6453 }
6454 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006455
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006456 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006457 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006458}
6459
6460PyDoc_STRVAR(decode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006461 "S.decode([encoding[,errors]]) -> string or unicode\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006462\n\
6463Decodes S using the codec registered for encoding. encoding defaults\n\
6464to the default encoding. errors may be given to set a different error\n\
6465handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6466a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
Éric Araujoa4c81b02012-02-20 02:07:31 +01006467as well as any other name registered with codecs.register_error that is\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006468able to handle UnicodeDecodeErrors.");
6469
6470static PyObject *
Benjamin Peterson332d7212009-09-18 21:14:55 +00006471unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006472{
Benjamin Peterson332d7212009-09-18 21:14:55 +00006473 static char *kwlist[] = {"encoding", "errors", 0};
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006474 char *encoding = NULL;
6475 char *errors = NULL;
6476 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006477
Benjamin Peterson332d7212009-09-18 21:14:55 +00006478 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",
6479 kwlist, &encoding, &errors))
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006480 return NULL;
6481 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006482 if (v == NULL)
6483 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006484 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006485 PyErr_Format(PyExc_TypeError,
6486 "decoder did not return a string/unicode object "
6487 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006488 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006489 Py_DECREF(v);
6490 return NULL;
6491 }
6492 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006493
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006494 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006495 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006496}
6497
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006498PyDoc_STRVAR(expandtabs__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006499 "S.expandtabs([tabsize]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006500\n\
6501Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006502If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006503
6504static PyObject*
6505unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6506{
6507 Py_UNICODE *e;
6508 Py_UNICODE *p;
6509 Py_UNICODE *q;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006510 Py_UNICODE *qe;
6511 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006512 PyUnicodeObject *u;
6513 int tabsize = 8;
6514
6515 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006516 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006517
Thomas Wouters7e474022000-07-16 12:04:32 +00006518 /* First pass: determine size of output string */
Guido van Rossum5bdff602008-03-11 21:18:06 +00006519 i = 0; /* chars up to and including most recent \n or \r */
6520 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6521 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006522 for (p = self->str; p < e; p++)
6523 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006524 if (tabsize > 0) {
6525 incr = tabsize - (j % tabsize); /* cannot overflow */
6526 if (j > PY_SSIZE_T_MAX - incr)
6527 goto overflow1;
6528 j += incr;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006529 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006530 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006531 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006532 if (j > PY_SSIZE_T_MAX - 1)
6533 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006534 j++;
6535 if (*p == '\n' || *p == '\r') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006536 if (i > PY_SSIZE_T_MAX - j)
6537 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006538 i += j;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006539 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006540 }
6541 }
6542
Guido van Rossum5bdff602008-03-11 21:18:06 +00006543 if (i > PY_SSIZE_T_MAX - j)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006544 goto overflow1;
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006545
Guido van Rossumd57fd912000-03-10 22:53:23 +00006546 /* Second pass: create output string and fill it */
6547 u = _PyUnicode_New(i + j);
6548 if (!u)
6549 return NULL;
6550
Guido van Rossum5bdff602008-03-11 21:18:06 +00006551 j = 0; /* same as in first pass */
6552 q = u->str; /* next output char */
6553 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006554
6555 for (p = self->str; p < e; p++)
6556 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006557 if (tabsize > 0) {
6558 i = tabsize - (j % tabsize);
6559 j += i;
6560 while (i--) {
6561 if (q >= qe)
6562 goto overflow2;
6563 *q++ = ' ';
Guido van Rossum5bdff602008-03-11 21:18:06 +00006564 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006565 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00006566 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006567 else {
6568 if (q >= qe)
6569 goto overflow2;
6570 *q++ = *p;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006571 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006572 if (*p == '\n' || *p == '\r')
6573 j = 0;
6574 }
6575
6576 return (PyObject*) u;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006577
6578 overflow2:
6579 Py_DECREF(u);
6580 overflow1:
6581 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6582 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006583}
6584
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006585PyDoc_STRVAR(find__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006586 "S.find(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006587\n\
6588Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran5e3a19d2011-07-27 23:36:51 +08006589such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006590arguments start and end are interpreted as in slice notation.\n\
6591\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006592Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006593
6594static PyObject *
6595unicode_find(PyUnicodeObject *self, PyObject *args)
6596{
Jesus Cea44e81682011-04-20 16:39:15 +02006597 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006598 Py_ssize_t start;
6599 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006600 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006601
Jesus Cea44e81682011-04-20 16:39:15 +02006602 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
6603 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006604 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006605
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006606 result = stringlib_find_slice(
6607 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6608 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6609 start, end
6610 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006611
6612 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006613
6614 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006615}
6616
6617static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006618unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006619{
6620 if (index < 0 || index >= self->length) {
6621 PyErr_SetString(PyExc_IndexError, "string index out of range");
6622 return NULL;
6623 }
6624
6625 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6626}
6627
6628static long
6629unicode_hash(PyUnicodeObject *self)
6630{
Fredrik Lundhdde61642000-07-10 18:27:47 +00006631 /* Since Unicode objects compare equal to their ASCII string
6632 counterparts, they should use the individual character values
6633 as basis for their hash value. This is needed to assure that
6634 strings and Unicode objects behave in the same way as
6635 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006636
Martin v. Löwis18e16552006-02-15 17:27:45 +00006637 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006638 register Py_UNICODE *p;
6639 register long x;
6640
Benjamin Petersonf51c3842012-04-09 14:53:07 -04006641#ifdef Py_DEBUG
Benjamin Peterson26da9202012-02-21 11:08:50 -05006642 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf51c3842012-04-09 14:53:07 -04006643#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006644 if (self->hash != -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006645 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006646 len = PyUnicode_GET_SIZE(self);
Barry Warsaw1e13eb02012-02-20 20:42:21 -05006647 /*
6648 We make the hash of the empty string be 0, rather than using
6649 (prefix ^ suffix), since this slightly obfuscates the hash secret
6650 */
6651 if (len == 0) {
6652 self->hash = 0;
6653 return 0;
6654 }
Fredrik Lundhdde61642000-07-10 18:27:47 +00006655 p = PyUnicode_AS_UNICODE(self);
Barry Warsaw1e13eb02012-02-20 20:42:21 -05006656 x = _Py_HashSecret.prefix;
6657 x ^= *p << 7;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006658 while (--len >= 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006659 x = (1000003*x) ^ *p++;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006660 x ^= PyUnicode_GET_SIZE(self);
Barry Warsaw1e13eb02012-02-20 20:42:21 -05006661 x ^= _Py_HashSecret.suffix;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006662 if (x == -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006663 x = -2;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006664 self->hash = x;
6665 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006666}
6667
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006668PyDoc_STRVAR(index__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006669 "S.index(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006670\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006671Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006672
6673static PyObject *
6674unicode_index(PyUnicodeObject *self, PyObject *args)
6675{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006676 Py_ssize_t result;
Jesus Cea44e81682011-04-20 16:39:15 +02006677 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006678 Py_ssize_t start;
6679 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006680
Jesus Cea44e81682011-04-20 16:39:15 +02006681 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
6682 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006683 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006684
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006685 result = stringlib_find_slice(
6686 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6687 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6688 start, end
6689 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006690
6691 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006692
Guido van Rossumd57fd912000-03-10 22:53:23 +00006693 if (result < 0) {
6694 PyErr_SetString(PyExc_ValueError, "substring not found");
6695 return NULL;
6696 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006697
Martin v. Löwis18e16552006-02-15 17:27:45 +00006698 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006699}
6700
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006701PyDoc_STRVAR(islower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006702 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006703\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006704Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006705at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006706
6707static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006708unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006709{
6710 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6711 register const Py_UNICODE *e;
6712 int cased;
6713
Guido van Rossumd57fd912000-03-10 22:53:23 +00006714 /* Shortcut for single character strings */
6715 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006716 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006717
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006718 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006719 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006720 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006721
Guido van Rossumd57fd912000-03-10 22:53:23 +00006722 e = p + PyUnicode_GET_SIZE(self);
6723 cased = 0;
6724 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006725 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006726
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006727 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6728 return PyBool_FromLong(0);
6729 else if (!cased && Py_UNICODE_ISLOWER(ch))
6730 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006731 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006732 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006733}
6734
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006735PyDoc_STRVAR(isupper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006736 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006737\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006738Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006739at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006740
6741static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006742unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006743{
6744 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6745 register const Py_UNICODE *e;
6746 int cased;
6747
Guido van Rossumd57fd912000-03-10 22:53:23 +00006748 /* Shortcut for single character strings */
6749 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006750 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006751
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006752 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006753 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006754 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006755
Guido van Rossumd57fd912000-03-10 22:53:23 +00006756 e = p + PyUnicode_GET_SIZE(self);
6757 cased = 0;
6758 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006759 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006760
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006761 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6762 return PyBool_FromLong(0);
6763 else if (!cased && Py_UNICODE_ISUPPER(ch))
6764 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006765 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006766 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006767}
6768
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006769PyDoc_STRVAR(istitle__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006770 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006771\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006772Return True if S is a titlecased string and there is at least one\n\
6773character in S, i.e. upper- and titlecase characters may only\n\
6774follow uncased characters and lowercase characters only cased ones.\n\
6775Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006776
6777static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006778unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006779{
6780 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6781 register const Py_UNICODE *e;
6782 int cased, previous_is_cased;
6783
Guido van Rossumd57fd912000-03-10 22:53:23 +00006784 /* Shortcut for single character strings */
6785 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006786 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6787 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006788
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006789 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006790 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006791 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006792
Guido van Rossumd57fd912000-03-10 22:53:23 +00006793 e = p + PyUnicode_GET_SIZE(self);
6794 cased = 0;
6795 previous_is_cased = 0;
6796 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006797 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006798
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006799 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6800 if (previous_is_cased)
6801 return PyBool_FromLong(0);
6802 previous_is_cased = 1;
6803 cased = 1;
6804 }
6805 else if (Py_UNICODE_ISLOWER(ch)) {
6806 if (!previous_is_cased)
6807 return PyBool_FromLong(0);
6808 previous_is_cased = 1;
6809 cased = 1;
6810 }
6811 else
6812 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006813 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006814 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006815}
6816
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006817PyDoc_STRVAR(isspace__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006818 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006819\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006820Return True if all characters in S are whitespace\n\
6821and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006822
6823static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006824unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006825{
6826 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6827 register const Py_UNICODE *e;
6828
Guido van Rossumd57fd912000-03-10 22:53:23 +00006829 /* Shortcut for single character strings */
6830 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006831 Py_UNICODE_ISSPACE(*p))
6832 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006833
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006834 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006835 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006836 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006837
Guido van Rossumd57fd912000-03-10 22:53:23 +00006838 e = p + PyUnicode_GET_SIZE(self);
6839 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006840 if (!Py_UNICODE_ISSPACE(*p))
6841 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006842 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006843 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006844}
6845
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006846PyDoc_STRVAR(isalpha__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006847 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006848\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006849Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006850and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006851
6852static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006853unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006854{
6855 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6856 register const Py_UNICODE *e;
6857
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006858 /* Shortcut for single character strings */
6859 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006860 Py_UNICODE_ISALPHA(*p))
6861 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006862
6863 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006864 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006865 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006866
6867 e = p + PyUnicode_GET_SIZE(self);
6868 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006869 if (!Py_UNICODE_ISALPHA(*p))
6870 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006871 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006872 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006873}
6874
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006875PyDoc_STRVAR(isalnum__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006876 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006877\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006878Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006879and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006880
6881static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006882unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006883{
6884 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6885 register const Py_UNICODE *e;
6886
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006887 /* Shortcut for single character strings */
6888 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006889 Py_UNICODE_ISALNUM(*p))
6890 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006891
6892 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006893 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006894 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006895
6896 e = p + PyUnicode_GET_SIZE(self);
6897 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006898 if (!Py_UNICODE_ISALNUM(*p))
6899 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006900 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006901 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006902}
6903
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006904PyDoc_STRVAR(isdecimal__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006905 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006906\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006907Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006908False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006909
6910static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006911unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006912{
6913 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6914 register const Py_UNICODE *e;
6915
Guido van Rossumd57fd912000-03-10 22:53:23 +00006916 /* Shortcut for single character strings */
6917 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006918 Py_UNICODE_ISDECIMAL(*p))
6919 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006920
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006921 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006922 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006923 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006924
Guido van Rossumd57fd912000-03-10 22:53:23 +00006925 e = p + PyUnicode_GET_SIZE(self);
6926 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006927 if (!Py_UNICODE_ISDECIMAL(*p))
6928 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006929 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006930 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006931}
6932
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006933PyDoc_STRVAR(isdigit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006934 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006935\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006936Return True if all characters in S are digits\n\
6937and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006938
6939static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006940unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006941{
6942 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6943 register const Py_UNICODE *e;
6944
Guido van Rossumd57fd912000-03-10 22:53:23 +00006945 /* Shortcut for single character strings */
6946 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006947 Py_UNICODE_ISDIGIT(*p))
6948 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006949
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006950 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006951 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006952 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006953
Guido van Rossumd57fd912000-03-10 22:53:23 +00006954 e = p + PyUnicode_GET_SIZE(self);
6955 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006956 if (!Py_UNICODE_ISDIGIT(*p))
6957 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006958 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006959 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006960}
6961
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006962PyDoc_STRVAR(isnumeric__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006963 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006964\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006965Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006966False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006967
6968static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006969unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006970{
6971 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6972 register const Py_UNICODE *e;
6973
Guido van Rossumd57fd912000-03-10 22:53:23 +00006974 /* Shortcut for single character strings */
6975 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006976 Py_UNICODE_ISNUMERIC(*p))
6977 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006978
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006979 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006980 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006981 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006982
Guido van Rossumd57fd912000-03-10 22:53:23 +00006983 e = p + PyUnicode_GET_SIZE(self);
6984 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006985 if (!Py_UNICODE_ISNUMERIC(*p))
6986 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006987 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006988 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006989}
6990
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006991PyDoc_STRVAR(join__doc__,
Georg Brandl9b4e5822009-10-14 18:48:32 +00006992 "S.join(iterable) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006993\n\
6994Return a string which is the concatenation of the strings in the\n\
Georg Brandl9b4e5822009-10-14 18:48:32 +00006995iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006996
6997static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006998unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006999{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007000 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007001}
7002
Martin v. Löwis18e16552006-02-15 17:27:45 +00007003static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007004unicode_length(PyUnicodeObject *self)
7005{
7006 return self->length;
7007}
7008
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007009PyDoc_STRVAR(ljust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007010 "S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007011\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007012Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007013done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007014
7015static PyObject *
7016unicode_ljust(PyUnicodeObject *self, PyObject *args)
7017{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007018 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007019 Py_UNICODE fillchar = ' ';
7020
Martin v. Löwis412fb672006-04-13 06:34:32 +00007021 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007022 return NULL;
7023
Tim Peters7a29bd52001-09-12 03:03:31 +00007024 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007025 Py_INCREF(self);
7026 return (PyObject*) self;
7027 }
7028
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007029 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007030}
7031
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007032PyDoc_STRVAR(lower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007033 "S.lower() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007034\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007035Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007036
7037static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007038unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007039{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007040 return fixup(self, fixlower);
7041}
7042
/* Selectors telling the strip helpers which end(s) of a string to trim */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by the selectors above */
static const char *stripformat[] = {
    "|O:lstrip",        /* LEFTSTRIP */
    "|O:rstrip",        /* RIGHTSTRIP */
    "|O:strip"          /* BOTHSTRIP */
};

/* Method name for selector i: the format string without its leading
   three characters ("|O:") */
#define STRIPNAME(i) (&stripformat[i][3])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007052/* externally visible for str.strip(unicode) */
7053PyObject *
7054_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7055{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007056 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7057 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7058 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7059 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7060 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007061
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007062 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007063
Benjamin Peterson857ce152009-01-31 16:29:18 +00007064 i = 0;
7065 if (striptype != RIGHTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007066 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7067 i++;
7068 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00007069 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007070
Benjamin Peterson857ce152009-01-31 16:29:18 +00007071 j = len;
7072 if (striptype != LEFTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007073 do {
7074 j--;
7075 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7076 j++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007077 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007078
Benjamin Peterson857ce152009-01-31 16:29:18 +00007079 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007080 Py_INCREF(self);
7081 return (PyObject*)self;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007082 }
7083 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007084 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007085}
7086
Guido van Rossumd57fd912000-03-10 22:53:23 +00007087
7088static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007089do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007090{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007091 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7092 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007093
Benjamin Peterson857ce152009-01-31 16:29:18 +00007094 i = 0;
7095 if (striptype != RIGHTSTRIP) {
7096 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7097 i++;
7098 }
7099 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007100
Benjamin Peterson857ce152009-01-31 16:29:18 +00007101 j = len;
7102 if (striptype != LEFTSTRIP) {
7103 do {
7104 j--;
7105 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7106 j++;
7107 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007108
Benjamin Peterson857ce152009-01-31 16:29:18 +00007109 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7110 Py_INCREF(self);
7111 return (PyObject*)self;
7112 }
7113 else
7114 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007115}
7116
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007117
7118static PyObject *
7119do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7120{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007121 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007122
Benjamin Peterson857ce152009-01-31 16:29:18 +00007123 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7124 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007125
Benjamin Peterson857ce152009-01-31 16:29:18 +00007126 if (sep != NULL && sep != Py_None) {
7127 if (PyUnicode_Check(sep))
7128 return _PyUnicode_XStrip(self, striptype, sep);
7129 else if (PyString_Check(sep)) {
7130 PyObject *res;
7131 sep = PyUnicode_FromObject(sep);
7132 if (sep==NULL)
7133 return NULL;
7134 res = _PyUnicode_XStrip(self, striptype, sep);
7135 Py_DECREF(sep);
7136 return res;
7137 }
7138 else {
7139 PyErr_Format(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007140 "%s arg must be None, unicode or str",
7141 STRIPNAME(striptype));
Benjamin Peterson857ce152009-01-31 16:29:18 +00007142 return NULL;
7143 }
7144 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007145
Benjamin Peterson857ce152009-01-31 16:29:18 +00007146 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007147}
7148
7149
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007150PyDoc_STRVAR(strip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007151 "S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007152\n\
7153Return a copy of the string S with leading and trailing\n\
7154whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007155If chars is given and not None, remove characters in chars instead.\n\
7156If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007157
7158static PyObject *
7159unicode_strip(PyUnicodeObject *self, PyObject *args)
7160{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007161 if (PyTuple_GET_SIZE(args) == 0)
7162 return do_strip(self, BOTHSTRIP); /* Common case */
7163 else
7164 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007165}
7166
7167
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007168PyDoc_STRVAR(lstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007169 "S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007170\n\
7171Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007172If chars is given and not None, remove characters in chars instead.\n\
7173If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007174
7175static PyObject *
7176unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7177{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007178 if (PyTuple_GET_SIZE(args) == 0)
7179 return do_strip(self, LEFTSTRIP); /* Common case */
7180 else
7181 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007182}
7183
7184
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007185PyDoc_STRVAR(rstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007186 "S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007187\n\
7188Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007189If chars is given and not None, remove characters in chars instead.\n\
7190If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007191
7192static PyObject *
7193unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7194{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007195 if (PyTuple_GET_SIZE(args) == 0)
7196 return do_strip(self, RIGHTSTRIP); /* Common case */
7197 else
7198 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007199}
7200
7201
Guido van Rossumd57fd912000-03-10 22:53:23 +00007202static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007203unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007204{
7205 PyUnicodeObject *u;
7206 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007207 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007208 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007209
7210 if (len < 0)
7211 len = 0;
7212
Tim Peters7a29bd52001-09-12 03:03:31 +00007213 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007214 /* no repeat, return original string */
7215 Py_INCREF(str);
7216 return (PyObject*) str;
7217 }
Tim Peters8f422462000-09-09 06:13:41 +00007218
7219 /* ensure # of chars needed doesn't overflow int and # of bytes
7220 * needed doesn't overflow size_t
7221 */
7222 nchars = len * str->length;
7223 if (len && nchars / len != str->length) {
7224 PyErr_SetString(PyExc_OverflowError,
7225 "repeated string is too long");
7226 return NULL;
7227 }
7228 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7229 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7230 PyErr_SetString(PyExc_OverflowError,
7231 "repeated string is too long");
7232 return NULL;
7233 }
7234 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007235 if (!u)
7236 return NULL;
7237
7238 p = u->str;
7239
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007240 if (str->length == 1 && len > 0) {
7241 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007242 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007243 Py_ssize_t done = 0; /* number of characters copied this far */
7244 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007245 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007246 done = str->length;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007247 }
7248 while (done < nchars) {
Neal Norwitz4677fbf72008-03-25 04:18:18 +00007249 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007250 Py_UNICODE_COPY(p+done, p, n);
7251 done += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007252 }
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007253 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007254
7255 return (PyObject*) u;
7256}
7257
7258PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007259 PyObject *subobj,
7260 PyObject *replobj,
7261 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007262{
7263 PyObject *self;
7264 PyObject *str1;
7265 PyObject *str2;
7266 PyObject *result;
7267
7268 self = PyUnicode_FromObject(obj);
7269 if (self == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007270 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007271 str1 = PyUnicode_FromObject(subobj);
7272 if (str1 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007273 Py_DECREF(self);
7274 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007275 }
7276 str2 = PyUnicode_FromObject(replobj);
7277 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007278 Py_DECREF(self);
7279 Py_DECREF(str1);
7280 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007281 }
Tim Petersced69f82003-09-16 20:30:58 +00007282 result = replace((PyUnicodeObject *)self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007283 (PyUnicodeObject *)str1,
7284 (PyUnicodeObject *)str2,
7285 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007286 Py_DECREF(self);
7287 Py_DECREF(str1);
7288 Py_DECREF(str2);
7289 return result;
7290}
7291
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007292PyDoc_STRVAR(replace__doc__,
Ezio Melotti2f06b782010-06-26 18:44:42 +00007293 "S.replace(old, new[, count]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007294\n\
7295Return a copy of S with all occurrences of substring\n\
Georg Brandl30fadc12008-05-30 07:54:16 +00007296old replaced by new. If the optional argument count is\n\
7297given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007298
7299static PyObject*
7300unicode_replace(PyUnicodeObject *self, PyObject *args)
7301{
7302 PyUnicodeObject *str1;
7303 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007304 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007305 PyObject *result;
7306
Martin v. Löwis18e16552006-02-15 17:27:45 +00007307 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007308 return NULL;
7309 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7310 if (str1 == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007311 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007312 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007313 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007314 Py_DECREF(str1);
7315 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007316 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007317
7318 result = replace(self, str1, str2, maxcount);
7319
7320 Py_DECREF(str1);
7321 Py_DECREF(str2);
7322 return result;
7323}
7324
7325static
7326PyObject *unicode_repr(PyObject *unicode)
7327{
7328 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007329 PyUnicode_GET_SIZE(unicode),
7330 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007331}
7332
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007333PyDoc_STRVAR(rfind__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007334 "S.rfind(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007335\n\
7336Return the highest index in S where substring sub is found,\n\
Senthil Kumaran5e3a19d2011-07-27 23:36:51 +08007337such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007338arguments start and end are interpreted as in slice notation.\n\
7339\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007340Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007341
7342static PyObject *
7343unicode_rfind(PyUnicodeObject *self, PyObject *args)
7344{
Jesus Cea44e81682011-04-20 16:39:15 +02007345 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007346 Py_ssize_t start;
7347 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007348 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007349
Jesus Cea44e81682011-04-20 16:39:15 +02007350 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
7351 &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007352 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007353
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007354 result = stringlib_rfind_slice(
7355 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7356 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7357 start, end
7358 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007359
7360 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007361
7362 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007363}
7364
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007365PyDoc_STRVAR(rindex__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007366 "S.rindex(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007367\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007368Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007369
7370static PyObject *
7371unicode_rindex(PyUnicodeObject *self, PyObject *args)
7372{
Jesus Cea44e81682011-04-20 16:39:15 +02007373 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007374 Py_ssize_t start;
7375 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007376 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007377
Jesus Cea44e81682011-04-20 16:39:15 +02007378 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
7379 &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007380 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007381
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007382 result = stringlib_rfind_slice(
7383 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7384 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7385 start, end
7386 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007387
7388 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007389
Guido van Rossumd57fd912000-03-10 22:53:23 +00007390 if (result < 0) {
7391 PyErr_SetString(PyExc_ValueError, "substring not found");
7392 return NULL;
7393 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007394 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007395}
7396
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007397PyDoc_STRVAR(rjust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007398 "S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007399\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007400Return S right-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007401done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007402
7403static PyObject *
7404unicode_rjust(PyUnicodeObject *self, PyObject *args)
7405{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007406 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007407 Py_UNICODE fillchar = ' ';
7408
Martin v. Löwis412fb672006-04-13 06:34:32 +00007409 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007410 return NULL;
7411
Tim Peters7a29bd52001-09-12 03:03:31 +00007412 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007413 Py_INCREF(self);
7414 return (PyObject*) self;
7415 }
7416
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007417 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007418}
7419
Guido van Rossumd57fd912000-03-10 22:53:23 +00007420static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007421unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007422{
7423 /* standard clamping */
7424 if (start < 0)
7425 start = 0;
7426 if (end < 0)
7427 end = 0;
7428 if (end > self->length)
7429 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007430 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007431 /* full slice, return original string */
7432 Py_INCREF(self);
7433 return (PyObject*) self;
7434 }
7435 if (start > end)
7436 start = end;
7437 /* copy slice */
7438 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007439 end - start);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007440}
7441
7442PyObject *PyUnicode_Split(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007443 PyObject *sep,
7444 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007445{
7446 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007447
Guido van Rossumd57fd912000-03-10 22:53:23 +00007448 s = PyUnicode_FromObject(s);
7449 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007450 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007451 if (sep != NULL) {
7452 sep = PyUnicode_FromObject(sep);
7453 if (sep == NULL) {
7454 Py_DECREF(s);
7455 return NULL;
7456 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007457 }
7458
7459 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7460
7461 Py_DECREF(s);
7462 Py_XDECREF(sep);
7463 return result;
7464}
7465
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007466PyDoc_STRVAR(split__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007467 "S.split([sep [,maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007468\n\
7469Return a list of the words in S, using sep as the\n\
7470delimiter string. If maxsplit is given, at most maxsplit\n\
Georg Brandldfb77db2008-05-11 09:11:40 +00007471splits are done. If sep is not specified or is None, any\n\
Georg Brandlecbbd942008-05-11 20:53:55 +00007472whitespace string is a separator and empty strings are\n\
7473removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007474
7475static PyObject*
7476unicode_split(PyUnicodeObject *self, PyObject *args)
7477{
7478 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007479 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007480
Martin v. Löwis18e16552006-02-15 17:27:45 +00007481 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007482 return NULL;
7483
7484 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007485 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007486 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007487 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007488 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007489 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007490}
7491
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007492PyObject *
7493PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7494{
7495 PyObject* str_obj;
7496 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007497 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00007498
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007499 str_obj = PyUnicode_FromObject(str_in);
7500 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007501 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007502 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00007503 if (!sep_obj) {
7504 Py_DECREF(str_obj);
7505 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007506 }
7507
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007508 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00007509 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7510 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7511 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007512
Fredrik Lundhb9479482006-05-26 17:22:38 +00007513 Py_DECREF(sep_obj);
7514 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007515
7516 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007517}
7518
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007519
7520PyObject *
7521PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7522{
7523 PyObject* str_obj;
7524 PyObject* sep_obj;
7525 PyObject* out;
7526
7527 str_obj = PyUnicode_FromObject(str_in);
7528 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007529 return NULL;
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007530 sep_obj = PyUnicode_FromObject(sep_in);
7531 if (!sep_obj) {
7532 Py_DECREF(str_obj);
7533 return NULL;
7534 }
7535
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007536 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007537 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7538 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7539 );
7540
7541 Py_DECREF(sep_obj);
7542 Py_DECREF(str_obj);
7543
7544 return out;
7545}
7546
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007547PyDoc_STRVAR(partition__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007548 "S.partition(sep) -> (head, sep, tail)\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007549\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007550Search for the separator sep in S, and return the part before it,\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007551the separator itself, and the part after it. If the separator is not\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007552found, return S and two empty strings.");
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007553
7554static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00007555unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007556{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007557 return PyUnicode_Partition((PyObject *)self, separator);
7558}
7559
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007560PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti1fafaab2010-01-25 11:24:37 +00007561 "S.rpartition(sep) -> (head, sep, tail)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007562\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007563Search for the separator sep in S, starting at the end of S, and return\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007564the part before it, the separator itself, and the part after it. If the\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007565separator is not found, return two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007566
7567static PyObject*
7568unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7569{
7570 return PyUnicode_RPartition((PyObject *)self, separator);
7571}
7572
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007573PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007574 PyObject *sep,
7575 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007576{
7577 PyObject *result;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007578
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007579 s = PyUnicode_FromObject(s);
7580 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007581 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007582 if (sep != NULL) {
7583 sep = PyUnicode_FromObject(sep);
7584 if (sep == NULL) {
7585 Py_DECREF(s);
7586 return NULL;
7587 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007588 }
7589
7590 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7591
7592 Py_DECREF(s);
7593 Py_XDECREF(sep);
7594 return result;
7595}
7596
7597PyDoc_STRVAR(rsplit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007598 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007599\n\
7600Return a list of the words in S, using sep as the\n\
7601delimiter string, starting at the end of the string and\n\
7602working to the front. If maxsplit is given, at most maxsplit\n\
7603splits are done. If sep is not specified, any whitespace string\n\
7604is a separator.");
7605
7606static PyObject*
7607unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7608{
7609 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007610 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007611
Martin v. Löwis18e16552006-02-15 17:27:45 +00007612 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007613 return NULL;
7614
7615 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007616 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007617 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007618 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007619 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007620 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007621}
7622
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007623PyDoc_STRVAR(splitlines__doc__,
Raymond Hettingeraad5b022012-06-02 01:42:58 -04007624 "S.splitlines(keepends=False) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007625\n\
7626Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007627Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007628is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007629
7630static PyObject*
7631unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7632{
Guido van Rossum86662912000-04-11 15:38:46 +00007633 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007634
Guido van Rossum86662912000-04-11 15:38:46 +00007635 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007636 return NULL;
7637
Guido van Rossum86662912000-04-11 15:38:46 +00007638 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007639}
7640
7641static
7642PyObject *unicode_str(PyUnicodeObject *self)
7643{
Fred Drakee4315f52000-05-09 19:53:39 +00007644 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007645}
7646
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007647PyDoc_STRVAR(swapcase__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007648 "S.swapcase() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007649\n\
7650Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007651and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007652
7653static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007654unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007655{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007656 return fixup(self, fixswapcase);
7657}
7658
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007659PyDoc_STRVAR(translate__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007660 "S.translate(table) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007661\n\
7662Return a copy of the string S, where all characters have been mapped\n\
7663through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007664Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7665Unmapped characters are left untouched. Characters mapped to None\n\
7666are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007667
7668static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007669unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007670{
Tim Petersced69f82003-09-16 20:30:58 +00007671 return PyUnicode_TranslateCharmap(self->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007672 self->length,
7673 table,
7674 "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007675}
7676
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007677PyDoc_STRVAR(upper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007678 "S.upper() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007679\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007680Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007681
7682static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007683unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007684{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007685 return fixup(self, fixupper);
7686}
7687
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007688PyDoc_STRVAR(zfill__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007689 "S.zfill(width) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007690\n\
Georg Brandl98064072008-09-09 19:26:00 +00007691Pad a numeric string S with zeros on the left, to fill a field\n\
7692of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007693
7694static PyObject *
7695unicode_zfill(PyUnicodeObject *self, PyObject *args)
7696{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007697 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007698 PyUnicodeObject *u;
7699
Martin v. Löwis18e16552006-02-15 17:27:45 +00007700 Py_ssize_t width;
7701 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007702 return NULL;
7703
7704 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007705 if (PyUnicode_CheckExact(self)) {
7706 Py_INCREF(self);
7707 return (PyObject*) self;
7708 }
7709 else
7710 return PyUnicode_FromUnicode(
7711 PyUnicode_AS_UNICODE(self),
7712 PyUnicode_GET_SIZE(self)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007713 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007714 }
7715
7716 fill = width - self->length;
7717
7718 u = pad(self, fill, 0, '0');
7719
Walter Dörwald068325e2002-04-15 13:36:47 +00007720 if (u == NULL)
7721 return NULL;
7722
Guido van Rossumd57fd912000-03-10 22:53:23 +00007723 if (u->str[fill] == '+' || u->str[fill] == '-') {
7724 /* move sign to beginning of string */
7725 u->str[0] = u->str[fill];
7726 u->str[fill] = '0';
7727 }
7728
7729 return (PyObject*) u;
7730}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007731
#if 0
/* Compiled out.  Would report the current value of the file-level
   numfree counter as a Python int. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyInt_FromLong(count);
}
#endif
7739
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007740PyDoc_STRVAR(startswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007741 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007742\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007743Return True if S starts with the specified prefix, False otherwise.\n\
7744With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007745With optional end, stop comparing S at that position.\n\
7746prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007747
7748static PyObject *
7749unicode_startswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007750 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007751{
Georg Brandl24250812006-06-09 18:45:48 +00007752 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007753 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007754 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007755 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007756 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007757
Jesus Cea44e81682011-04-20 16:39:15 +02007758 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007759 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007760 if (PyTuple_Check(subobj)) {
7761 Py_ssize_t i;
7762 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7763 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007764 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007765 if (substring == NULL)
7766 return NULL;
7767 result = tailmatch(self, substring, start, end, -1);
7768 Py_DECREF(substring);
7769 if (result) {
7770 Py_RETURN_TRUE;
7771 }
7772 }
7773 /* nothing matched */
7774 Py_RETURN_FALSE;
7775 }
7776 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottie3685f62011-04-26 05:12:51 +03007777 if (substring == NULL) {
7778 if (PyErr_ExceptionMatches(PyExc_TypeError))
7779 PyErr_Format(PyExc_TypeError, "startswith first arg must be str, "
7780 "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007781 return NULL;
Ezio Melottie3685f62011-04-26 05:12:51 +03007782 }
Georg Brandl24250812006-06-09 18:45:48 +00007783 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007784 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007785 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007786}
7787
7788
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007789PyDoc_STRVAR(endswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007790 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007791\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007792Return True if S ends with the specified suffix, False otherwise.\n\
7793With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007794With optional end, stop comparing S at that position.\n\
7795suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007796
7797static PyObject *
7798unicode_endswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007799 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007800{
Georg Brandl24250812006-06-09 18:45:48 +00007801 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007802 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007803 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007804 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007805 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007806
Jesus Cea44e81682011-04-20 16:39:15 +02007807 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007808 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007809 if (PyTuple_Check(subobj)) {
7810 Py_ssize_t i;
7811 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7812 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007813 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007814 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007815 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007816 result = tailmatch(self, substring, start, end, +1);
7817 Py_DECREF(substring);
7818 if (result) {
7819 Py_RETURN_TRUE;
7820 }
7821 }
7822 Py_RETURN_FALSE;
7823 }
7824 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottie3685f62011-04-26 05:12:51 +03007825 if (substring == NULL) {
7826 if (PyErr_ExceptionMatches(PyExc_TypeError))
7827 PyErr_Format(PyExc_TypeError, "endswith first arg must be str, "
7828 "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007829 return NULL;
Ezio Melottie3685f62011-04-26 05:12:51 +03007830 }
Georg Brandl24250812006-06-09 18:45:48 +00007831 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007832 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007833 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007834}
7835
7836
Eric Smitha9f7d622008-02-17 19:46:49 +00007837/* Implements do_string_format, which is unicode because of stringlib */
7838#include "stringlib/string_format.h"
7839
7840PyDoc_STRVAR(format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007841 "S.format(*args, **kwargs) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007842\n\
Eric Smith6c840852010-11-06 19:43:44 +00007843Return a formatted version of S, using substitutions from args and kwargs.\n\
7844The substitutions are identified by braces ('{' and '}').");
Eric Smitha9f7d622008-02-17 19:46:49 +00007845
Eric Smithdc13b792008-05-30 18:10:04 +00007846static PyObject *
7847unicode__format__(PyObject *self, PyObject *args)
7848{
7849 PyObject *format_spec;
7850 PyObject *result = NULL;
7851 PyObject *tmp = NULL;
7852
7853 /* If 2.x, convert format_spec to the same type as value */
7854 /* This is to allow things like u''.format('') */
7855 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7856 goto done;
7857 if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7858 PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007859 "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
Eric Smithdc13b792008-05-30 18:10:04 +00007860 goto done;
7861 }
7862 tmp = PyObject_Unicode(format_spec);
7863 if (tmp == NULL)
7864 goto done;
7865 format_spec = tmp;
7866
7867 result = _PyUnicode_FormatAdvanced(self,
7868 PyUnicode_AS_UNICODE(format_spec),
7869 PyUnicode_GET_SIZE(format_spec));
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007870 done:
Eric Smithdc13b792008-05-30 18:10:04 +00007871 Py_XDECREF(tmp);
7872 return result;
7873}
7874
Eric Smitha9f7d622008-02-17 19:46:49 +00007875PyDoc_STRVAR(p_format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007876 "S.__format__(format_spec) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007877\n\
Eric Smith6c840852010-11-06 19:43:44 +00007878Return a formatted version of S as described by format_spec.");
Eric Smitha9f7d622008-02-17 19:46:49 +00007879
Robert Schuppenies901c9972008-06-10 10:10:31 +00007880static PyObject *
7881unicode__sizeof__(PyUnicodeObject *v)
7882{
Robert Schuppenies9be2ec12008-07-10 15:24:04 +00007883 return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7884 sizeof(Py_UNICODE) * (v->length + 1));
Robert Schuppenies901c9972008-06-10 10:10:31 +00007885}
7886
7887PyDoc_STRVAR(sizeof__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007888 "S.__sizeof__() -> size of S in memory, in bytes\n\
Robert Schuppenies901c9972008-06-10 10:10:31 +00007889\n\
7890");
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007891
7892static PyObject *
7893unicode_getnewargs(PyUnicodeObject *v)
7894{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007895 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007896}
7897
7898
Guido van Rossumd57fd912000-03-10 22:53:23 +00007899static PyMethodDef unicode_methods[] = {
Benjamin Peterson332d7212009-09-18 21:14:55 +00007900 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007901 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7902 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007903 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007904 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7905 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7906 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7907 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7908 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7909 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7910 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00007911 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007912 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7913 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7914 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007915 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Benjamin Peterson332d7212009-09-18 21:14:55 +00007916 {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007917/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7918 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7919 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7920 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007921 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007922 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007923 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007924 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007925 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7926 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7927 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7928 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7929 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7930 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7931 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7932 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7933 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7934 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7935 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7936 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7937 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7938 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007939 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smitha9f7d622008-02-17 19:46:49 +00007940 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
7941 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
7942 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
7943 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Robert Schuppenies901c9972008-06-10 10:10:31 +00007944 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007945#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007946 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007947#endif
7948
7949#if 0
7950 /* This one is just used for debugging the implementation. */
Christian Heimes5b970ad2008-02-06 13:33:44 +00007951 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007952#endif
7953
Benjamin Peterson857ce152009-01-31 16:29:18 +00007954 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007955 {NULL, NULL}
7956};
7957
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007958static PyObject *
7959unicode_mod(PyObject *v, PyObject *w)
7960{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007961 if (!PyUnicode_Check(v)) {
7962 Py_INCREF(Py_NotImplemented);
7963 return Py_NotImplemented;
7964 }
7965 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007966}
7967
7968static PyNumberMethods unicode_as_number = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007969 0, /*nb_add*/
7970 0, /*nb_subtract*/
7971 0, /*nb_multiply*/
7972 0, /*nb_divide*/
7973 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007974};
7975
Guido van Rossumd57fd912000-03-10 22:53:23 +00007976static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007977 (lenfunc) unicode_length, /* sq_length */
7978 PyUnicode_Concat, /* sq_concat */
7979 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7980 (ssizeargfunc) unicode_getitem, /* sq_item */
7981 (ssizessizeargfunc) unicode_slice, /* sq_slice */
7982 0, /* sq_ass_item */
7983 0, /* sq_ass_slice */
7984 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007985};
7986
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007987static PyObject*
7988unicode_subscript(PyUnicodeObject* self, PyObject* item)
7989{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00007990 if (PyIndex_Check(item)) {
7991 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007992 if (i == -1 && PyErr_Occurred())
7993 return NULL;
7994 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007995 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007996 return unicode_getitem(self, i);
7997 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007998 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007999 Py_UNICODE* source_buf;
8000 Py_UNICODE* result_buf;
8001 PyObject* result;
8002
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008003 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008004 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008005 return NULL;
8006 }
8007
8008 if (slicelength <= 0) {
8009 return PyUnicode_FromUnicode(NULL, 0);
Thomas Wouters3ccec682007-08-28 15:28:19 +00008010 } else if (start == 0 && step == 1 && slicelength == self->length &&
8011 PyUnicode_CheckExact(self)) {
8012 Py_INCREF(self);
8013 return (PyObject *)self;
8014 } else if (step == 1) {
8015 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008016 } else {
8017 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Neal Norwitz419fd492008-03-17 20:22:43 +00008018 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8019 sizeof(Py_UNICODE));
Benjamin Peterson857ce152009-01-31 16:29:18 +00008020
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008021 if (result_buf == NULL)
8022 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008023
8024 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8025 result_buf[i] = source_buf[cur];
8026 }
Tim Petersced69f82003-09-16 20:30:58 +00008027
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008028 result = PyUnicode_FromUnicode(result_buf, slicelength);
Neal Norwitz419fd492008-03-17 20:22:43 +00008029 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008030 return result;
8031 }
8032 } else {
8033 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8034 return NULL;
8035 }
8036}
8037
8038static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00008039 (lenfunc)unicode_length, /* mp_length */
8040 (binaryfunc)unicode_subscript, /* mp_subscript */
8041 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008042};
8043
Martin v. Löwis18e16552006-02-15 17:27:45 +00008044static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008045unicode_buffer_getreadbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008046 Py_ssize_t index,
8047 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008048{
8049 if (index != 0) {
8050 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008051 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008052 return -1;
8053 }
8054 *ptr = (void *) self->str;
8055 return PyUnicode_GET_DATA_SIZE(self);
8056}
8057
Martin v. Löwis18e16552006-02-15 17:27:45 +00008058static Py_ssize_t
8059unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008060 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008061{
8062 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008063 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008064 return -1;
8065}
8066
8067static int
8068unicode_buffer_getsegcount(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008069 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008070{
8071 if (lenp)
8072 *lenp = PyUnicode_GET_DATA_SIZE(self);
8073 return 1;
8074}
8075
Martin v. Löwiseb079f12006-02-16 14:32:27 +00008076static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008077unicode_buffer_getcharbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008078 Py_ssize_t index,
8079 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008080{
8081 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00008082
Guido van Rossumd57fd912000-03-10 22:53:23 +00008083 if (index != 0) {
8084 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008085 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008086 return -1;
8087 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008088 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008089 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008090 return -1;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008091 *ptr = (void *) PyString_AS_STRING(str);
8092 return PyString_GET_SIZE(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008093}
8094
8095/* Helpers for PyUnicode_Format() */
8096
8097static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008098getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008099{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008100 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008101 if (argidx < arglen) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008102 (*p_argidx)++;
8103 if (arglen < 0)
8104 return args;
8105 else
8106 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008107 }
8108 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008109 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008110 return NULL;
8111}
8112
/* Bit flags describing a conversion specifier.  Within the code visible
   here only F_ALT is consulted (the '#' alternate form); the others are
   presumably set from the '-', '+', ' ' and '0' flag characters by the
   format parser -- confirm in PyUnicode_Format. */
#define F_LJUST (1<<0)
#define F_SIGN  (1<<1)
#define F_BLANK (1<<2)
#define F_ALT   (1<<3)
#define F_ZERO  (1<<4)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008118
Martin v. Löwis18e16552006-02-15 17:27:45 +00008119static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008120strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008121{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008122 register Py_ssize_t i;
8123 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008124 for (i = len - 1; i >= 0; i--)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008125 buffer[i] = (Py_UNICODE) charbuffer[i];
Guido van Rossumd57fd912000-03-10 22:53:23 +00008126
Guido van Rossumd57fd912000-03-10 22:53:23 +00008127 return len;
8128}
8129
Neal Norwitzfc76d632006-01-10 06:03:13 +00008130static int
Neal Norwitzfc76d632006-01-10 06:03:13 +00008131longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8132{
Tim Peters15231542006-02-16 01:08:01 +00008133 Py_ssize_t result;
8134
Neal Norwitzfc76d632006-01-10 06:03:13 +00008135 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008136 result = strtounicode(buffer, (char *)buffer);
8137 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008138}
8139
Guido van Rossum078151d2002-08-11 04:24:12 +00008140/* XXX To save some code duplication, formatfloat/long/int could have been
8141 shared with stringobject.c, converting from 8-bit to Unicode after the
8142 formatting is done. */
8143
Mark Dickinson18cfada2009-11-23 18:46:41 +00008144/* Returns a new reference to a PyUnicode object, or NULL on failure. */
8145
8146static PyObject *
8147formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008148{
Mark Dickinson18cfada2009-11-23 18:46:41 +00008149 char *p;
8150 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008151 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008152
Guido van Rossumd57fd912000-03-10 22:53:23 +00008153 x = PyFloat_AsDouble(v);
8154 if (x == -1.0 && PyErr_Occurred())
Mark Dickinson18cfada2009-11-23 18:46:41 +00008155 return NULL;
8156
Guido van Rossumd57fd912000-03-10 22:53:23 +00008157 if (prec < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008158 prec = 6;
Mark Dickinsond4814bf2009-03-29 16:24:29 +00008159
Mark Dickinson18cfada2009-11-23 18:46:41 +00008160 p = PyOS_double_to_string(x, type, prec,
8161 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
8162 if (p == NULL)
8163 return NULL;
8164 result = PyUnicode_FromStringAndSize(p, strlen(p));
8165 PyMem_Free(p);
8166 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008167}
8168
Tim Peters38fd5b62000-09-21 05:43:11 +00008169static PyObject*
8170formatlong(PyObject *val, int flags, int prec, int type)
8171{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008172 char *buf;
8173 int i, len;
8174 PyObject *str; /* temporary string object. */
8175 PyUnicodeObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008176
Benjamin Peterson857ce152009-01-31 16:29:18 +00008177 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8178 if (!str)
8179 return NULL;
8180 result = _PyUnicode_New(len);
8181 if (!result) {
8182 Py_DECREF(str);
8183 return NULL;
8184 }
8185 for (i = 0; i < len; i++)
8186 result->str[i] = buf[i];
8187 result->str[len] = 0;
8188 Py_DECREF(str);
8189 return (PyObject*)result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008190}
8191
Guido van Rossumd57fd912000-03-10 22:53:23 +00008192static int
8193formatint(Py_UNICODE *buf,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008194 size_t buflen,
8195 int flags,
8196 int prec,
8197 int type,
8198 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008199{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008200 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008201 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8202 * + 1 + 1
8203 * = 24
8204 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008205 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008206 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008207 long x;
8208
8209 x = PyInt_AsLong(v);
8210 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008211 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008212 if (x < 0 && type == 'u') {
8213 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008214 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008215 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8216 sign = "-";
8217 else
8218 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008219 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008220 prec = 1;
8221
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008222 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8223 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008224 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008225 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008226 PyErr_SetString(PyExc_OverflowError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008227 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008228 return -1;
8229 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008230
8231 if ((flags & F_ALT) &&
8232 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00008233 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008234 * of issues that cause pain:
8235 * - when 0 is being converted, the C standard leaves off
8236 * the '0x' or '0X', which is inconsistent with other
8237 * %#x/%#X conversions and inconsistent with Python's
8238 * hex() function
8239 * - there are platforms that violate the standard and
8240 * convert 0 with the '0x' or '0X'
8241 * (Metrowerks, Compaq Tru64)
8242 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008243 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008244 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008245 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008246 * We can achieve the desired consistency by inserting our
8247 * own '0x' or '0X' prefix, and substituting %x/%X in place
8248 * of %#x/%#X.
8249 *
8250 * Note that this is the same approach as used in
8251 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008252 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008253 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8254 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008255 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008256 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008257 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8258 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008259 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008260 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008261 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008262 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008263 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008264 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008265}
8266
8267static int
8268formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008269 size_t buflen,
8270 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008271{
Ezio Melotti32125152010-02-25 17:36:04 +00008272 PyObject *unistr;
8273 char *str;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008274 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008275 if (PyUnicode_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008276 if (PyUnicode_GET_SIZE(v) != 1)
8277 goto onError;
8278 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008279 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008280
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008281 else if (PyString_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008282 if (PyString_GET_SIZE(v) != 1)
8283 goto onError;
Ezio Melotti32125152010-02-25 17:36:04 +00008284 /* #7649: "u'%c' % char" should behave like "u'%s' % char" and fail
8285 with a UnicodeDecodeError if 'char' is not decodable with the
8286 default encoding (usually ASCII, but it might be something else) */
8287 str = PyString_AS_STRING(v);
8288 if ((unsigned char)str[0] > 0x7F) {
8289 /* the char is not ASCII; try to decode the string using the
8290 default encoding and return -1 to let the UnicodeDecodeError
8291 be raised if the string can't be decoded */
8292 unistr = PyUnicode_Decode(str, 1, NULL, "strict");
8293 if (unistr == NULL)
8294 return -1;
8295 buf[0] = PyUnicode_AS_UNICODE(unistr)[0];
8296 Py_DECREF(unistr);
8297 }
8298 else
8299 buf[0] = (Py_UNICODE)str[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008300 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008301
8302 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008303 /* Integer input truncated to a character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008304 long x;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008305 x = PyInt_AsLong(v);
8306 if (x == -1 && PyErr_Occurred())
8307 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008308#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008309 if (x < 0 || x > 0x10ffff) {
8310 PyErr_SetString(PyExc_OverflowError,
8311 "%c arg not in range(0x110000) "
8312 "(wide Python build)");
8313 return -1;
8314 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008315#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008316 if (x < 0 || x > 0xffff) {
8317 PyErr_SetString(PyExc_OverflowError,
8318 "%c arg not in range(0x10000) "
8319 "(narrow Python build)");
8320 return -1;
8321 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008322#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008323 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008324 }
8325 buf[1] = '\0';
8326 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008327
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008328 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008329 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008330 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008331 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008332}
8333
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008334/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8335
Mark Dickinson18cfada2009-11-23 18:46:41 +00008336 FORMATBUFLEN is the length of the buffer in which the ints &
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008337 chars are formatted. XXX This is a magic number. Each formatting
8338 routine does bounds checking to ensure no overflow, but a better
8339 solution may be to malloc a buffer of appropriate size for each
8340 format. For now, the current solution is sufficient.
8341*/
/* Number of Py_UNICODE slots in the scratch buffer used for a single
   formatted int or char.  The expansion is fully parenthesized so the
   cast cannot re-associate with neighbouring operators at the point of
   use (e.g. `sizeof FORMATBUFLEN`). */
#define FORMATBUFLEN ((size_t)120)
8343
Guido van Rossumd57fd912000-03-10 22:53:23 +00008344PyObject *PyUnicode_Format(PyObject *format,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008345 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008346{
8347 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008348 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008349 int args_owned = 0;
8350 PyUnicodeObject *result = NULL;
8351 PyObject *dict = NULL;
8352 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008353
Guido van Rossumd57fd912000-03-10 22:53:23 +00008354 if (format == NULL || args == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008355 PyErr_BadInternalCall();
8356 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008357 }
8358 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008359 if (uformat == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008360 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008361 fmt = PyUnicode_AS_UNICODE(uformat);
8362 fmtcnt = PyUnicode_GET_SIZE(uformat);
8363
8364 reslen = rescnt = fmtcnt + 100;
8365 result = _PyUnicode_New(reslen);
8366 if (result == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008367 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008368 res = PyUnicode_AS_UNICODE(result);
8369
8370 if (PyTuple_Check(args)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008371 arglen = PyTuple_Size(args);
8372 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008373 }
8374 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008375 arglen = -1;
8376 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008377 }
Benjamin Petersonda2c7eb2013-03-23 22:32:00 -05008378 if (Py_TYPE(args)->tp_as_mapping && Py_TYPE(args)->tp_as_mapping->mp_subscript &&
8379 !PyTuple_Check(args) && !PyObject_TypeCheck(args, &PyBaseString_Type))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008380 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008381
8382 while (--fmtcnt >= 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008383 if (*fmt != '%') {
8384 if (--rescnt < 0) {
8385 rescnt = fmtcnt + 100;
8386 reslen += rescnt;
8387 if (_PyUnicode_Resize(&result, reslen) < 0)
8388 goto onError;
8389 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8390 --rescnt;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008391 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008392 *res++ = *fmt++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008393 }
8394 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008395 /* Got a format specifier */
8396 int flags = 0;
8397 Py_ssize_t width = -1;
8398 int prec = -1;
8399 Py_UNICODE c = '\0';
8400 Py_UNICODE fill;
8401 int isnumok;
8402 PyObject *v = NULL;
8403 PyObject *temp = NULL;
8404 Py_UNICODE *pbuf;
8405 Py_UNICODE sign;
8406 Py_ssize_t len;
Mark Dickinson18cfada2009-11-23 18:46:41 +00008407 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008408
8409 fmt++;
8410 if (*fmt == '(') {
8411 Py_UNICODE *keystart;
8412 Py_ssize_t keylen;
8413 PyObject *key;
8414 int pcount = 1;
8415
8416 if (dict == NULL) {
8417 PyErr_SetString(PyExc_TypeError,
8418 "format requires a mapping");
8419 goto onError;
8420 }
8421 ++fmt;
8422 --fmtcnt;
8423 keystart = fmt;
8424 /* Skip over balanced parentheses */
8425 while (pcount > 0 && --fmtcnt >= 0) {
8426 if (*fmt == ')')
8427 --pcount;
8428 else if (*fmt == '(')
8429 ++pcount;
8430 fmt++;
8431 }
8432 keylen = fmt - keystart - 1;
8433 if (fmtcnt < 0 || pcount > 0) {
8434 PyErr_SetString(PyExc_ValueError,
8435 "incomplete format key");
8436 goto onError;
8437 }
8438#if 0
8439 /* keys are converted to strings using UTF-8 and
8440 then looked up since Python uses strings to hold
8441 variables names etc. in its namespaces and we
8442 wouldn't want to break common idioms. */
8443 key = PyUnicode_EncodeUTF8(keystart,
8444 keylen,
8445 NULL);
8446#else
8447 key = PyUnicode_FromUnicode(keystart, keylen);
8448#endif
8449 if (key == NULL)
8450 goto onError;
8451 if (args_owned) {
8452 Py_DECREF(args);
8453 args_owned = 0;
8454 }
8455 args = PyObject_GetItem(dict, key);
8456 Py_DECREF(key);
8457 if (args == NULL) {
8458 goto onError;
8459 }
8460 args_owned = 1;
8461 arglen = -1;
8462 argidx = -2;
8463 }
8464 while (--fmtcnt >= 0) {
8465 switch (c = *fmt++) {
8466 case '-': flags |= F_LJUST; continue;
8467 case '+': flags |= F_SIGN; continue;
8468 case ' ': flags |= F_BLANK; continue;
8469 case '#': flags |= F_ALT; continue;
8470 case '0': flags |= F_ZERO; continue;
8471 }
8472 break;
8473 }
8474 if (c == '*') {
8475 v = getnextarg(args, arglen, &argidx);
8476 if (v == NULL)
8477 goto onError;
8478 if (!PyInt_Check(v)) {
8479 PyErr_SetString(PyExc_TypeError,
8480 "* wants int");
8481 goto onError;
8482 }
Serhiy Storchaka74f49ab2013-01-19 12:55:39 +02008483 width = PyInt_AsSsize_t(v);
8484 if (width == -1 && PyErr_Occurred())
8485 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008486 if (width < 0) {
8487 flags |= F_LJUST;
8488 width = -width;
8489 }
8490 if (--fmtcnt >= 0)
8491 c = *fmt++;
8492 }
8493 else if (c >= '0' && c <= '9') {
8494 width = c - '0';
8495 while (--fmtcnt >= 0) {
8496 c = *fmt++;
8497 if (c < '0' || c > '9')
8498 break;
Mark Dickinson75d36002012-10-28 10:00:46 +00008499 if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008500 PyErr_SetString(PyExc_ValueError,
8501 "width too big");
8502 goto onError;
8503 }
8504 width = width*10 + (c - '0');
8505 }
8506 }
8507 if (c == '.') {
8508 prec = 0;
8509 if (--fmtcnt >= 0)
8510 c = *fmt++;
8511 if (c == '*') {
8512 v = getnextarg(args, arglen, &argidx);
8513 if (v == NULL)
8514 goto onError;
8515 if (!PyInt_Check(v)) {
8516 PyErr_SetString(PyExc_TypeError,
8517 "* wants int");
8518 goto onError;
8519 }
Serhiy Storchaka74f49ab2013-01-19 12:55:39 +02008520 prec = _PyInt_AsInt(v);
8521 if (prec == -1 && PyErr_Occurred())
8522 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008523 if (prec < 0)
8524 prec = 0;
8525 if (--fmtcnt >= 0)
8526 c = *fmt++;
8527 }
8528 else if (c >= '0' && c <= '9') {
8529 prec = c - '0';
8530 while (--fmtcnt >= 0) {
Stefan Krah0b9201f2010-07-19 18:06:46 +00008531 c = *fmt++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008532 if (c < '0' || c > '9')
8533 break;
Mark Dickinson75d36002012-10-28 10:00:46 +00008534 if (prec > (INT_MAX - ((int)c - '0')) / 10) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008535 PyErr_SetString(PyExc_ValueError,
8536 "prec too big");
8537 goto onError;
8538 }
8539 prec = prec*10 + (c - '0');
8540 }
8541 }
8542 } /* prec */
8543 if (fmtcnt >= 0) {
8544 if (c == 'h' || c == 'l' || c == 'L') {
8545 if (--fmtcnt >= 0)
8546 c = *fmt++;
8547 }
8548 }
8549 if (fmtcnt < 0) {
8550 PyErr_SetString(PyExc_ValueError,
8551 "incomplete format");
8552 goto onError;
8553 }
8554 if (c != '%') {
8555 v = getnextarg(args, arglen, &argidx);
8556 if (v == NULL)
8557 goto onError;
8558 }
8559 sign = 0;
8560 fill = ' ';
8561 switch (c) {
8562
8563 case '%':
8564 pbuf = formatbuf;
8565 /* presume that buffer length is at least 1 */
8566 pbuf[0] = '%';
8567 len = 1;
8568 break;
8569
8570 case 's':
8571 case 'r':
Victor Stinner95affc42010-03-22 12:24:37 +00008572 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008573 temp = v;
8574 Py_INCREF(temp);
8575 }
8576 else {
8577 PyObject *unicode;
8578 if (c == 's')
8579 temp = PyObject_Unicode(v);
8580 else
8581 temp = PyObject_Repr(v);
8582 if (temp == NULL)
8583 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008584 if (PyUnicode_Check(temp))
8585 /* nothing to do */;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008586 else if (PyString_Check(temp)) {
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008587 /* convert to string to Unicode */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008588 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8589 PyString_GET_SIZE(temp),
8590 NULL,
8591 "strict");
8592 Py_DECREF(temp);
8593 temp = unicode;
8594 if (temp == NULL)
8595 goto onError;
8596 }
8597 else {
8598 Py_DECREF(temp);
8599 PyErr_SetString(PyExc_TypeError,
8600 "%s argument has non-string str()");
8601 goto onError;
8602 }
8603 }
8604 pbuf = PyUnicode_AS_UNICODE(temp);
8605 len = PyUnicode_GET_SIZE(temp);
8606 if (prec >= 0 && len > prec)
8607 len = prec;
8608 break;
8609
8610 case 'i':
8611 case 'd':
8612 case 'u':
8613 case 'o':
8614 case 'x':
8615 case 'X':
8616 if (c == 'i')
8617 c = 'd';
8618 isnumok = 0;
8619 if (PyNumber_Check(v)) {
8620 PyObject *iobj=NULL;
8621
8622 if (PyInt_Check(v) || (PyLong_Check(v))) {
8623 iobj = v;
8624 Py_INCREF(iobj);
8625 }
8626 else {
8627 iobj = PyNumber_Int(v);
8628 if (iobj==NULL) iobj = PyNumber_Long(v);
8629 }
8630 if (iobj!=NULL) {
8631 if (PyInt_Check(iobj)) {
8632 isnumok = 1;
8633 pbuf = formatbuf;
8634 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8635 flags, prec, c, iobj);
8636 Py_DECREF(iobj);
8637 if (len < 0)
8638 goto onError;
8639 sign = 1;
8640 }
8641 else if (PyLong_Check(iobj)) {
8642 isnumok = 1;
8643 temp = formatlong(iobj, flags, prec, c);
8644 Py_DECREF(iobj);
8645 if (!temp)
8646 goto onError;
8647 pbuf = PyUnicode_AS_UNICODE(temp);
8648 len = PyUnicode_GET_SIZE(temp);
8649 sign = 1;
8650 }
8651 else {
8652 Py_DECREF(iobj);
8653 }
8654 }
8655 }
8656 if (!isnumok) {
8657 PyErr_Format(PyExc_TypeError,
8658 "%%%c format: a number is required, "
8659 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8660 goto onError;
8661 }
8662 if (flags & F_ZERO)
8663 fill = '0';
8664 break;
8665
8666 case 'e':
8667 case 'E':
8668 case 'f':
8669 case 'F':
8670 case 'g':
8671 case 'G':
Mark Dickinson18cfada2009-11-23 18:46:41 +00008672 temp = formatfloat(v, flags, prec, c);
8673 if (temp == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008674 goto onError;
Mark Dickinson18cfada2009-11-23 18:46:41 +00008675 pbuf = PyUnicode_AS_UNICODE(temp);
8676 len = PyUnicode_GET_SIZE(temp);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008677 sign = 1;
8678 if (flags & F_ZERO)
8679 fill = '0';
8680 break;
8681
8682 case 'c':
8683 pbuf = formatbuf;
8684 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8685 if (len < 0)
8686 goto onError;
8687 break;
8688
8689 default:
8690 PyErr_Format(PyExc_ValueError,
8691 "unsupported format character '%c' (0x%x) "
8692 "at index %zd",
8693 (31<=c && c<=126) ? (char)c : '?',
8694 (int)c,
8695 (Py_ssize_t)(fmt - 1 -
8696 PyUnicode_AS_UNICODE(uformat)));
Benjamin Peterson857ce152009-01-31 16:29:18 +00008697 goto onError;
8698 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008699 if (sign) {
8700 if (*pbuf == '-' || *pbuf == '+') {
8701 sign = *pbuf++;
8702 len--;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008703 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008704 else if (flags & F_SIGN)
8705 sign = '+';
8706 else if (flags & F_BLANK)
8707 sign = ' ';
8708 else
8709 sign = 0;
8710 }
8711 if (width < len)
8712 width = len;
8713 if (rescnt - (sign != 0) < width) {
8714 reslen -= rescnt;
8715 rescnt = width + fmtcnt + 100;
8716 reslen += rescnt;
8717 if (reslen < 0) {
8718 Py_XDECREF(temp);
8719 PyErr_NoMemory();
8720 goto onError;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008721 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008722 if (_PyUnicode_Resize(&result, reslen) < 0) {
8723 Py_XDECREF(temp);
8724 goto onError;
8725 }
8726 res = PyUnicode_AS_UNICODE(result)
8727 + reslen - rescnt;
8728 }
8729 if (sign) {
8730 if (fill != ' ')
8731 *res++ = sign;
8732 rescnt--;
8733 if (width > len)
8734 width--;
8735 }
8736 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8737 assert(pbuf[0] == '0');
8738 assert(pbuf[1] == c);
8739 if (fill != ' ') {
8740 *res++ = *pbuf++;
8741 *res++ = *pbuf++;
8742 }
8743 rescnt -= 2;
8744 width -= 2;
8745 if (width < 0)
8746 width = 0;
8747 len -= 2;
8748 }
8749 if (width > len && !(flags & F_LJUST)) {
8750 do {
8751 --rescnt;
8752 *res++ = fill;
8753 } while (--width > len);
8754 }
8755 if (fill == ' ') {
8756 if (sign)
8757 *res++ = sign;
8758 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8759 assert(pbuf[0] == '0');
8760 assert(pbuf[1] == c);
8761 *res++ = *pbuf++;
8762 *res++ = *pbuf++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008763 }
8764 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008765 Py_UNICODE_COPY(res, pbuf, len);
8766 res += len;
8767 rescnt -= len;
8768 while (--width >= len) {
8769 --rescnt;
8770 *res++ = ' ';
8771 }
8772 if (dict && (argidx < arglen) && c != '%') {
8773 PyErr_SetString(PyExc_TypeError,
8774 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008775 Py_XDECREF(temp);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008776 goto onError;
8777 }
8778 Py_XDECREF(temp);
8779 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008780 } /* until end */
8781 if (argidx < arglen && !dict) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008782 PyErr_SetString(PyExc_TypeError,
8783 "not all arguments converted during string formatting");
8784 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008785 }
8786
Thomas Woutersa96affe2006-03-12 00:29:36 +00008787 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008788 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008789 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008790 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008791 }
8792 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008793 return (PyObject *)result;
8794
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008795 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008796 Py_XDECREF(result);
8797 Py_DECREF(uformat);
8798 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008799 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008800 }
8801 return NULL;
8802}
8803
8804static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008805 (readbufferproc) unicode_buffer_getreadbuf,
8806 (writebufferproc) unicode_buffer_getwritebuf,
8807 (segcountproc) unicode_buffer_getsegcount,
8808 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008809};
8810
Jeremy Hylton938ace62002-07-17 16:30:39 +00008811static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008812unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8813
Tim Peters6d6c1a32001-08-02 04:15:00 +00008814static PyObject *
8815unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8816{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008817 PyObject *x = NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008818 static char *kwlist[] = {"string", "encoding", "errors", 0};
8819 char *encoding = NULL;
8820 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00008821
Benjamin Peterson857ce152009-01-31 16:29:18 +00008822 if (type != &PyUnicode_Type)
8823 return unicode_subtype_new(type, args, kwds);
8824 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008825 kwlist, &x, &encoding, &errors))
Benjamin Peterson857ce152009-01-31 16:29:18 +00008826 return NULL;
8827 if (x == NULL)
8828 return (PyObject *)_PyUnicode_New(0);
8829 if (encoding == NULL && errors == NULL)
8830 return PyObject_Unicode(x);
8831 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008832 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008833}
8834
Guido van Rossume023fe02001-08-30 03:12:59 +00008835static PyObject *
8836unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8837{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008838 PyUnicodeObject *tmp, *pnew;
8839 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008840
Benjamin Peterson857ce152009-01-31 16:29:18 +00008841 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8842 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8843 if (tmp == NULL)
8844 return NULL;
8845 assert(PyUnicode_Check(tmp));
8846 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8847 if (pnew == NULL) {
8848 Py_DECREF(tmp);
8849 return NULL;
8850 }
8851 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8852 if (pnew->str == NULL) {
8853 _Py_ForgetReference((PyObject *)pnew);
8854 PyObject_Del(pnew);
8855 Py_DECREF(tmp);
8856 return PyErr_NoMemory();
8857 }
8858 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8859 pnew->length = n;
8860 pnew->hash = tmp->hash;
8861 Py_DECREF(tmp);
8862 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008863}
8864
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008865PyDoc_STRVAR(unicode_doc,
Chris Jerdonekad4b0002012-10-07 20:37:54 -07008866 "unicode(object='') -> unicode object\n\
8867unicode(string[, encoding[, errors]]) -> unicode object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00008868\n\
8869Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008870encoding defaults to the current default string encoding.\n\
8871errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008872
Guido van Rossumd57fd912000-03-10 22:53:23 +00008873PyTypeObject PyUnicode_Type = {
Martin v. Löwis68192102007-07-21 06:55:02 +00008874 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00008875 "unicode", /* tp_name */
8876 sizeof(PyUnicodeObject), /* tp_size */
8877 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008878 /* Slots */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008879 (destructor)unicode_dealloc, /* tp_dealloc */
8880 0, /* tp_print */
8881 0, /* tp_getattr */
8882 0, /* tp_setattr */
8883 0, /* tp_compare */
8884 unicode_repr, /* tp_repr */
8885 &unicode_as_number, /* tp_as_number */
8886 &unicode_as_sequence, /* tp_as_sequence */
8887 &unicode_as_mapping, /* tp_as_mapping */
8888 (hashfunc) unicode_hash, /* tp_hash*/
8889 0, /* tp_call*/
8890 (reprfunc) unicode_str, /* tp_str */
8891 PyObject_GenericGetAttr, /* tp_getattro */
8892 0, /* tp_setattro */
8893 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008894 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008895 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008896 unicode_doc, /* tp_doc */
8897 0, /* tp_traverse */
8898 0, /* tp_clear */
8899 PyUnicode_RichCompare, /* tp_richcompare */
8900 0, /* tp_weaklistoffset */
8901 0, /* tp_iter */
8902 0, /* tp_iternext */
8903 unicode_methods, /* tp_methods */
8904 0, /* tp_members */
8905 0, /* tp_getset */
8906 &PyBaseString_Type, /* tp_base */
8907 0, /* tp_dict */
8908 0, /* tp_descr_get */
8909 0, /* tp_descr_set */
8910 0, /* tp_dictoffset */
8911 0, /* tp_init */
8912 0, /* tp_alloc */
8913 unicode_new, /* tp_new */
8914 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008915};
8916
8917/* Initialize the Unicode implementation */
8918
Thomas Wouters78890102000-07-22 19:25:51 +00008919void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008920{
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008921 /* XXX - move this array to unicodectype.c ? */
8922 Py_UNICODE linebreak[] = {
8923 0x000A, /* LINE FEED */
8924 0x000D, /* CARRIAGE RETURN */
8925 0x001C, /* FILE SEPARATOR */
8926 0x001D, /* GROUP SEPARATOR */
8927 0x001E, /* RECORD SEPARATOR */
8928 0x0085, /* NEXT LINE */
8929 0x2028, /* LINE SEPARATOR */
8930 0x2029, /* PARAGRAPH SEPARATOR */
8931 };
8932
Fred Drakee4315f52000-05-09 19:53:39 +00008933 /* Init the implementation */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02008934 if (!unicode_empty) {
8935 unicode_empty = _PyUnicode_New(0);
8936 if (!unicode_empty)
8937 return;
8938 }
Neal Norwitze1fdb322006-07-21 05:32:28 +00008939
Guido van Rossumcacfc072002-05-24 19:01:59 +00008940 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008941 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008942
8943 /* initialize the linebreak bloom filter */
8944 bloom_linebreak = make_bloom_mask(
8945 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8946 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00008947
8948 PyType_Ready(&EncodingMapType);
Benjamin Peterson6da3ed62012-10-30 23:21:10 -04008949
8950 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
8951 Py_FatalError("Can't initialize field name iterator type");
8952
8953 if (PyType_Ready(&PyFormatterIter_Type) < 0)
8954 Py_FatalError("Can't initialize formatter iter type");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008955}
8956
8957/* Finalize the Unicode implementation */
8958
Christian Heimes3b718a72008-02-14 12:47:33 +00008959int
8960PyUnicode_ClearFreeList(void)
8961{
8962 int freelist_size = numfree;
8963 PyUnicodeObject *u;
8964
8965 for (u = free_list; u != NULL;) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008966 PyUnicodeObject *v = u;
8967 u = *(PyUnicodeObject **)u;
8968 if (v->str)
8969 PyObject_DEL(v->str);
8970 Py_XDECREF(v->defenc);
8971 PyObject_Del(v);
8972 numfree--;
Christian Heimes3b718a72008-02-14 12:47:33 +00008973 }
8974 free_list = NULL;
8975 assert(numfree == 0);
8976 return freelist_size;
8977}
8978
Guido van Rossumd57fd912000-03-10 22:53:23 +00008979void
Thomas Wouters78890102000-07-22 19:25:51 +00008980_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008981{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008982 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008983
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02008984 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008985
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02008986 for (i = 0; i < 256; i++)
8987 Py_CLEAR(unicode_latin1[i]);
8988
Christian Heimes3b718a72008-02-14 12:47:33 +00008989 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00008990}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008991
Anthony Baxterac6bd462006-04-13 02:06:09 +00008992#ifdef __cplusplus
8993}
8994#endif