/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Unicode Integration Proposal (see file Misc/unicode.txt).

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Guido van Rossumd57fd912000-03-10 22:53:23 +000052/* Limit for the Unicode object free list */
53
Christian Heimes5b970ad2008-02-06 13:33:44 +000054#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000055
56/* Limit for the Unicode object free list stay alive optimization.
57
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000060 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000061
Christian Heimes5b970ad2008-02-06 13:33:44 +000062 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000063 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000064 malloc()-overhead) bytes of unused garbage.
65
66 Setting the limit to 0 effectively turns the feature off.
67
Guido van Rossumfd4b9572000-04-10 13:51:10 +000068 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71*/
72
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
75/* Endianness switches; defaults to little endian */
76
77#ifdef WORDS_BIGENDIAN
78# define BYTEORDER_IS_BIG_ENDIAN
79#else
80# define BYTEORDER_IS_LITTLE_ENDIAN
81#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
Serhiy Storchakac59c85c2013-01-26 12:13:40 +020085NOTE: In the interpreter's initialization phase, some globals are currently
86 initialized dynamically as needed. In the process Unicode objects may
87 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000088
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Anthony Baxterac6bd462006-04-13 02:06:09 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Guido van Rossumd57fd912000-03-10 22:53:23 +000096/* Free list for Unicode objects */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +020097static PyUnicodeObject *free_list = NULL;
98static int numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000099
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000100/* The empty Unicode object is shared to improve performance. */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200101static PyUnicodeObject *unicode_empty = NULL;
102
/* Return a new reference to the shared empty string from the enclosing
   function.  The singleton is created with _PyUnicode_New(0) the first
   time it is needed; if that fails, unicode_empty stays NULL and NULL is
   what the enclosing function returns. */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        if (unicode_empty == NULL)                      \
            unicode_empty = _PyUnicode_New(0);          \
        if (unicode_empty != NULL)                      \
            Py_INCREF(unicode_empty);                   \
        return (PyObject *)unicode_empty;               \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000114
115/* Single character Unicode strings in the Latin-1 range are being
116 shared as well. */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200117static PyUnicodeObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000118
Fred Drakee4315f52000-05-09 19:53:39 +0000119/* Default encoding to use and assume when NULL is passed as encoding
120 parameter; it is initialized by _PyUnicode_Init().
121
122 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000123 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000124
125*/
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200126static char unicode_default_encoding[100 + 1] = "ascii";
Fred Drakee4315f52000-05-09 19:53:39 +0000127
/* Fast detection of the most frequent whitespace characters: one flag per
   ASCII code point (index 0x00..0x7F), 1 where the code point is
   whitespace. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
158
/* Same for linebreaks: one flag per ASCII code point (index 0x00..0x7F),
   1 where the code point is a line break. */
static unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
186
187
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000188Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000189PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000190{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000191#ifdef Py_UNICODE_WIDE
Benjamin Peterson857ce152009-01-31 16:29:18 +0000192 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000193#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000194 /* This is actually an illegal character, so it should
195 not be passed to unichr. */
196 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000197#endif
198}
199
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000200/* --- Bloom Filters ----------------------------------------------------- */
201
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low-order
   bits of each Unicode character (its value modulo BLOOM_WIDTH) as the
   bit index. */
205
206/* the linebreak mask is set up by Unicode_Init below */
207
Antoine Pitrou10042922010-01-13 14:01:26 +0000208#if LONG_BIT >= 128
209#define BLOOM_WIDTH 128
210#elif LONG_BIT >= 64
211#define BLOOM_WIDTH 64
212#elif LONG_BIT >= 32
213#define BLOOM_WIDTH 32
214#else
215#error "LONG_BIT is smaller than 32"
216#endif
217
/* Integer type that holds one bloom bitmask. */
#define BLOOM_MASK unsigned long

/* Starts with every bit set, so that BLOOM_LINEBREAK() always falls
   through to the exact Py_UNICODE_ISLINEBREAK() test until a real mask
   is stored here. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* BLOOM_ADD sets, and BLOOM tests, the bit selected by the low-order bits
   of `ch` (ch modulo BLOOM_WIDTH).  BLOOM yields a non-zero value when
   the bit is set.  `mask` is parenthesized so that compound expressions
   such as `a | b` are tested as a whole. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Line break test: ASCII goes through the ascii_linebreak table; other
   characters are first screened by the bloom mask and only then checked
   exactly. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000228
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000229Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000230{
231 /* calculate simple bloom-style bitmask for a given unicode string */
232
Antoine Pitrou10042922010-01-13 14:01:26 +0000233 BLOOM_MASK mask;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000234 Py_ssize_t i;
235
236 mask = 0;
237 for (i = 0; i < len; i++)
Antoine Pitrou64672132010-01-13 07:55:48 +0000238 BLOOM_ADD(mask, ptr[i]);
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000239
240 return mask;
241}
242
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000243Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000244{
245 Py_ssize_t i;
246
247 for (i = 0; i < setlen; i++)
248 if (set[i] == chr)
249 return 1;
250
Fredrik Lundh77633512006-05-23 19:47:35 +0000251 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000252}
253
/* Membership test with a bloom pre-filter: the exact (linear)
   unicode_member() scan only runs when the bloom bit for `chr` is set in
   `mask`.  The expansion is fully parenthesized so the macro can be
   negated or combined with other operators safely. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
256
Guido van Rossumd57fd912000-03-10 22:53:23 +0000257/* --- Unicode Object ----------------------------------------------------- */
258
259static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000260int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000261 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262{
263 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000264
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000265 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 if (unicode->length == length)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000267 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000268
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000269 /* Resizing shared object (unicode_empty or single character
270 objects) in-place is not allowed. Use PyUnicode_Resize()
271 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000272
Benjamin Peterson857ce152009-01-31 16:29:18 +0000273 if (unicode == unicode_empty ||
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000274 (unicode->length == 1 &&
275 unicode->str[0] < 256U &&
276 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000278 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279 return -1;
280 }
281
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000282 /* We allocate one more byte to make sure the string is Ux0000 terminated.
283 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000284 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000285 it contains). */
286
Guido van Rossumd57fd912000-03-10 22:53:23 +0000287 oldstr = unicode->str;
Neal Norwitz419fd492008-03-17 20:22:43 +0000288 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000289 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000291 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000292 PyErr_NoMemory();
293 return -1;
294 }
295 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000296 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000298 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000300 if (unicode->defenc) {
Georg Brandld070cc52010-08-01 21:06:46 +0000301 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000302 }
303 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000304
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305 return 0;
306}
307
/* We allocate one more Py_UNICODE element to make sure the string is
   Ux0000 terminated; some code relies on that.

   XXX This allocator could further be enhanced by assuring that the
       free list never reduces its size below 1.

*/
315
316static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000317PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000318{
319 register PyUnicodeObject *unicode;
320
Andrew Dalkee0df7622006-05-27 11:04:36 +0000321 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000322 if (length == 0 && unicode_empty != NULL) {
323 Py_INCREF(unicode_empty);
324 return unicode_empty;
325 }
326
Neal Norwitze7d8be82008-07-31 17:17:14 +0000327 /* Ensure we won't overflow the size. */
328 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
329 return (PyUnicodeObject *)PyErr_NoMemory();
330 }
331
Guido van Rossumd57fd912000-03-10 22:53:23 +0000332 /* Unicode freelist & memory allocation */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000333 if (free_list) {
334 unicode = free_list;
335 free_list = *(PyUnicodeObject **)unicode;
336 numfree--;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000337 if (unicode->str) {
338 /* Keep-Alive optimization: we only upsize the buffer,
339 never downsize it. */
340 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000341 unicode_resize(unicode, length) < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000342 PyObject_DEL(unicode->str);
343 unicode->str = NULL;
344 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000345 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000346 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000347 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
348 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000349 }
350 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000351 }
352 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000353 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000354 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000355 if (unicode == NULL)
356 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000357 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
358 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000359 }
360
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000361 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000362 PyErr_NoMemory();
363 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000364 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000365 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000366 * the caller fails before initializing str -- unicode_resize()
367 * reads str[0], and the Keep-Alive optimization can keep memory
368 * allocated for str alive across a call to unicode_dealloc(unicode).
369 * We don't want unicode_resize to read uninitialized memory in
370 * that case.
371 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000372 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000373 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000374 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000375 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000376 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000377 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000378
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000379 onError:
Amaury Forgeot d'Arc06847b12008-07-31 23:39:05 +0000380 /* XXX UNREF/NEWREF interface should be more symmetrical */
381 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000382 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000383 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000384 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000385}
386
387static
Guido van Rossum9475a232001-10-05 20:51:39 +0000388void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000390 if (PyUnicode_CheckExact(unicode) &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000391 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000392 /* Keep-Alive optimization */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000393 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
394 PyObject_DEL(unicode->str);
395 unicode->str = NULL;
396 unicode->length = 0;
397 }
398 if (unicode->defenc) {
Georg Brandld070cc52010-08-01 21:06:46 +0000399 Py_CLEAR(unicode->defenc);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000400 }
401 /* Add to free list */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000402 *(PyUnicodeObject **)unicode = free_list;
403 free_list = unicode;
404 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000405 }
406 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000407 PyObject_DEL(unicode->str);
408 Py_XDECREF(unicode->defenc);
409 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000410 }
411}
412
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000413static
414int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000415{
416 register PyUnicodeObject *v;
417
418 /* Argument checks */
419 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000420 PyErr_BadInternalCall();
421 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000422 }
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000423 v = *unicode;
Christian Heimese93237d2007-12-19 02:37:44 +0000424 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000425 PyErr_BadInternalCall();
426 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000427 }
428
429 /* Resizing unicode_empty and single character objects is not
430 possible since these are being shared. We simply return a fresh
431 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000432 if (v->length != length &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000433 (v == unicode_empty || v->length == 1)) {
434 PyUnicodeObject *w = _PyUnicode_New(length);
435 if (w == NULL)
436 return -1;
437 Py_UNICODE_COPY(w->str, v->str,
438 length < v->length ? length : v->length);
439 Py_DECREF(*unicode);
440 *unicode = w;
441 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000442 }
443
444 /* Note that we don't have to modify *unicode for unshared Unicode
445 objects, since we can modify them in-place. */
446 return unicode_resize(v, length);
447}
448
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000449int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
450{
451 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
452}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000453
Guido van Rossumd57fd912000-03-10 22:53:23 +0000454PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000455 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000456{
457 PyUnicodeObject *unicode;
458
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000459 /* If the Unicode data is known at construction time, we can apply
460 some optimizations which share commonly used objects. */
461 if (u != NULL) {
462
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000463 /* Optimization for empty strings */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200464 if (size == 0)
465 _Py_RETURN_UNICODE_EMPTY();
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000466
467 /* Single character Unicode objects in the Latin-1 range are
468 shared when using this constructor */
469 if (size == 1 && *u < 256) {
470 unicode = unicode_latin1[*u];
471 if (!unicode) {
472 unicode = _PyUnicode_New(1);
473 if (!unicode)
474 return NULL;
475 unicode->str[0] = *u;
476 unicode_latin1[*u] = unicode;
477 }
478 Py_INCREF(unicode);
479 return (PyObject *)unicode;
480 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000481 }
Tim Petersced69f82003-09-16 20:30:58 +0000482
Guido van Rossumd57fd912000-03-10 22:53:23 +0000483 unicode = _PyUnicode_New(size);
484 if (!unicode)
485 return NULL;
486
487 /* Copy the Unicode data into the new object */
488 if (u != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000489 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000490
491 return (PyObject *)unicode;
492}
493
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000494PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
495{
496 PyUnicodeObject *unicode;
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000497
Benjamin Peterson857ce152009-01-31 16:29:18 +0000498 if (size < 0) {
499 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000500 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson857ce152009-01-31 16:29:18 +0000501 return NULL;
502 }
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000503
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000504 /* If the Unicode data is known at construction time, we can apply
505 some optimizations which share commonly used objects.
506 Also, this means the input must be UTF-8, so fall back to the
507 UTF-8 decoder at the end. */
508 if (u != NULL) {
509
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000510 /* Optimization for empty strings */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200511 if (size == 0)
512 _Py_RETURN_UNICODE_EMPTY();
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000513
514 /* Single characters are shared when using this constructor.
515 Restrict to ASCII, since the input must be UTF-8. */
516 if (size == 1 && Py_CHARMASK(*u) < 128) {
517 unicode = unicode_latin1[Py_CHARMASK(*u)];
518 if (!unicode) {
519 unicode = _PyUnicode_New(1);
520 if (!unicode)
521 return NULL;
522 unicode->str[0] = Py_CHARMASK(*u);
523 unicode_latin1[Py_CHARMASK(*u)] = unicode;
524 }
525 Py_INCREF(unicode);
526 return (PyObject *)unicode;
527 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000528
529 return PyUnicode_DecodeUTF8(u, size, NULL);
530 }
531
532 unicode = _PyUnicode_New(size);
533 if (!unicode)
534 return NULL;
535
536 return (PyObject *)unicode;
537}
538
539PyObject *PyUnicode_FromString(const char *u)
540{
541 size_t size = strlen(u);
542 if (size > PY_SSIZE_T_MAX) {
543 PyErr_SetString(PyExc_OverflowError, "input too long");
544 return NULL;
545 }
546
547 return PyUnicode_FromStringAndSize(u, size);
548}
549
/* _Py_UNICODE_NEXT is a private macro used to retrieve the character pointed
 * by 'ptr', possibly combining surrogate pairs on narrow builds.
 * 'ptr' and 'end' must be Py_UNICODE*, with 'ptr' pointing at the character
 * that should be returned and 'end' pointing to the end of the buffer.
 * ('end' is used on narrow builds to detect a lone surrogate at the
 * end of the buffer that should be returned unchanged.)
 * The ptr and end arguments should be side-effect free and ptr must be an
 * lvalue.
 * The type of the returned char is always Py_UCS4.
 *
 * Note: the macro advances ptr to next char, so it might have side-effects
 *       (especially if used with other macros).
 */

562
/* helper macros used by _Py_UNICODE_NEXT; `ch` is evaluated twice */
#define _Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define _Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value. */
#define _Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)

#ifdef Py_UNICODE_WIDE
/* Wide build: every element is a full code point; `end` is unused. */
#define _Py_UNICODE_NEXT(ptr, end) ((Py_UCS4)*(ptr)++)
#else
/* Narrow build: a pair is joined only when both of its units lie before
   `end`.  The bounds test comes first, so (ptr)[1] is never read for a
   high surrogate sitting in the last slot; that lone surrogate is
   returned unchanged. */
#define _Py_UNICODE_NEXT(ptr, end)                                      \
     ((((ptr) + 1 < (end)) &&                                           \
       _Py_UNICODE_IS_HIGH_SURROGATE(*(ptr)) &&                         \
       _Py_UNICODE_IS_LOW_SURROGATE((ptr)[1])) ?                        \
      ((ptr) += 2, _Py_UNICODE_JOIN_SURROGATES((ptr)[-2], (ptr)[-1])) : \
      (Py_UCS4)*(ptr)++)
#endif
580
Guido van Rossumd57fd912000-03-10 22:53:23 +0000581#ifdef HAVE_WCHAR_H
582
Mark Dickinson6b265f12009-03-18 16:07:26 +0000583#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
584# define CONVERT_WCHAR_TO_SURROGATES
585#endif
586
587#ifdef CONVERT_WCHAR_TO_SURROGATES
588
589/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
590 to convert from UTF32 to UTF16. */
591
592PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
593 Py_ssize_t size)
594{
595 PyUnicodeObject *unicode;
596 register Py_ssize_t i;
597 Py_ssize_t alloc;
598 const wchar_t *orig_w;
599
600 if (w == NULL) {
601 PyErr_BadInternalCall();
602 return NULL;
603 }
604
605 alloc = size;
606 orig_w = w;
607 for (i = size; i > 0; i--) {
608 if (*w > 0xFFFF)
609 alloc++;
610 w++;
611 }
612 w = orig_w;
613 unicode = _PyUnicode_New(alloc);
614 if (!unicode)
615 return NULL;
616
617 /* Copy the wchar_t data into the new object */
618 {
619 register Py_UNICODE *u;
620 u = PyUnicode_AS_UNICODE(unicode);
621 for (i = size; i > 0; i--) {
622 if (*w > 0xFFFF) {
623 wchar_t ordinal = *w++;
624 ordinal -= 0x10000;
625 *u++ = 0xD800 | (ordinal >> 10);
626 *u++ = 0xDC00 | (ordinal & 0x3FF);
627 }
628 else
629 *u++ = *w++;
630 }
631 }
632 return (PyObject *)unicode;
633}
634
635#else
636
Guido van Rossumd57fd912000-03-10 22:53:23 +0000637PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000638 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000639{
640 PyUnicodeObject *unicode;
641
642 if (w == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000643 PyErr_BadInternalCall();
644 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000645 }
646
647 unicode = _PyUnicode_New(size);
648 if (!unicode)
649 return NULL;
650
651 /* Copy the wchar_t data into the new object */
652#ifdef HAVE_USABLE_WCHAR_T
653 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000654#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000655 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000656 register Py_UNICODE *u;
657 register Py_ssize_t i;
658 u = PyUnicode_AS_UNICODE(unicode);
659 for (i = size; i > 0; i--)
660 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000661 }
662#endif
663
664 return (PyObject *)unicode;
665}
666
Mark Dickinson6b265f12009-03-18 16:07:26 +0000667#endif /* CONVERT_WCHAR_TO_SURROGATES */
668
669#undef CONVERT_WCHAR_TO_SURROGATES
670
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000671static void
672makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
673{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000674 *fmt++ = '%';
675 if (width) {
676 if (zeropad)
677 *fmt++ = '0';
678 fmt += sprintf(fmt, "%d", width);
679 }
680 if (precision)
681 fmt += sprintf(fmt, ".%d", precision);
682 if (longflag)
683 *fmt++ = 'l';
684 else if (size_tflag) {
685 char *f = PY_FORMAT_SIZE_T;
686 while (*f)
687 *fmt++ = *f++;
688 }
689 *fmt++ = c;
690 *fmt = '\0';
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000691}
692
/* Append the bytes of the NUL-terminated C string `string` to the output
   position `s`, one output element per byte (each byte is taken as an
   unsigned char).  Uses and advances the enclosing function's `copy` and
   `s` variables. */
#define appendstring(string)                            \
    do {                                                \
        copy = (string);                                \
        while (*copy) {                                 \
            *s++ = (unsigned char)*copy;                \
            copy++;                                     \
        }                                               \
    } while (0)
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000699
700PyObject *
701PyUnicode_FromFormatV(const char *format, va_list vargs)
702{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000703 va_list count;
704 Py_ssize_t callcount = 0;
705 PyObject **callresults = NULL;
706 PyObject **callresult = NULL;
707 Py_ssize_t n = 0;
708 int width = 0;
709 int precision = 0;
710 int zeropad;
711 const char* f;
712 Py_UNICODE *s;
713 PyObject *string;
714 /* used by sprintf */
715 char buffer[21];
716 /* use abuffer instead of buffer, if we need more space
717 * (which can happen if there's a format specifier with width). */
718 char *abuffer = NULL;
719 char *realbuffer;
720 Py_ssize_t abuffersize = 0;
721 char fmt[60]; /* should be enough for %0width.precisionld */
722 const char *copy;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000723
724#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson857ce152009-01-31 16:29:18 +0000725 Py_MEMCPY(count, vargs, sizeof(va_list));
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000726#else
727#ifdef __va_copy
Benjamin Peterson857ce152009-01-31 16:29:18 +0000728 __va_copy(count, vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000729#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000730 count = vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000731#endif
732#endif
Walter Dörwalded960ac2009-05-03 22:36:33 +0000733 /* step 1: count the number of %S/%R/%s format specifications
734 * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
735 * objects once during step 3 and put the result in an array) */
Benjamin Peterson857ce152009-01-31 16:29:18 +0000736 for (f = format; *f; f++) {
Walter Dörwalded960ac2009-05-03 22:36:33 +0000737 if (*f == '%') {
Serhiy Storchaka0e0282e2015-01-27 22:17:56 +0200738 f++;
739 while (*f && *f != '%' && !isalpha((unsigned)*f))
740 f++;
741 if (*f == 's' || *f=='S' || *f=='R')
Walter Dörwalded960ac2009-05-03 22:36:33 +0000742 ++callcount;
743 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000744 }
745 /* step 2: allocate memory for the results of
Walter Dörwalded960ac2009-05-03 22:36:33 +0000746 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson857ce152009-01-31 16:29:18 +0000747 if (callcount) {
748 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
749 if (!callresults) {
750 PyErr_NoMemory();
751 return NULL;
752 }
753 callresult = callresults;
754 }
755 /* step 3: figure out how large a buffer we need */
756 for (f = format; *f; f++) {
757 if (*f == '%') {
Serhiy Storchaka0e0282e2015-01-27 22:17:56 +0200758 const char* p = f++;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000759 width = 0;
760 while (isdigit((unsigned)*f))
761 width = (width*10) + *f++ - '0';
Serhiy Storchaka0e0282e2015-01-27 22:17:56 +0200762 precision = 0;
763 if (*f == '.') {
764 f++;
765 while (isdigit((unsigned)*f))
766 precision = (precision*10) + *f++ - '0';
767 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000768
Benjamin Peterson857ce152009-01-31 16:29:18 +0000769 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
770 * they don't affect the amount of space we reserve.
771 */
772 if ((*f == 'l' || *f == 'z') &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000773 (f[1] == 'd' || f[1] == 'u'))
774 ++f;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000775
Benjamin Peterson857ce152009-01-31 16:29:18 +0000776 switch (*f) {
777 case 'c':
Serhiy Storchakaba908c72013-06-23 20:22:09 +0300778 {
779 int ordinal = va_arg(count, int);
780#ifdef Py_UNICODE_WIDE
781 if (ordinal < 0 || ordinal > 0x10ffff) {
782 PyErr_SetString(PyExc_OverflowError,
783 "%c arg not in range(0x110000) "
784 "(wide Python build)");
785 goto fail;
786 }
787#else
788 if (ordinal < 0 || ordinal > 0xffff) {
789 PyErr_SetString(PyExc_OverflowError,
790 "%c arg not in range(0x10000) "
791 "(narrow Python build)");
792 goto fail;
793 }
794#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +0000795 /* fall through... */
Serhiy Storchakaba908c72013-06-23 20:22:09 +0300796 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000797 case '%':
798 n++;
799 break;
800 case 'd': case 'u': case 'i': case 'x':
801 (void) va_arg(count, int);
Serhiy Storchaka0e0282e2015-01-27 22:17:56 +0200802 if (width < precision)
803 width = precision;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000804 /* 20 bytes is enough to hold a 64-bit
805 integer. Decimal takes the most space.
806 This isn't enough for octal.
807 If a width is specified we need more
808 (which we allocate later). */
809 if (width < 20)
810 width = 20;
811 n += width;
812 if (abuffersize < width)
813 abuffersize = width;
814 break;
815 case 's':
816 {
817 /* UTF-8 */
Georg Brandlba68a992009-05-05 09:19:43 +0000818 const char *s = va_arg(count, const char*);
Walter Dörwalded960ac2009-05-03 22:36:33 +0000819 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
820 if (!str)
821 goto fail;
822 n += PyUnicode_GET_SIZE(str);
823 /* Remember the str and switch to the next slot */
824 *callresult++ = str;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000825 break;
826 }
827 case 'U':
828 {
829 PyObject *obj = va_arg(count, PyObject *);
830 assert(obj && PyUnicode_Check(obj));
831 n += PyUnicode_GET_SIZE(obj);
832 break;
833 }
834 case 'V':
835 {
836 PyObject *obj = va_arg(count, PyObject *);
837 const char *str = va_arg(count, const char *);
838 assert(obj || str);
839 assert(!obj || PyUnicode_Check(obj));
840 if (obj)
841 n += PyUnicode_GET_SIZE(obj);
842 else
843 n += strlen(str);
844 break;
845 }
846 case 'S':
847 {
848 PyObject *obj = va_arg(count, PyObject *);
849 PyObject *str;
850 assert(obj);
851 str = PyObject_Str(obj);
852 if (!str)
853 goto fail;
Victor Stinner2af8d2f2014-07-30 00:39:05 +0200854 n += PyString_GET_SIZE(str);
Benjamin Peterson857ce152009-01-31 16:29:18 +0000855 /* Remember the str and switch to the next slot */
856 *callresult++ = str;
857 break;
858 }
859 case 'R':
860 {
861 PyObject *obj = va_arg(count, PyObject *);
862 PyObject *repr;
863 assert(obj);
864 repr = PyObject_Repr(obj);
865 if (!repr)
866 goto fail;
867 n += PyUnicode_GET_SIZE(repr);
868 /* Remember the repr and switch to the next slot */
869 *callresult++ = repr;
870 break;
871 }
872 case 'p':
873 (void) va_arg(count, int);
874 /* maximum 64-bit pointer representation:
875 * 0xffffffffffffffff
876 * so 19 characters is enough.
877 * XXX I count 18 -- what's the extra for?
878 */
879 n += 19;
880 break;
881 default:
882 /* if we stumble upon an unknown
883 formatting code, copy the rest of
884 the format string to the output
885 string. (we cannot just skip the
886 code, since there's no way to know
887 what's in the argument list) */
888 n += strlen(p);
889 goto expand;
890 }
891 } else
892 n++;
893 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000894 expand:
Benjamin Peterson857ce152009-01-31 16:29:18 +0000895 if (abuffersize > 20) {
Serhiy Storchaka5ec0bbf2015-01-30 23:35:03 +0200896 /* add 1 for sprintf's trailing null byte */
897 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson857ce152009-01-31 16:29:18 +0000898 if (!abuffer) {
899 PyErr_NoMemory();
900 goto fail;
901 }
902 realbuffer = abuffer;
903 }
904 else
905 realbuffer = buffer;
906 /* step 4: fill the buffer */
907 /* Since we've analyzed how much space we need for the worst case,
908 we don't have to resize the string.
909 There can be no errors beyond this point. */
910 string = PyUnicode_FromUnicode(NULL, n);
911 if (!string)
912 goto fail;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000913
Benjamin Peterson857ce152009-01-31 16:29:18 +0000914 s = PyUnicode_AS_UNICODE(string);
915 callresult = callresults;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000916
Benjamin Peterson857ce152009-01-31 16:29:18 +0000917 for (f = format; *f; f++) {
918 if (*f == '%') {
919 const char* p = f++;
920 int longflag = 0;
921 int size_tflag = 0;
922 zeropad = (*f == '0');
923 /* parse the width.precision part */
924 width = 0;
925 while (isdigit((unsigned)*f))
926 width = (width*10) + *f++ - '0';
927 precision = 0;
928 if (*f == '.') {
929 f++;
930 while (isdigit((unsigned)*f))
931 precision = (precision*10) + *f++ - '0';
932 }
933 /* handle the long flag, but only for %ld and %lu.
934 others can be added when necessary. */
935 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
936 longflag = 1;
937 ++f;
938 }
939 /* handle the size_t flag. */
940 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
941 size_tflag = 1;
942 ++f;
943 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000944
Benjamin Peterson857ce152009-01-31 16:29:18 +0000945 switch (*f) {
946 case 'c':
947 *s++ = va_arg(vargs, int);
948 break;
949 case 'd':
950 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
951 if (longflag)
952 sprintf(realbuffer, fmt, va_arg(vargs, long));
953 else if (size_tflag)
954 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
955 else
956 sprintf(realbuffer, fmt, va_arg(vargs, int));
957 appendstring(realbuffer);
958 break;
959 case 'u':
960 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
961 if (longflag)
962 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
963 else if (size_tflag)
964 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
965 else
966 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
967 appendstring(realbuffer);
968 break;
969 case 'i':
970 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
971 sprintf(realbuffer, fmt, va_arg(vargs, int));
972 appendstring(realbuffer);
973 break;
974 case 'x':
975 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
976 sprintf(realbuffer, fmt, va_arg(vargs, int));
977 appendstring(realbuffer);
978 break;
979 case 's':
980 {
Walter Dörwalded960ac2009-05-03 22:36:33 +0000981 /* unused, since we already have the result */
982 (void) va_arg(vargs, char *);
983 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
984 PyUnicode_GET_SIZE(*callresult));
985 s += PyUnicode_GET_SIZE(*callresult);
986 /* We're done with the unicode()/repr() => forget it */
987 Py_DECREF(*callresult);
988 /* switch to next unicode()/repr() result */
989 ++callresult;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000990 break;
991 }
992 case 'U':
993 {
994 PyObject *obj = va_arg(vargs, PyObject *);
995 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
996 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
997 s += size;
998 break;
999 }
1000 case 'V':
1001 {
1002 PyObject *obj = va_arg(vargs, PyObject *);
1003 const char *str = va_arg(vargs, const char *);
1004 if (obj) {
1005 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1006 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1007 s += size;
1008 } else {
1009 appendstring(str);
1010 }
1011 break;
1012 }
1013 case 'S':
1014 case 'R':
1015 {
Victor Stinner2af8d2f2014-07-30 00:39:05 +02001016 const char *str = PyString_AS_STRING(*callresult);
Benjamin Peterson857ce152009-01-31 16:29:18 +00001017 /* unused, since we already have the result */
1018 (void) va_arg(vargs, PyObject *);
Victor Stinner2af8d2f2014-07-30 00:39:05 +02001019 appendstring(str);
Benjamin Peterson857ce152009-01-31 16:29:18 +00001020 /* We're done with the unicode()/repr() => forget it */
1021 Py_DECREF(*callresult);
1022 /* switch to next unicode()/repr() result */
1023 ++callresult;
1024 break;
1025 }
1026 case 'p':
1027 sprintf(buffer, "%p", va_arg(vargs, void*));
1028 /* %p is ill-defined: ensure leading 0x. */
1029 if (buffer[1] == 'X')
1030 buffer[1] = 'x';
1031 else if (buffer[1] != 'x') {
1032 memmove(buffer+2, buffer, strlen(buffer)+1);
1033 buffer[0] = '0';
1034 buffer[1] = 'x';
1035 }
1036 appendstring(buffer);
1037 break;
1038 case '%':
1039 *s++ = '%';
1040 break;
1041 default:
1042 appendstring(p);
1043 goto end;
1044 }
1045 } else
1046 *s++ = *f;
1047 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001048
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001049 end:
Benjamin Peterson857ce152009-01-31 16:29:18 +00001050 if (callresults)
1051 PyObject_Free(callresults);
1052 if (abuffer)
1053 PyObject_Free(abuffer);
1054 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1055 return string;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001056 fail:
Benjamin Peterson857ce152009-01-31 16:29:18 +00001057 if (callresults) {
1058 PyObject **callresult2 = callresults;
1059 while (callresult2 < callresult) {
1060 Py_DECREF(*callresult2);
1061 ++callresult2;
1062 }
1063 PyObject_Free(callresults);
1064 }
1065 if (abuffer)
1066 PyObject_Free(abuffer);
1067 return NULL;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001068}
1069
1070#undef appendstring
1071
1072PyObject *
1073PyUnicode_FromFormat(const char *format, ...)
1074{
Benjamin Peterson857ce152009-01-31 16:29:18 +00001075 PyObject* ret;
1076 va_list vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001077
1078#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson857ce152009-01-31 16:29:18 +00001079 va_start(vargs, format);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001080#else
Benjamin Peterson857ce152009-01-31 16:29:18 +00001081 va_start(vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001082#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00001083 ret = PyUnicode_FromFormatV(format, vargs);
1084 va_end(vargs);
1085 return ret;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001086}
1087
Martin v. Löwis18e16552006-02-15 17:27:45 +00001088Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001089 wchar_t *w,
1090 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001091{
1092 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001093 PyErr_BadInternalCall();
1094 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001095 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001096
1097 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001098 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001099 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001100
Guido van Rossumd57fd912000-03-10 22:53:23 +00001101#ifdef HAVE_USABLE_WCHAR_T
1102 memcpy(w, unicode->str, size * sizeof(wchar_t));
1103#else
1104 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001105 register Py_UNICODE *u;
1106 register Py_ssize_t i;
1107 u = PyUnicode_AS_UNICODE(unicode);
1108 for (i = size; i > 0; i--)
1109 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001110 }
1111#endif
1112
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001113 if (size > PyUnicode_GET_SIZE(unicode))
1114 return PyUnicode_GET_SIZE(unicode);
1115 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001116 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001117}
1118
1119#endif
1120
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001121PyObject *PyUnicode_FromOrdinal(int ordinal)
1122{
Hye-Shik Chang40574832004-04-06 07:24:51 +00001123 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001124
1125#ifdef Py_UNICODE_WIDE
1126 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001127 PyErr_SetString(PyExc_ValueError,
1128 "unichr() arg not in range(0x110000) "
1129 "(wide Python build)");
1130 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001131 }
1132#else
1133 if (ordinal < 0 || ordinal > 0xffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001134 PyErr_SetString(PyExc_ValueError,
1135 "unichr() arg not in range(0x10000) "
1136 "(narrow Python build)");
1137 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001138 }
1139#endif
1140
Hye-Shik Chang40574832004-04-06 07:24:51 +00001141 s[0] = (Py_UNICODE)ordinal;
1142 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001143}
1144
Guido van Rossumd57fd912000-03-10 22:53:23 +00001145PyObject *PyUnicode_FromObject(register PyObject *obj)
1146{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001147 /* XXX Perhaps we should make this API an alias of
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001148 PyObject_Unicode() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001149 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001150 Py_INCREF(obj);
1151 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001152 }
1153 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001154 /* For a Unicode subtype that's not a Unicode object,
1155 return a true Unicode object with the same data. */
1156 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1157 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001158 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001159 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1160}
1161
1162PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001163 const char *encoding,
1164 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001165{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001166 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001167 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001168 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001169
Guido van Rossumd57fd912000-03-10 22:53:23 +00001170 if (obj == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001171 PyErr_BadInternalCall();
1172 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001173 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001174
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001175#if 0
1176 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001177 that no encodings is given and then redirect to
1178 PyObject_Unicode() which then applies the additional logic for
1179 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001180
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001181 NOTE: This API should really only be used for object which
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001182 represent *encoded* Unicode !
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001183
1184 */
Benjamin Peterson857ce152009-01-31 16:29:18 +00001185 if (PyUnicode_Check(obj)) {
1186 if (encoding) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001187 PyErr_SetString(PyExc_TypeError,
1188 "decoding Unicode is not supported");
1189 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001190 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001191 return PyObject_Unicode(obj);
1192 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001193#else
1194 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001195 PyErr_SetString(PyExc_TypeError,
1196 "decoding Unicode is not supported");
1197 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001198 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001199#endif
1200
1201 /* Coerce object */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001202 if (PyString_Check(obj)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001203 s = PyString_AS_STRING(obj);
1204 len = PyString_GET_SIZE(obj);
Christian Heimes1a6387e2008-03-26 12:49:49 +00001205 }
Christian Heimes3497f942008-05-26 12:29:14 +00001206 else if (PyByteArray_Check(obj)) {
Christian Heimes1a6387e2008-03-26 12:49:49 +00001207 /* Python 2.x specific */
1208 PyErr_Format(PyExc_TypeError,
1209 "decoding bytearray is not supported");
1210 return NULL;
1211 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001212 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001213 /* Overwrite the error message with something more useful in
1214 case of a TypeError. */
1215 if (PyErr_ExceptionMatches(PyExc_TypeError))
1216 PyErr_Format(PyExc_TypeError,
1217 "coercing to Unicode: need string or buffer, "
1218 "%.80s found",
1219 Py_TYPE(obj)->tp_name);
1220 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001221 }
Tim Petersced69f82003-09-16 20:30:58 +00001222
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001223 /* Convert to Unicode */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02001224 if (len == 0)
1225 _Py_RETURN_UNICODE_EMPTY();
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001226
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02001227 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001228 return v;
1229
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001230 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001231 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001232}
1233
1234PyObject *PyUnicode_Decode(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001235 Py_ssize_t size,
1236 const char *encoding,
1237 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001238{
1239 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001240
1241 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001242 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001243
1244 /* Shortcuts for common default encodings */
1245 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001246 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001247 else if (strcmp(encoding, "latin-1") == 0)
1248 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001249#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1250 else if (strcmp(encoding, "mbcs") == 0)
1251 return PyUnicode_DecodeMBCS(s, size, errors);
1252#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001253 else if (strcmp(encoding, "ascii") == 0)
1254 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001255
1256 /* Decode via the codec registry */
1257 buffer = PyBuffer_FromMemory((void *)s, size);
1258 if (buffer == NULL)
1259 goto onError;
1260 unicode = PyCodec_Decode(buffer, encoding, errors);
1261 if (unicode == NULL)
1262 goto onError;
1263 if (!PyUnicode_Check(unicode)) {
1264 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001265 "decoder did not return an unicode object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001266 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001267 Py_DECREF(unicode);
1268 goto onError;
1269 }
1270 Py_DECREF(buffer);
1271 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001272
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001273 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001274 Py_XDECREF(buffer);
1275 return NULL;
1276}
1277
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001278PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1279 const char *encoding,
1280 const char *errors)
1281{
1282 PyObject *v;
1283
1284 if (!PyUnicode_Check(unicode)) {
1285 PyErr_BadArgument();
1286 goto onError;
1287 }
1288
1289 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001290 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001291
1292 /* Decode via the codec registry */
1293 v = PyCodec_Decode(unicode, encoding, errors);
1294 if (v == NULL)
1295 goto onError;
1296 return v;
1297
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001298 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001299 return NULL;
1300}
1301
Guido van Rossumd57fd912000-03-10 22:53:23 +00001302PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001303 Py_ssize_t size,
1304 const char *encoding,
1305 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001306{
1307 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001308
Guido van Rossumd57fd912000-03-10 22:53:23 +00001309 unicode = PyUnicode_FromUnicode(s, size);
1310 if (unicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001311 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001312 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1313 Py_DECREF(unicode);
1314 return v;
1315}
1316
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001317PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1318 const char *encoding,
1319 const char *errors)
1320{
1321 PyObject *v;
1322
1323 if (!PyUnicode_Check(unicode)) {
1324 PyErr_BadArgument();
1325 goto onError;
1326 }
1327
1328 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001329 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001330
1331 /* Encode via the codec registry */
1332 v = PyCodec_Encode(unicode, encoding, errors);
1333 if (v == NULL)
1334 goto onError;
1335 return v;
1336
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001337 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001338 return NULL;
1339}
1340
Guido van Rossumd57fd912000-03-10 22:53:23 +00001341PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1342 const char *encoding,
1343 const char *errors)
1344{
1345 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001346
Guido van Rossumd57fd912000-03-10 22:53:23 +00001347 if (!PyUnicode_Check(unicode)) {
1348 PyErr_BadArgument();
1349 goto onError;
1350 }
Fred Drakee4315f52000-05-09 19:53:39 +00001351
Tim Petersced69f82003-09-16 20:30:58 +00001352 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001353 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001354
1355 /* Shortcuts for common default encodings */
1356 if (errors == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001357 if (strcmp(encoding, "utf-8") == 0)
1358 return PyUnicode_AsUTF8String(unicode);
1359 else if (strcmp(encoding, "latin-1") == 0)
1360 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001361#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001362 else if (strcmp(encoding, "mbcs") == 0)
1363 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001364#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001365 else if (strcmp(encoding, "ascii") == 0)
1366 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001367 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001368
1369 /* Encode via the codec registry */
1370 v = PyCodec_Encode(unicode, encoding, errors);
1371 if (v == NULL)
1372 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001373 if (!PyString_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001374 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001375 "encoder did not return a string object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001376 Py_TYPE(v)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001377 Py_DECREF(v);
1378 goto onError;
1379 }
1380 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001381
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001382 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001383 return NULL;
1384}
1385
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001386PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001387 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001388{
1389 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1390
1391 if (v)
1392 return v;
1393 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1394 if (v && errors == NULL)
1395 ((PyUnicodeObject *)unicode)->defenc = v;
1396 return v;
1397}
1398
Guido van Rossumd57fd912000-03-10 22:53:23 +00001399Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1400{
1401 if (!PyUnicode_Check(unicode)) {
1402 PyErr_BadArgument();
1403 goto onError;
1404 }
1405 return PyUnicode_AS_UNICODE(unicode);
1406
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001407 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001408 return NULL;
1409}
1410
Martin v. Löwis18e16552006-02-15 17:27:45 +00001411Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001412{
1413 if (!PyUnicode_Check(unicode)) {
1414 PyErr_BadArgument();
1415 goto onError;
1416 }
1417 return PyUnicode_GET_SIZE(unicode);
1418
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001419 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001420 return -1;
1421}
1422
Thomas Wouters78890102000-07-22 19:25:51 +00001423const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001424{
1425 return unicode_default_encoding;
1426}
1427
1428int PyUnicode_SetDefaultEncoding(const char *encoding)
1429{
1430 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001431
Fred Drakee4315f52000-05-09 19:53:39 +00001432 /* Make sure the encoding is valid. As side effect, this also
1433 loads the encoding into the codec registry cache. */
1434 v = _PyCodec_Lookup(encoding);
1435 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001436 goto onError;
Fred Drakee4315f52000-05-09 19:53:39 +00001437 Py_DECREF(v);
1438 strncpy(unicode_default_encoding,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001439 encoding,
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02001440 sizeof(unicode_default_encoding) - 1);
Fred Drakee4315f52000-05-09 19:53:39 +00001441 return 0;
1442
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001443 onError:
Fred Drakee4315f52000-05-09 19:53:39 +00001444 return -1;
1445}
1446
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001447/* error handling callback helper:
1448 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001449 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001450 and adjust various state variables.
1451 return 0 on success, -1 on error
1452*/
1453
1454static
1455int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001456 const char *encoding, const char *reason,
1457 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1458 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1459 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001460{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001461 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001462
1463 PyObject *restuple = NULL;
1464 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001465 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1466 Py_ssize_t requiredsize;
1467 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001468 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001469 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001470 int res = -1;
1471
1472 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001473 *errorHandler = PyCodec_LookupError(errors);
1474 if (*errorHandler == NULL)
1475 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001476 }
1477
1478 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001479 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001480 encoding, input, insize, *startinpos, *endinpos, reason);
1481 if (*exceptionObject == NULL)
1482 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001483 }
1484 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001485 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1486 goto onError;
1487 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1488 goto onError;
1489 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1490 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001491 }
1492
1493 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1494 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001495 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001496 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00001497 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001498 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001499 }
1500 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001501 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001502 if (newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001503 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001504 if (newpos<0 || newpos>insize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001505 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1506 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001507 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001508
1509 /* need more space? (at least enough for what we
1510 have+the replacement+the rest of the string (starting
1511 at the new input position), so we won't have to check space
1512 when there are no errors in the rest of the string) */
1513 repptr = PyUnicode_AS_UNICODE(repunicode);
1514 repsize = PyUnicode_GET_SIZE(repunicode);
Benjamin Petersoned4c1302014-09-29 18:18:57 -04001515 requiredsize = *outpos;
1516 if (requiredsize > PY_SSIZE_T_MAX - repsize)
1517 goto overflow;
1518 requiredsize += repsize;
1519 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
1520 goto overflow;
1521 requiredsize += insize - newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001522 if (requiredsize > outsize) {
Benjamin Petersoned4c1302014-09-29 18:18:57 -04001523 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001524 requiredsize = 2*outsize;
1525 if (_PyUnicode_Resize(output, requiredsize) < 0)
1526 goto onError;
1527 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001528 }
1529 *endinpos = newpos;
1530 *inptr = input + newpos;
1531 Py_UNICODE_COPY(*outptr, repptr, repsize);
1532 *outptr += repsize;
1533 *outpos += repsize;
1534 /* we made it! */
1535 res = 0;
1536
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001537 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001538 Py_XDECREF(restuple);
1539 return res;
Benjamin Petersoned4c1302014-09-29 18:18:57 -04001540
1541 overflow:
1542 PyErr_SetString(PyExc_OverflowError,
1543 "decoded result is too long for a Python string");
1544 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001545}
1546
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001547/* --- UTF-7 Codec -------------------------------------------------------- */
1548
Antoine Pitrou653dece2009-05-04 18:32:32 +00001549/* See RFC2152 for details. We encode conservatively and decode liberally. */
1550
1551/* Three simple macros defining base-64. */
1552
/* Is c a base-64 character?
 *
 * The test is spelled out with explicit ASCII ranges instead of calling
 * isalnum(): isalnum() is locale-dependent (in some locales it accepts
 * bytes >= 0x80, which are never valid base-64 digits) and its behaviour
 * is undefined for arguments that are neither EOF nor representable as
 * unsigned char.  Note that c is evaluated more than once. */

#define IS_BASE64(c)                    \
    (((c) >= 'A' && (c) <= 'Z') ||      \
     ((c) >= 'a' && (c) <= 'z') ||      \
     ((c) >= '0' && (c) <= '9') ||      \
     (c) == '+' || (c) == '/')
1557
/* Map a base-64 digit c to its 6-bit value (0..63).  The caller must
   already know that c is a base-64 character: anything that is not a
   letter, a digit or '+' is treated as '/' and yields 63. */

#define FROM_BASE64(c)                                  \
    (('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :           \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) == '+') ? 62 : 63)
1565
/* Base-64 digit that encodes the low 6 bits of n; any higher bits of n
   are masked off before the alphabet lookup. */

#define TO_BASE64(n) \
    (*("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/" + ((n) & 0x3f)))
1570
/* DECODE_DIRECT: true when a byte met outside a shift sequence in a
 * UTF-7 string stands for itself.  Decoding is permissive: every value
 * up to 127 qualifies except '+', which opens a base64 section. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) < 128)
1578
1579/* The UTF-7 encoder treats ASCII characters differently according to
1580 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
1581 * the above). See RFC2152. This array identifies these different
1582 * sets:
1583 * 0 : "Set D"
1584 * alphanumeric and '(),-./:?
1585 * 1 : "Set O"
1586 * !"#$%&*;<=>@[]^_`{|}
1587 * 2 : "whitespace"
1588 * ht nl cr sp
1589 * 3 : special (must be base64 encoded)
1590 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
1591 */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001592
/* Category (0 = Set D, 1 = Set O, 2 = whitespace, 3 = special) of each
   of the 128 ASCII codes, indexed by character code.  The table is only
   ever read, so it is declared const. */
static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
1612
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 *   c        - character code to classify (evaluated several times);
 *              NUL and codes >= 128 are never encoded directly
 *   directO  - non-zero if "Set O" characters may be written as-is
 *   directWS - non-zero if whitespace may be written as-is
 *
 * The flag arguments are parenthesized in the expansion so that
 * compound expressions (e.g. `a || b`) keep their meaning. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001624
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001625PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001626 Py_ssize_t size,
1627 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001628{
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001629 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1630}
1631
Antoine Pitrou653dece2009-05-04 18:32:32 +00001632/* The decoder. The only state we preserve is our read position,
1633 * i.e. how many characters we have consumed. So if we end in the
1634 * middle of a shift sequence we have to back off the read position
1635 * and the output to the beginning of the sequence, otherwise we lose
1636 * all the shift state (seen bits, number of bits seen, high
1637 * surrogate). */
1638
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001639PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001640 Py_ssize_t size,
1641 const char *errors,
1642 Py_ssize_t *consumed)
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001643{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001644 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001645 Py_ssize_t startinpos;
1646 Py_ssize_t endinpos;
1647 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001648 const char *e;
1649 PyUnicodeObject *unicode;
1650 Py_UNICODE *p;
1651 const char *errmsg = "";
1652 int inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001653 Py_UNICODE *shiftOutStart;
1654 unsigned int base64bits = 0;
1655 unsigned long base64buffer = 0;
1656 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001657 PyObject *errorHandler = NULL;
1658 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001659
1660 unicode = _PyUnicode_New(size);
1661 if (!unicode)
1662 return NULL;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001663 if (size == 0) {
1664 if (consumed)
1665 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001666 return (PyObject *)unicode;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001667 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001668
1669 p = unicode->str;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001670 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001671 e = s + size;
1672
1673 while (s < e) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001674 Py_UNICODE ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001675
Antoine Pitrou653dece2009-05-04 18:32:32 +00001676 if (inShift) { /* in a base-64 section */
1677 if (IS_BASE64(ch)) { /* consume a base-64 character */
1678 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1679 base64bits += 6;
1680 s++;
1681 if (base64bits >= 16) {
1682 /* we have enough bits for a UTF-16 value */
1683 Py_UNICODE outCh = (Py_UNICODE)
1684 (base64buffer >> (base64bits-16));
1685 base64bits -= 16;
1686 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchakaf1056722013-10-19 20:37:49 +03001687 assert(outCh <= 0xffff);
Antoine Pitrou653dece2009-05-04 18:32:32 +00001688 if (surrogate) {
1689 /* expecting a second surrogate */
1690 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1691#ifdef Py_UNICODE_WIDE
1692 *p++ = (((surrogate & 0x3FF)<<10)
1693 | (outCh & 0x3FF)) + 0x10000;
1694#else
1695 *p++ = surrogate;
1696 *p++ = outCh;
1697#endif
1698 surrogate = 0;
Antoine Pitrou30402542011-11-15 01:49:40 +01001699 continue;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001700 }
1701 else {
Antoine Pitrou30402542011-11-15 01:49:40 +01001702 *p++ = surrogate;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001703 surrogate = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001704 }
1705 }
Antoine Pitrou30402542011-11-15 01:49:40 +01001706 if (outCh >= 0xD800 && outCh <= 0xDBFF) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001707 /* first surrogate */
1708 surrogate = outCh;
1709 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001710 else {
1711 *p++ = outCh;
1712 }
1713 }
1714 }
1715 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001716 inShift = 0;
1717 s++;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001718 if (surrogate) {
Antoine Pitrou30402542011-11-15 01:49:40 +01001719 *p++ = surrogate;
1720 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001721 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001722 if (base64bits > 0) { /* left-over bits */
1723 if (base64bits >= 6) {
1724 /* We've seen at least one base-64 character */
1725 errmsg = "partial character in shift sequence";
1726 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001727 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001728 else {
1729 /* Some bits remain; they should be zero */
1730 if (base64buffer != 0) {
1731 errmsg = "non-zero padding bits in shift sequence";
1732 goto utf7Error;
1733 }
1734 }
1735 }
1736 if (ch != '-') {
1737 /* '-' is absorbed; other terminating
1738 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001739 *p++ = ch;
1740 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001741 }
1742 }
1743 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001744 startinpos = s-starts;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001745 s++; /* consume '+' */
1746 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001747 s++;
1748 *p++ = '+';
Antoine Pitrou653dece2009-05-04 18:32:32 +00001749 }
1750 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001751 inShift = 1;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001752 shiftOutStart = p;
1753 base64bits = 0;
Serhiy Storchakaf1056722013-10-19 20:37:49 +03001754 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001755 }
1756 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001757 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001758 *p++ = ch;
1759 s++;
1760 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001761 else {
1762 startinpos = s-starts;
1763 s++;
1764 errmsg = "unexpected special character";
1765 goto utf7Error;
1766 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001767 continue;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001768utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001769 outpos = p-PyUnicode_AS_UNICODE(unicode);
1770 endinpos = s-starts;
1771 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001772 errors, &errorHandler,
1773 "utf7", errmsg,
1774 starts, size, &startinpos, &endinpos, &exc, &s,
1775 &unicode, &outpos, &p))
1776 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001777 }
1778
Antoine Pitrou653dece2009-05-04 18:32:32 +00001779 /* end of string */
1780
1781 if (inShift && !consumed) { /* in shift sequence, no more to follow */
1782 /* if we're in an inconsistent state, that's an error */
1783 if (surrogate ||
1784 (base64bits >= 6) ||
1785 (base64bits > 0 && base64buffer != 0)) {
1786 outpos = p-PyUnicode_AS_UNICODE(unicode);
1787 endinpos = size;
1788 if (unicode_decode_call_errorhandler(
1789 errors, &errorHandler,
1790 "utf7", "unterminated shift sequence",
1791 starts, size, &startinpos, &endinpos, &exc, &s,
1792 &unicode, &outpos, &p))
1793 goto onError;
1794 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001795 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001796
1797 /* return state */
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001798 if (consumed) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001799 if (inShift) {
1800 p = shiftOutStart; /* back off output */
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001801 *consumed = startinpos;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001802 }
1803 else {
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001804 *consumed = s-starts;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001805 }
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001806 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001807
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001808 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001809 goto onError;
1810
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001811 Py_XDECREF(errorHandler);
1812 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001813 return (PyObject *)unicode;
1814
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001815 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001816 Py_XDECREF(errorHandler);
1817 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001818 Py_DECREF(unicode);
1819 return NULL;
1820}
1821
1822
1823PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001824 Py_ssize_t size,
Antoine Pitrou653dece2009-05-04 18:32:32 +00001825 int base64SetO,
1826 int base64WhiteSpace,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001827 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001828{
1829 PyObject *v;
1830 /* It might be possible to tighten this worst case */
Alexandre Vassalottifd009162009-07-07 02:17:30 +00001831 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001832 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001833 Py_ssize_t i = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001834 unsigned int base64bits = 0;
1835 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001836 char * out;
1837 char * start;
1838
Alexandre Vassalottifd009162009-07-07 02:17:30 +00001839 if (allocated / 8 != size)
Neal Norwitze7d8be82008-07-31 17:17:14 +00001840 return PyErr_NoMemory();
1841
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001842 if (size == 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00001843 return PyString_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001844
Antoine Pitrou653dece2009-05-04 18:32:32 +00001845 v = PyString_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001846 if (v == NULL)
1847 return NULL;
1848
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001849 start = out = PyString_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001850 for (;i < size; ++i) {
1851 Py_UNICODE ch = s[i];
1852
Antoine Pitrou653dece2009-05-04 18:32:32 +00001853 if (inShift) {
1854 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1855 /* shifting out */
1856 if (base64bits) { /* output remaining bits */
1857 *out++ = TO_BASE64(base64buffer << (6-base64bits));
1858 base64buffer = 0;
1859 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001860 }
1861 inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001862 /* Characters not in the BASE64 set implicitly unshift the sequence
1863 so no '-' is required, except if the character is itself a '-' */
1864 if (IS_BASE64(ch) || ch == '-') {
1865 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001866 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001867 *out++ = (char) ch;
1868 }
1869 else {
1870 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00001871 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001872 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001873 else { /* not in a shift sequence */
1874 if (ch == '+') {
1875 *out++ = '+';
1876 *out++ = '-';
1877 }
1878 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1879 *out++ = (char) ch;
1880 }
1881 else {
1882 *out++ = '+';
1883 inShift = 1;
1884 goto encode_char;
1885 }
1886 }
1887 continue;
1888encode_char:
1889#ifdef Py_UNICODE_WIDE
1890 if (ch >= 0x10000) {
1891 /* code first surrogate */
1892 base64bits += 16;
1893 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
1894 while (base64bits >= 6) {
1895 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1896 base64bits -= 6;
1897 }
1898 /* prepare second surrogate */
1899 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
1900 }
1901#endif
1902 base64bits += 16;
1903 base64buffer = (base64buffer << 16) | ch;
1904 while (base64bits >= 6) {
1905 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1906 base64bits -= 6;
1907 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001908 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001909 if (base64bits)
1910 *out++= TO_BASE64(base64buffer << (6-base64bits) );
1911 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001912 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001913
Benjamin Petersonbea424a2010-04-03 00:57:33 +00001914 if (_PyString_Resize(&v, out - start))
1915 return NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001916 return v;
1917}
1918
Antoine Pitrou653dece2009-05-04 18:32:32 +00001919#undef IS_BASE64
1920#undef FROM_BASE64
1921#undef TO_BASE64
1922#undef DECODE_DIRECT
1923#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001924
Guido van Rossumd57fd912000-03-10 22:53:23 +00001925/* --- UTF-8 Codec -------------------------------------------------------- */
1926
/* The table is only ever read, so it is declared const. */
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
1948
Guido van Rossumd57fd912000-03-10 22:53:23 +00001949PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001950 Py_ssize_t size,
1951 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001952{
Walter Dörwald69652032004-09-07 20:24:22 +00001953 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1954}
1955
1956PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001957 Py_ssize_t size,
1958 const char *errors,
1959 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001960{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001961 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001962 int n;
Ezio Melottie57e50c2010-06-05 17:51:07 +00001963 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001964 Py_ssize_t startinpos;
1965 Py_ssize_t endinpos;
1966 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001967 const char *e;
1968 PyUnicodeObject *unicode;
1969 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001970 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001971 PyObject *errorHandler = NULL;
1972 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001973
1974 /* Note: size will always be longer than the resulting Unicode
1975 character count */
1976 unicode = _PyUnicode_New(size);
1977 if (!unicode)
1978 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001979 if (size == 0) {
1980 if (consumed)
1981 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001982 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001983 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001984
1985 /* Unpack UTF-8 encoded data */
1986 p = unicode->str;
1987 e = s + size;
1988
1989 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001990 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001991
1992 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001993 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001994 s++;
1995 continue;
1996 }
1997
1998 n = utf8_code_length[ch];
1999
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002000 if (s + n > e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002001 if (consumed)
2002 break;
2003 else {
2004 errmsg = "unexpected end of data";
2005 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00002006 endinpos = startinpos+1;
2007 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2008 endinpos++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002009 goto utf8Error;
2010 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00002011 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002012
2013 switch (n) {
2014
2015 case 0:
Ezio Melottie57e50c2010-06-05 17:51:07 +00002016 errmsg = "invalid start byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002017 startinpos = s-starts;
2018 endinpos = startinpos+1;
2019 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002020
2021 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002022 errmsg = "internal error";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002023 startinpos = s-starts;
2024 endinpos = startinpos+1;
2025 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002026
2027 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002028 if ((s[1] & 0xc0) != 0x80) {
Ezio Melottie57e50c2010-06-05 17:51:07 +00002029 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002030 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00002031 endinpos = startinpos + 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002032 goto utf8Error;
2033 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002034 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melottie57e50c2010-06-05 17:51:07 +00002035 assert ((ch > 0x007F) && (ch <= 0x07FF));
2036 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002037 break;
2038
2039 case 3:
Ezio Melottie57e50c2010-06-05 17:51:07 +00002040 /* XXX: surrogates shouldn't be valid UTF-8!
2041 see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2042 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
2043 Uncomment the 2 lines below to make them invalid,
Serhiy Storchakae8c9e142015-01-18 11:42:50 +02002044 code points: d800-dfff; UTF-8: \xed\xa0\x80-\xed\xbf\xbf. */
Tim Petersced69f82003-09-16 20:30:58 +00002045 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melottie57e50c2010-06-05 17:51:07 +00002046 (s[2] & 0xc0) != 0x80 ||
2047 ((unsigned char)s[0] == 0xE0 &&
2048 (unsigned char)s[1] < 0xA0)/* ||
2049 ((unsigned char)s[0] == 0xED &&
2050 (unsigned char)s[1] > 0x9F)*/) {
2051 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002052 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00002053 endinpos = startinpos + 1;
2054
2055 /* if s[1] first two bits are 1 and 0, then the invalid
2056 continuation byte is s[2], so increment endinpos by 1,
2057 if not, s[1] is invalid and endinpos doesn't need to
2058 be incremented. */
2059 if ((s[1] & 0xC0) == 0x80)
2060 endinpos++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002061 goto utf8Error;
2062 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002063 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melottie57e50c2010-06-05 17:51:07 +00002064 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2065 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002066 break;
2067
2068 case 4:
2069 if ((s[1] & 0xc0) != 0x80 ||
2070 (s[2] & 0xc0) != 0x80 ||
Ezio Melottie57e50c2010-06-05 17:51:07 +00002071 (s[3] & 0xc0) != 0x80 ||
2072 ((unsigned char)s[0] == 0xF0 &&
2073 (unsigned char)s[1] < 0x90) ||
2074 ((unsigned char)s[0] == 0xF4 &&
2075 (unsigned char)s[1] > 0x8F)) {
2076 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002077 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00002078 endinpos = startinpos + 1;
2079 if ((s[1] & 0xC0) == 0x80) {
2080 endinpos++;
2081 if ((s[2] & 0xC0) == 0x80)
2082 endinpos++;
2083 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002084 goto utf8Error;
2085 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002086 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melottie57e50c2010-06-05 17:51:07 +00002087 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2088 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2089
Fredrik Lundh8f455852001-06-27 18:59:43 +00002090#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002091 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002092#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002093 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002094
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002095 /* translate from 10000..10FFFF to 0..FFFF */
2096 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002097
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002098 /* high surrogate = top 10 bits added to D800 */
2099 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002100
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002101 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002102 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002103#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002104 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002105 }
2106 s += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002107 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002108
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002109 utf8Error:
2110 outpos = p-PyUnicode_AS_UNICODE(unicode);
2111 if (unicode_decode_call_errorhandler(
2112 errors, &errorHandler,
2113 "utf8", errmsg,
2114 starts, size, &startinpos, &endinpos, &exc, &s,
2115 &unicode, &outpos, &p))
2116 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002117 }
Walter Dörwald69652032004-09-07 20:24:22 +00002118 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002119 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002120
2121 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002122 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002123 goto onError;
2124
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002125 Py_XDECREF(errorHandler);
2126 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002127 return (PyObject *)unicode;
2128
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002129 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002130 Py_XDECREF(errorHandler);
2131 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002132 Py_DECREF(unicode);
2133 return NULL;
2134}
2135
Tim Peters602f7402002-04-27 18:03:26 +00002136/* Allocation strategy: if the string is short, convert into a stack buffer
2137 and allocate exactly as much space needed at the end. Else allocate the
2138 maximum possible needed (4 result bytes per Unicode character), and return
2139 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002140*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002141PyObject *
2142PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002143 Py_ssize_t size,
2144 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002145{
Tim Peters602f7402002-04-27 18:03:26 +00002146#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002147
Martin v. Löwis18e16552006-02-15 17:27:45 +00002148 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00002149 PyObject *v; /* result string object */
2150 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002151 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002152 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002153 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002154
Tim Peters602f7402002-04-27 18:03:26 +00002155 assert(s != NULL);
2156 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002157
Tim Peters602f7402002-04-27 18:03:26 +00002158 if (size <= MAX_SHORT_UNICHARS) {
2159 /* Write into the stack buffer; nallocated can't overflow.
2160 * At the end, we'll allocate exactly as much heap space as it
2161 * turns out we need.
2162 */
2163 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2164 v = NULL; /* will allocate after we're done */
2165 p = stackbuf;
2166 }
2167 else {
2168 /* Overallocate on the heap, and give the excess back at the end. */
2169 nallocated = size * 4;
2170 if (nallocated / 4 != size) /* overflow! */
2171 return PyErr_NoMemory();
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002172 v = PyString_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00002173 if (v == NULL)
2174 return NULL;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002175 p = PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002176 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002177
Tim Peters602f7402002-04-27 18:03:26 +00002178 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002179 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002180
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002181 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002182 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002183 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002184
Guido van Rossumd57fd912000-03-10 22:53:23 +00002185 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002186 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002187 *p++ = (char)(0xc0 | (ch >> 6));
2188 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002189 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002190 else {
Tim Peters602f7402002-04-27 18:03:26 +00002191 /* Encode UCS2 Unicode ordinals */
2192 if (ch < 0x10000) {
2193 /* Special case: check for high surrogate */
2194 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2195 Py_UCS4 ch2 = s[i];
2196 /* Check for low surrogate and combine the two to
2197 form a UCS4 value */
2198 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002199 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002200 i++;
2201 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002202 }
Tim Peters602f7402002-04-27 18:03:26 +00002203 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002204 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002205 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002206 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2207 *p++ = (char)(0x80 | (ch & 0x3f));
2208 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00002209 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002210 encodeUCS4:
Tim Peters602f7402002-04-27 18:03:26 +00002211 /* Encode UCS4 Unicode ordinals */
2212 *p++ = (char)(0xf0 | (ch >> 18));
2213 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2214 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2215 *p++ = (char)(0x80 | (ch & 0x3f));
2216 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002217 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002218
Tim Peters602f7402002-04-27 18:03:26 +00002219 if (v == NULL) {
2220 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002221 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002222 assert(nneeded <= nallocated);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002223 v = PyString_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002224 }
2225 else {
Benjamin Peterson857ce152009-01-31 16:29:18 +00002226 /* Cut back to size actually needed. */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002227 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002228 assert(nneeded <= nallocated);
Benjamin Petersonbea424a2010-04-03 00:57:33 +00002229 if (_PyString_Resize(&v, nneeded))
2230 return NULL;
Tim Peters602f7402002-04-27 18:03:26 +00002231 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002232 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002233
Tim Peters602f7402002-04-27 18:03:26 +00002234#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002235}
2236
Guido van Rossumd57fd912000-03-10 22:53:23 +00002237PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2238{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002239 if (!PyUnicode_Check(unicode)) {
2240 PyErr_BadArgument();
2241 return NULL;
2242 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002243 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002244 PyUnicode_GET_SIZE(unicode),
2245 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002246}
2247
/* --- UTF-32 Codec ------------------------------------------------------- */
2249
2250PyObject *
2251PyUnicode_DecodeUTF32(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002252 Py_ssize_t size,
2253 const char *errors,
2254 int *byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002255{
2256 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2257}
2258
2259PyObject *
2260PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002261 Py_ssize_t size,
2262 const char *errors,
2263 int *byteorder,
2264 Py_ssize_t *consumed)
Walter Dörwald6e390802007-08-17 16:41:28 +00002265{
2266 const char *starts = s;
2267 Py_ssize_t startinpos;
2268 Py_ssize_t endinpos;
2269 Py_ssize_t outpos;
2270 PyUnicodeObject *unicode;
2271 Py_UNICODE *p;
2272#ifndef Py_UNICODE_WIDE
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002273 int pairs = 0;
Benjamin Peterson8e5effa2010-06-12 17:47:06 +00002274 const unsigned char *qq;
Walter Dörwald6e390802007-08-17 16:41:28 +00002275#else
2276 const int pairs = 0;
2277#endif
Benjamin Peterson8e5effa2010-06-12 17:47:06 +00002278 const unsigned char *q, *e;
Walter Dörwald6e390802007-08-17 16:41:28 +00002279 int bo = 0; /* assume native ordering by default */
2280 const char *errmsg = "";
Walter Dörwald20b40d32007-08-17 16:52:50 +00002281 /* Offsets from q for retrieving bytes in the right order. */
2282#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2283 int iorder[] = {0, 1, 2, 3};
2284#else
2285 int iorder[] = {3, 2, 1, 0};
2286#endif
Walter Dörwald9ab80a92007-08-17 16:58:43 +00002287 PyObject *errorHandler = NULL;
2288 PyObject *exc = NULL;
Chris Jerdonekad4b0002012-10-07 20:37:54 -07002289
Walter Dörwald6e390802007-08-17 16:41:28 +00002290 q = (unsigned char *)s;
2291 e = q + size;
2292
2293 if (byteorder)
2294 bo = *byteorder;
2295
2296 /* Check for BOM marks (U+FEFF) in the input and adjust current
2297 byte order setting accordingly. In native mode, the leading BOM
2298 mark is skipped, in all other modes, it is copied to the output
2299 stream as-is (giving a ZWNBSP character). */
2300 if (bo == 0) {
2301 if (size >= 4) {
2302 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002303 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002304#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002305 if (bom == 0x0000FEFF) {
2306 q += 4;
2307 bo = -1;
2308 }
2309 else if (bom == 0xFFFE0000) {
2310 q += 4;
2311 bo = 1;
2312 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002313#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002314 if (bom == 0x0000FEFF) {
2315 q += 4;
2316 bo = 1;
2317 }
2318 else if (bom == 0xFFFE0000) {
2319 q += 4;
2320 bo = -1;
2321 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002322#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002323 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002324 }
2325
2326 if (bo == -1) {
2327 /* force LE */
2328 iorder[0] = 0;
2329 iorder[1] = 1;
2330 iorder[2] = 2;
2331 iorder[3] = 3;
2332 }
2333 else if (bo == 1) {
2334 /* force BE */
2335 iorder[0] = 3;
2336 iorder[1] = 2;
2337 iorder[2] = 1;
2338 iorder[3] = 0;
2339 }
2340
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002341 /* On narrow builds we split characters outside the BMP into two
Serhiy Storchakae8c9e142015-01-18 11:42:50 +02002342 code points => count how much extra space we need. */
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002343#ifndef Py_UNICODE_WIDE
Serhiy Storchakac9631a12013-01-08 22:43:18 +02002344 for (qq = q; e - qq >= 4; qq += 4)
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002345 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2346 pairs++;
2347#endif
2348
2349 /* This might be one to much, because of a BOM */
2350 unicode = _PyUnicode_New((size+3)/4+pairs);
2351 if (!unicode)
2352 return NULL;
2353 if (size == 0)
2354 return (PyObject *)unicode;
2355
2356 /* Unpack UTF-32 encoded data */
2357 p = unicode->str;
2358
Walter Dörwald6e390802007-08-17 16:41:28 +00002359 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002360 Py_UCS4 ch;
2361 /* remaining bytes at the end? (size should be divisible by 4) */
2362 if (e-q<4) {
2363 if (consumed)
2364 break;
2365 errmsg = "truncated data";
2366 startinpos = ((const char *)q)-starts;
2367 endinpos = ((const char *)e)-starts;
2368 goto utf32Error;
2369 /* The remaining input chars are ignored if the callback
2370 chooses to skip the input */
2371 }
2372 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2373 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002374
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002375 if (ch >= 0x110000)
2376 {
Serhiy Storchakae8c9e142015-01-18 11:42:50 +02002377 errmsg = "code point not in range(0x110000)";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002378 startinpos = ((const char *)q)-starts;
2379 endinpos = startinpos+4;
2380 goto utf32Error;
2381 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002382#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002383 if (ch >= 0x10000)
2384 {
2385 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2386 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2387 }
2388 else
Walter Dörwald6e390802007-08-17 16:41:28 +00002389#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002390 *p++ = ch;
2391 q += 4;
2392 continue;
2393 utf32Error:
2394 outpos = p-PyUnicode_AS_UNICODE(unicode);
2395 if (unicode_decode_call_errorhandler(
2396 errors, &errorHandler,
2397 "utf32", errmsg,
Georg Brandle9741f32009-09-17 11:28:09 +00002398 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002399 &unicode, &outpos, &p))
2400 goto onError;
Walter Dörwald6e390802007-08-17 16:41:28 +00002401 }
2402
2403 if (byteorder)
2404 *byteorder = bo;
2405
2406 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002407 *consumed = (const char *)q-starts;
Walter Dörwald6e390802007-08-17 16:41:28 +00002408
2409 /* Adjust length */
2410 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2411 goto onError;
2412
2413 Py_XDECREF(errorHandler);
2414 Py_XDECREF(exc);
2415 return (PyObject *)unicode;
2416
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002417 onError:
Walter Dörwald6e390802007-08-17 16:41:28 +00002418 Py_DECREF(unicode);
2419 Py_XDECREF(errorHandler);
2420 Py_XDECREF(exc);
2421 return NULL;
2422}
2423
2424PyObject *
2425PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002426 Py_ssize_t size,
2427 const char *errors,
2428 int byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002429{
2430 PyObject *v;
2431 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002432 Py_ssize_t nsize, bytesize;
Walter Dörwald6e390802007-08-17 16:41:28 +00002433#ifndef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002434 Py_ssize_t i, pairs;
Walter Dörwald6e390802007-08-17 16:41:28 +00002435#else
2436 const int pairs = 0;
2437#endif
2438 /* Offsets from p for storing byte pairs in the right order. */
2439#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2440 int iorder[] = {0, 1, 2, 3};
2441#else
2442 int iorder[] = {3, 2, 1, 0};
2443#endif
2444
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002445#define STORECHAR(CH) \
2446 do { \
2447 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2448 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2449 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2450 p[iorder[0]] = (CH) & 0xff; \
2451 p += 4; \
Walter Dörwald6e390802007-08-17 16:41:28 +00002452 } while(0)
2453
Serhiy Storchakae8c9e142015-01-18 11:42:50 +02002454 /* In narrow builds we can output surrogate pairs as one code point,
Walter Dörwald6e390802007-08-17 16:41:28 +00002455 so we need less space. */
2456#ifndef Py_UNICODE_WIDE
2457 for (i = pairs = 0; i < size-1; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002458 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2459 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2460 pairs++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002461#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002462 nsize = (size - pairs + (byteorder == 0));
2463 bytesize = nsize * 4;
2464 if (bytesize / 4 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002465 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002466 v = PyString_FromStringAndSize(NULL, bytesize);
Walter Dörwald6e390802007-08-17 16:41:28 +00002467 if (v == NULL)
2468 return NULL;
2469
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002470 p = (unsigned char *)PyString_AS_STRING(v);
Walter Dörwald6e390802007-08-17 16:41:28 +00002471 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002472 STORECHAR(0xFEFF);
Walter Dörwald6e390802007-08-17 16:41:28 +00002473 if (size == 0)
2474 return v;
2475
2476 if (byteorder == -1) {
2477 /* force LE */
2478 iorder[0] = 0;
2479 iorder[1] = 1;
2480 iorder[2] = 2;
2481 iorder[3] = 3;
2482 }
2483 else if (byteorder == 1) {
2484 /* force BE */
2485 iorder[0] = 3;
2486 iorder[1] = 2;
2487 iorder[2] = 1;
2488 iorder[3] = 0;
2489 }
2490
2491 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002492 Py_UCS4 ch = *s++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002493#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002494 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2495 Py_UCS4 ch2 = *s;
2496 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2497 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2498 s++;
2499 size--;
2500 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00002501 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002502#endif
2503 STORECHAR(ch);
2504 }
2505 return v;
2506#undef STORECHAR
2507}
2508
2509PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2510{
2511 if (!PyUnicode_Check(unicode)) {
2512 PyErr_BadArgument();
2513 return NULL;
2514 }
2515 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002516 PyUnicode_GET_SIZE(unicode),
2517 NULL,
2518 0);
Walter Dörwald6e390802007-08-17 16:41:28 +00002519}
2520
/* --- UTF-16 Codec ------------------------------------------------------- */
2522
Tim Peters772747b2001-08-09 22:21:55 +00002523PyObject *
2524PyUnicode_DecodeUTF16(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002525 Py_ssize_t size,
2526 const char *errors,
2527 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002528{
Walter Dörwald69652032004-09-07 20:24:22 +00002529 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2530}
2531
2532PyObject *
2533PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002534 Py_ssize_t size,
2535 const char *errors,
2536 int *byteorder,
2537 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002538{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002539 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002540 Py_ssize_t startinpos;
2541 Py_ssize_t endinpos;
2542 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002543 PyUnicodeObject *unicode;
2544 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002545 const unsigned char *q, *e;
2546 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002547 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002548 /* Offsets from q for retrieving byte pairs in the right order. */
2549#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2550 int ihi = 1, ilo = 0;
2551#else
2552 int ihi = 0, ilo = 1;
2553#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002554 PyObject *errorHandler = NULL;
2555 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002556
2557 /* Note: size will always be longer than the resulting Unicode
2558 character count */
2559 unicode = _PyUnicode_New(size);
2560 if (!unicode)
2561 return NULL;
2562 if (size == 0)
2563 return (PyObject *)unicode;
2564
2565 /* Unpack UTF-16 encoded data */
2566 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002567 q = (unsigned char *)s;
2568 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002569
2570 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002571 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002572
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002573 /* Check for BOM marks (U+FEFF) in the input and adjust current
2574 byte order setting accordingly. In native mode, the leading BOM
2575 mark is skipped, in all other modes, it is copied to the output
2576 stream as-is (giving a ZWNBSP character). */
2577 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002578 if (size >= 2) {
2579 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002580#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002581 if (bom == 0xFEFF) {
2582 q += 2;
2583 bo = -1;
2584 }
2585 else if (bom == 0xFFFE) {
2586 q += 2;
2587 bo = 1;
2588 }
Tim Petersced69f82003-09-16 20:30:58 +00002589#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002590 if (bom == 0xFEFF) {
2591 q += 2;
2592 bo = 1;
2593 }
2594 else if (bom == 0xFFFE) {
2595 q += 2;
2596 bo = -1;
2597 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002598#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002599 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002600 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002601
Tim Peters772747b2001-08-09 22:21:55 +00002602 if (bo == -1) {
2603 /* force LE */
2604 ihi = 1;
2605 ilo = 0;
2606 }
2607 else if (bo == 1) {
2608 /* force BE */
2609 ihi = 0;
2610 ilo = 1;
2611 }
2612
2613 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002614 Py_UNICODE ch;
2615 /* remaining bytes at the end? (size should be even) */
2616 if (e-q<2) {
2617 if (consumed)
2618 break;
2619 errmsg = "truncated data";
2620 startinpos = ((const char *)q)-starts;
2621 endinpos = ((const char *)e)-starts;
2622 goto utf16Error;
2623 /* The remaining input chars are ignored if the callback
2624 chooses to skip the input */
2625 }
2626 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002627
Benjamin Peterson857ce152009-01-31 16:29:18 +00002628 q += 2;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002629
2630 if (ch < 0xD800 || ch > 0xDFFF) {
2631 *p++ = ch;
2632 continue;
2633 }
2634
2635 /* UTF-16 code pair: */
Antoine Pitrou715a63b2012-07-21 00:52:06 +02002636 if (e - q < 2) {
Serhiy Storchakac4b82c02013-01-08 23:12:00 +02002637 q -= 2;
2638 if (consumed)
2639 break;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002640 errmsg = "unexpected end of data";
Serhiy Storchakac4b82c02013-01-08 23:12:00 +02002641 startinpos = ((const char *)q)-starts;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002642 endinpos = ((const char *)e)-starts;
2643 goto utf16Error;
2644 }
2645 if (0xD800 <= ch && ch <= 0xDBFF) {
2646 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2647 q += 2;
2648 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002649#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002650 *p++ = ch;
2651 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002652#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002653 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002654#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002655 continue;
2656 }
2657 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002658 errmsg = "illegal UTF-16 surrogate";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002659 startinpos = (((const char *)q)-4)-starts;
2660 endinpos = startinpos+2;
2661 goto utf16Error;
2662 }
2663
Benjamin Peterson857ce152009-01-31 16:29:18 +00002664 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002665 errmsg = "illegal encoding";
2666 startinpos = (((const char *)q)-2)-starts;
2667 endinpos = startinpos+2;
2668 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002669
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002670 utf16Error:
2671 outpos = p-PyUnicode_AS_UNICODE(unicode);
2672 if (unicode_decode_call_errorhandler(
2673 errors, &errorHandler,
2674 "utf16", errmsg,
2675 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2676 &unicode, &outpos, &p))
2677 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002678 }
2679
2680 if (byteorder)
2681 *byteorder = bo;
2682
Walter Dörwald69652032004-09-07 20:24:22 +00002683 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002684 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00002685
Guido van Rossumd57fd912000-03-10 22:53:23 +00002686 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002687 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002688 goto onError;
2689
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002690 Py_XDECREF(errorHandler);
2691 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002692 return (PyObject *)unicode;
2693
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002694 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002695 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002696 Py_XDECREF(errorHandler);
2697 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002698 return NULL;
2699}
2700
Tim Peters772747b2001-08-09 22:21:55 +00002701PyObject *
2702PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002703 Py_ssize_t size,
2704 const char *errors,
2705 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002706{
2707 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002708 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002709 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002710#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002711 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002712#else
2713 const int pairs = 0;
2714#endif
Tim Peters772747b2001-08-09 22:21:55 +00002715 /* Offsets from p for storing byte pairs in the right order. */
2716#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2717 int ihi = 1, ilo = 0;
2718#else
2719 int ihi = 0, ilo = 1;
2720#endif
2721
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002722#define STORECHAR(CH) \
2723 do { \
2724 p[ihi] = ((CH) >> 8) & 0xff; \
2725 p[ilo] = (CH) & 0xff; \
2726 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00002727 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002728
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002729#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002730 for (i = pairs = 0; i < size; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002731 if (s[i] >= 0x10000)
2732 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002733#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002734 /* 2 * (size + pairs + (byteorder == 0)) */
2735 if (size > PY_SSIZE_T_MAX ||
2736 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002737 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002738 nsize = size + pairs + (byteorder == 0);
2739 bytesize = nsize * 2;
2740 if (bytesize / 2 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002741 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002742 v = PyString_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002743 if (v == NULL)
2744 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002745
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002746 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002747 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002748 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002749 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002750 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002751
2752 if (byteorder == -1) {
2753 /* force LE */
2754 ihi = 1;
2755 ilo = 0;
2756 }
2757 else if (byteorder == 1) {
2758 /* force BE */
2759 ihi = 0;
2760 ilo = 1;
2761 }
2762
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002763 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002764 Py_UNICODE ch = *s++;
2765 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002766#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002767 if (ch >= 0x10000) {
2768 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2769 ch = 0xD800 | ((ch-0x10000) >> 10);
2770 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002771#endif
Tim Peters772747b2001-08-09 22:21:55 +00002772 STORECHAR(ch);
2773 if (ch2)
2774 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002775 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002776 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002777#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002778}
2779
2780PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2781{
2782 if (!PyUnicode_Check(unicode)) {
2783 PyErr_BadArgument();
2784 return NULL;
2785 }
2786 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002787 PyUnicode_GET_SIZE(unicode),
2788 NULL,
2789 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002790}
2791
/* --- Unicode Escape Codec ----------------------------------------------- */
2793
/* Character-name lookup C API used for \N escapes.  It starts out NULL;
   PyUnicode_DecodeUnicodeEscape() fills it in through PyCapsule_Import()
   the first time it meets a \N escape while the pointer is still NULL. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00002794static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002795
Guido van Rossumd57fd912000-03-10 22:53:23 +00002796PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002797 Py_ssize_t size,
2798 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002799{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002800 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002801 Py_ssize_t startinpos;
2802 Py_ssize_t endinpos;
2803 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002804 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002805 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002806 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002807 char* message;
2808 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002809 PyObject *errorHandler = NULL;
2810 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002811
Guido van Rossumd57fd912000-03-10 22:53:23 +00002812 /* Escaped strings will always be longer than the resulting
2813 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002814 length after conversion to the true value.
2815 (but if the error callback returns a long replacement string
2816 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002817 v = _PyUnicode_New(size);
2818 if (v == NULL)
2819 goto onError;
2820 if (size == 0)
2821 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002822
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002823 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002824 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002825
Guido van Rossumd57fd912000-03-10 22:53:23 +00002826 while (s < end) {
2827 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002828 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002829 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002830
2831 /* Non-escape characters are interpreted as Unicode ordinals */
2832 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002833 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002834 continue;
2835 }
2836
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002837 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002838 /* \ - Escapes */
2839 s++;
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002840 c = *s++;
2841 if (s > end)
2842 c = '\0'; /* Invalid after \ */
2843 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002844
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002845 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002846 case '\n': break;
2847 case '\\': *p++ = '\\'; break;
2848 case '\'': *p++ = '\''; break;
2849 case '\"': *p++ = '\"'; break;
2850 case 'b': *p++ = '\b'; break;
2851 case 'f': *p++ = '\014'; break; /* FF */
2852 case 't': *p++ = '\t'; break;
2853 case 'n': *p++ = '\n'; break;
2854 case 'r': *p++ = '\r'; break;
2855 case 'v': *p++ = '\013'; break; /* VT */
2856 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2857
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002858 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002859 case '0': case '1': case '2': case '3':
2860 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002861 x = s[-1] - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002862 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002863 x = (x<<3) + *s++ - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002864 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002865 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002866 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002867 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002868 break;
2869
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002870 /* hex escapes */
2871 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002872 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002873 digits = 2;
2874 message = "truncated \\xXX escape";
2875 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002876
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002877 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002878 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002879 digits = 4;
2880 message = "truncated \\uXXXX escape";
2881 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002882
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002883 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002884 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002885 digits = 8;
2886 message = "truncated \\UXXXXXXXX escape";
2887 hexescape:
2888 chr = 0;
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002889 if (end - s < digits) {
2890 /* count only hex digits */
2891 for (; s < end; ++s) {
2892 c = (unsigned char)*s;
2893 if (!Py_ISXDIGIT(c))
2894 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002895 }
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002896 goto error;
2897 }
2898 for (; digits--; ++s) {
2899 c = (unsigned char)*s;
2900 if (!Py_ISXDIGIT(c))
2901 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002902 chr = (chr<<4) & ~0xF;
2903 if (c >= '0' && c <= '9')
2904 chr += c - '0';
2905 else if (c >= 'a' && c <= 'f')
2906 chr += 10 + c - 'a';
2907 else
2908 chr += 10 + c - 'A';
2909 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002910 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002911 /* _decoding_error will have already written into the
2912 target buffer. */
2913 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002914 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002915 /* when we get here, chr is a 32-bit unicode character */
2916 if (chr <= 0xffff)
2917 /* UCS-2 character */
2918 *p++ = (Py_UNICODE) chr;
2919 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002920 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002921 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002922#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002923 *p++ = chr;
2924#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002925 chr -= 0x10000L;
2926 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002927 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002928#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002929 } else {
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002930 message = "illegal Unicode character";
2931 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002932 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002933 break;
2934
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002935 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00002936 case 'N':
2937 message = "malformed \\N character escape";
2938 if (ucnhash_CAPI == NULL) {
2939 /* load the unicode data module */
Larry Hastings402b73f2010-03-25 00:54:54 +00002940 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002941 if (ucnhash_CAPI == NULL)
2942 goto ucnhashError;
2943 }
2944 if (*s == '{') {
2945 const char *start = s+1;
2946 /* look for the closing brace */
2947 while (*s != '}' && s < end)
2948 s++;
2949 if (s > start && s < end && *s == '}') {
2950 /* found a name. look it up in the unicode database */
2951 message = "unknown Unicode character name";
2952 s++;
Serhiy Storchaka1d3acd42013-01-21 11:48:24 +02002953 if (s - start - 1 <= INT_MAX &&
2954 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002955 goto store;
2956 }
2957 }
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002958 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002959
2960 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002961 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002962 message = "\\ at end of string";
2963 s--;
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002964 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00002965 }
2966 else {
2967 *p++ = '\\';
2968 *p++ = (unsigned char)s[-1];
2969 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002970 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002971 }
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002972 continue;
2973
2974 error:
2975 endinpos = s-starts;
2976 outpos = p-PyUnicode_AS_UNICODE(v);
2977 if (unicode_decode_call_errorhandler(
2978 errors, &errorHandler,
2979 "unicodeescape", message,
2980 starts, size, &startinpos, &endinpos, &exc, &s,
2981 &v, &outpos, &p))
2982 goto onError;
2983 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002984 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002985 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002986 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002987 Py_XDECREF(errorHandler);
2988 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002989 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002990
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002991 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002992 PyErr_SetString(
2993 PyExc_UnicodeError,
2994 "\\N escapes not supported (can't load unicodedata module)"
2995 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002996 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002997 Py_XDECREF(errorHandler);
2998 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002999 return NULL;
3000
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003001 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003002 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003003 Py_XDECREF(errorHandler);
3004 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003005 return NULL;
3006}
3007
3008/* Return a Unicode-Escape string version of the Unicode object.
3009
3010 If quotes is true, the string is enclosed in u"" or u'' quotes as
3011 appropriate.
3012
3013*/
3014
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00003015Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003016 Py_ssize_t size,
3017 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00003018{
3019 /* like wcschr, but doesn't stop at NULL characters */
3020
3021 while (size-- > 0) {
3022 if (*s == ch)
3023 return s;
3024 s++;
3025 }
3026
3027 return NULL;
3028}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003029
Guido van Rossumd57fd912000-03-10 22:53:23 +00003030static
3031PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003032 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003033 int quotes)
3034{
3035 PyObject *repr;
3036 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003037
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003038 static const char *hexdigit = "0123456789abcdef";
Neal Norwitze7d8be82008-07-31 17:17:14 +00003039#ifdef Py_UNICODE_WIDE
3040 const Py_ssize_t expandsize = 10;
3041#else
3042 const Py_ssize_t expandsize = 6;
3043#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003044
Neal Norwitz17753ec2006-08-21 22:21:19 +00003045 /* XXX(nnorwitz): rather than over-allocating, it would be
3046 better to choose a different scheme. Perhaps scan the
3047 first N-chars of the string and allocate based on that size.
3048 */
3049 /* Initial allocation is based on the longest-possible unichr
3050 escape.
3051
3052 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3053 unichr, so in this case it's the longest unichr escape. In
3054 narrow (UTF-16) builds this is five chars per source unichr
3055 since there are two unichrs in the surrogate pair, so in narrow
3056 (UTF-16) builds it's not the longest unichr escape.
3057
3058 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3059 so in the narrow (UTF-16) build case it's the longest unichr
3060 escape.
3061 */
3062
Neal Norwitze7d8be82008-07-31 17:17:14 +00003063 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003064 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00003065
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003066 repr = PyString_FromStringAndSize(NULL,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003067 2
3068 + expandsize*size
3069 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003070 if (repr == NULL)
3071 return NULL;
3072
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003073 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003074
3075 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003076 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00003077 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00003078 !findchar(s, size, '"')) ? '"' : '\'';
3079 }
3080 while (size-- > 0) {
3081 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003082
Hye-Shik Chang835b2432005-12-17 04:38:31 +00003083 /* Escape quotes and backslashes */
3084 if ((quotes &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003085 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003086 *p++ = '\\';
3087 *p++ = (char) ch;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003088 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003089 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003090
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003091#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003092 /* Map 21-bit characters to '\U00xxxxxx' */
3093 else if (ch >= 0x10000) {
3094 *p++ = '\\';
3095 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003096 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
3097 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
3098 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
3099 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
3100 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
3101 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
3102 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003103 *p++ = hexdigit[ch & 0x0000000F];
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003104 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003105 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003106#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003107 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3108 else if (ch >= 0xD800 && ch < 0xDC00) {
3109 Py_UNICODE ch2;
3110 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003111
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003112 ch2 = *s++;
3113 size--;
Georg Brandle27d0442010-08-01 20:54:30 +00003114 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003115 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3116 *p++ = '\\';
3117 *p++ = 'U';
3118 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
3119 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
3120 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
3121 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
3122 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
3123 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
3124 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
3125 *p++ = hexdigit[ucs & 0x0000000F];
3126 continue;
3127 }
3128 /* Fall through: isolated surrogates are copied as-is */
3129 s--;
3130 size++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003131 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003132#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003133
Guido van Rossumd57fd912000-03-10 22:53:23 +00003134 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003135 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003136 *p++ = '\\';
3137 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003138 *p++ = hexdigit[(ch >> 12) & 0x000F];
3139 *p++ = hexdigit[(ch >> 8) & 0x000F];
3140 *p++ = hexdigit[(ch >> 4) & 0x000F];
3141 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003142 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003143
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003144 /* Map special whitespace to '\t', \n', '\r' */
3145 else if (ch == '\t') {
3146 *p++ = '\\';
3147 *p++ = 't';
3148 }
3149 else if (ch == '\n') {
3150 *p++ = '\\';
3151 *p++ = 'n';
3152 }
3153 else if (ch == '\r') {
3154 *p++ = '\\';
3155 *p++ = 'r';
3156 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003157
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003158 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003159 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003160 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003161 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003162 *p++ = hexdigit[(ch >> 4) & 0x000F];
3163 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003164 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003165
Guido van Rossumd57fd912000-03-10 22:53:23 +00003166 /* Copy everything else as-is */
3167 else
3168 *p++ = (char) ch;
3169 }
3170 if (quotes)
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003171 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003172
3173 *p = '\0';
Benjamin Petersonbea424a2010-04-03 00:57:33 +00003174 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
3175 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003176 return repr;
3177}
3178
3179PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003180 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003181{
3182 return unicodeescape_string(s, size, 0);
3183}
3184
3185PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3186{
3187 if (!PyUnicode_Check(unicode)) {
3188 PyErr_BadArgument();
3189 return NULL;
3190 }
3191 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003192 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003193}
3194
3195/* --- Raw Unicode Escape Codec ------------------------------------------- */
3196
3197PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003198 Py_ssize_t size,
3199 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003200{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003201 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003202 Py_ssize_t startinpos;
3203 Py_ssize_t endinpos;
3204 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003205 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003206 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003207 const char *end;
3208 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003209 PyObject *errorHandler = NULL;
3210 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003211
Guido van Rossumd57fd912000-03-10 22:53:23 +00003212 /* Escaped strings will always be longer than the resulting
3213 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003214 length after conversion to the true value. (But decoding error
3215 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003216 v = _PyUnicode_New(size);
3217 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003218 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003219 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003220 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003221 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003222 end = s + size;
3223 while (s < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003224 unsigned char c;
3225 Py_UCS4 x;
3226 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003227 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003228
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003229 /* Non-escape characters are interpreted as Unicode ordinals */
3230 if (*s != '\\') {
3231 *p++ = (unsigned char)*s++;
3232 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003233 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003234 startinpos = s-starts;
3235
3236 /* \u-escapes are only interpreted iff the number of leading
3237 backslashes if odd */
3238 bs = s;
3239 for (;s < end;) {
3240 if (*s != '\\')
3241 break;
3242 *p++ = (unsigned char)*s++;
3243 }
3244 if (((s - bs) & 1) == 0 ||
3245 s >= end ||
3246 (*s != 'u' && *s != 'U')) {
3247 continue;
3248 }
3249 p--;
3250 count = *s=='u' ? 4 : 8;
3251 s++;
3252
3253 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3254 outpos = p-PyUnicode_AS_UNICODE(v);
3255 for (x = 0, i = 0; i < count; ++i, ++s) {
3256 c = (unsigned char)*s;
3257 if (!isxdigit(c)) {
3258 endinpos = s-starts;
3259 if (unicode_decode_call_errorhandler(
3260 errors, &errorHandler,
3261 "rawunicodeescape", "truncated \\uXXXX",
3262 starts, size, &startinpos, &endinpos, &exc, &s,
3263 &v, &outpos, &p))
3264 goto onError;
3265 goto nextByte;
3266 }
3267 x = (x<<4) & ~0xF;
3268 if (c >= '0' && c <= '9')
3269 x += c - '0';
3270 else if (c >= 'a' && c <= 'f')
3271 x += 10 + c - 'a';
3272 else
3273 x += 10 + c - 'A';
3274 }
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003275 if (x <= 0xffff)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003276 /* UCS-2 character */
3277 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003278 else if (x <= 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003279 /* UCS-4 character. Either store directly, or as
3280 surrogate pair. */
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003281#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003282 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003283#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003284 x -= 0x10000L;
3285 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3286 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003287#endif
3288 } else {
3289 endinpos = s-starts;
3290 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003291 if (unicode_decode_call_errorhandler(
3292 errors, &errorHandler,
3293 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003294 starts, size, &startinpos, &endinpos, &exc, &s,
3295 &v, &outpos, &p))
3296 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003297 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003298 nextByte:
3299 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003300 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00003301 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003302 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003303 Py_XDECREF(errorHandler);
3304 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003305 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003306
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003307 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003308 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003309 Py_XDECREF(errorHandler);
3310 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003311 return NULL;
3312}
3313
3314PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003315 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003316{
3317 PyObject *repr;
3318 char *p;
3319 char *q;
3320
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003321 static const char *hexdigit = "0123456789abcdef";
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003322#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00003323 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003324#else
Neal Norwitze7d8be82008-07-31 17:17:14 +00003325 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003326#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00003327
Neal Norwitze7d8be82008-07-31 17:17:14 +00003328 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003329 return PyErr_NoMemory();
Benjamin Peterson857ce152009-01-31 16:29:18 +00003330
Neal Norwitze7d8be82008-07-31 17:17:14 +00003331 repr = PyString_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003332 if (repr == NULL)
3333 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003334 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003335 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003336
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003337 p = q = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003338 while (size-- > 0) {
3339 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003340#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003341 /* Map 32-bit characters to '\Uxxxxxxxx' */
3342 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003343 *p++ = '\\';
3344 *p++ = 'U';
3345 *p++ = hexdigit[(ch >> 28) & 0xf];
3346 *p++ = hexdigit[(ch >> 24) & 0xf];
3347 *p++ = hexdigit[(ch >> 20) & 0xf];
3348 *p++ = hexdigit[(ch >> 16) & 0xf];
3349 *p++ = hexdigit[(ch >> 12) & 0xf];
3350 *p++ = hexdigit[(ch >> 8) & 0xf];
3351 *p++ = hexdigit[(ch >> 4) & 0xf];
3352 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003353 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003354 else
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003355#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003356 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3357 if (ch >= 0xD800 && ch < 0xDC00) {
3358 Py_UNICODE ch2;
3359 Py_UCS4 ucs;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003360
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003361 ch2 = *s++;
3362 size--;
Georg Brandle27d0442010-08-01 20:54:30 +00003363 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003364 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3365 *p++ = '\\';
3366 *p++ = 'U';
3367 *p++ = hexdigit[(ucs >> 28) & 0xf];
3368 *p++ = hexdigit[(ucs >> 24) & 0xf];
3369 *p++ = hexdigit[(ucs >> 20) & 0xf];
3370 *p++ = hexdigit[(ucs >> 16) & 0xf];
3371 *p++ = hexdigit[(ucs >> 12) & 0xf];
3372 *p++ = hexdigit[(ucs >> 8) & 0xf];
3373 *p++ = hexdigit[(ucs >> 4) & 0xf];
3374 *p++ = hexdigit[ucs & 0xf];
3375 continue;
3376 }
3377 /* Fall through: isolated surrogates are copied as-is */
3378 s--;
3379 size++;
3380 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003381#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003382 /* Map 16-bit characters to '\uxxxx' */
3383 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003384 *p++ = '\\';
3385 *p++ = 'u';
3386 *p++ = hexdigit[(ch >> 12) & 0xf];
3387 *p++ = hexdigit[(ch >> 8) & 0xf];
3388 *p++ = hexdigit[(ch >> 4) & 0xf];
3389 *p++ = hexdigit[ch & 15];
3390 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003391 /* Copy everything else as-is */
3392 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003393 *p++ = (char) ch;
3394 }
3395 *p = '\0';
Benjamin Petersonbea424a2010-04-03 00:57:33 +00003396 if (_PyString_Resize(&repr, p - q))
3397 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003398 return repr;
3399}
3400
3401PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3402{
3403 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003404 PyErr_BadArgument();
3405 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003406 }
3407 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003408 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003409}
3410
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003411/* --- Unicode Internal Codec ------------------------------------------- */
3412
3413PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003414 Py_ssize_t size,
3415 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003416{
3417 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003418 Py_ssize_t startinpos;
3419 Py_ssize_t endinpos;
3420 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003421 PyUnicodeObject *v;
3422 Py_UNICODE *p;
3423 const char *end;
3424 const char *reason;
3425 PyObject *errorHandler = NULL;
3426 PyObject *exc = NULL;
3427
Neal Norwitzd43069c2006-01-08 01:12:10 +00003428#ifdef Py_UNICODE_WIDE
3429 Py_UNICODE unimax = PyUnicode_GetMax();
3430#endif
3431
Armin Rigo7ccbca92006-10-04 12:17:45 +00003432 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003433 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3434 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003435 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003436 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003437 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003438 p = PyUnicode_AS_UNICODE(v);
3439 end = s + size;
3440
3441 while (s < end) {
Serhiy Storchakad5327d92013-02-07 16:23:11 +02003442 if (end-s < Py_UNICODE_SIZE) {
3443 endinpos = end-starts;
3444 reason = "truncated input";
3445 goto error;
3446 }
Neal Norwitz1004a532006-05-15 07:17:23 +00003447 memcpy(p, s, sizeof(Py_UNICODE));
Serhiy Storchakad5327d92013-02-07 16:23:11 +02003448#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003449 /* We have to sanity check the raw data, otherwise doom looms for
3450 some malformed UCS-4 data. */
Serhiy Storchakad5327d92013-02-07 16:23:11 +02003451 if (*p > unimax || *p < 0) {
3452 endinpos = s - starts + Py_UNICODE_SIZE;
3453 reason = "illegal code point (> 0x10FFFF)";
3454 goto error;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003455 }
Serhiy Storchakad5327d92013-02-07 16:23:11 +02003456#endif
3457 p++;
3458 s += Py_UNICODE_SIZE;
3459 continue;
3460
3461 error:
3462 startinpos = s - starts;
3463 outpos = p - PyUnicode_AS_UNICODE(v);
3464 if (unicode_decode_call_errorhandler(
3465 errors, &errorHandler,
3466 "unicode_internal", reason,
3467 starts, size, &startinpos, &endinpos, &exc, &s,
3468 &v, &outpos, &p)) {
3469 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003470 }
3471 }
3472
Martin v. Löwis412fb672006-04-13 06:34:32 +00003473 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003474 goto onError;
3475 Py_XDECREF(errorHandler);
3476 Py_XDECREF(exc);
3477 return (PyObject *)v;
3478
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003479 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003480 Py_XDECREF(v);
3481 Py_XDECREF(errorHandler);
3482 Py_XDECREF(exc);
3483 return NULL;
3484}
3485
Guido van Rossumd57fd912000-03-10 22:53:23 +00003486/* --- Latin-1 Codec ------------------------------------------------------ */
3487
3488PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003489 Py_ssize_t size,
3490 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003491{
3492 PyUnicodeObject *v;
3493 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003494
Guido van Rossumd57fd912000-03-10 22:53:23 +00003495 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003496 if (size == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003497 Py_UNICODE r = *(unsigned char*)s;
3498 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003499 }
3500
Guido van Rossumd57fd912000-03-10 22:53:23 +00003501 v = _PyUnicode_New(size);
3502 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003503 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003504 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003505 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003506 p = PyUnicode_AS_UNICODE(v);
3507 while (size-- > 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003508 *p++ = (unsigned char)*s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003509 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003510
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003511 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003512 Py_XDECREF(v);
3513 return NULL;
3514}
3515
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003516/* create or adjust a UnicodeEncodeError */
3517static void make_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003518 const char *encoding,
3519 const Py_UNICODE *unicode, Py_ssize_t size,
3520 Py_ssize_t startpos, Py_ssize_t endpos,
3521 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003522{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003523 if (*exceptionObject == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003524 *exceptionObject = PyUnicodeEncodeError_Create(
3525 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003526 }
3527 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003528 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3529 goto onError;
3530 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3531 goto onError;
3532 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3533 goto onError;
3534 return;
3535 onError:
Serhiy Storchaka98a97222014-02-09 13:14:04 +02003536 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003537 }
3538}
3539
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003540/* raises a UnicodeEncodeError */
3541static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003542 const char *encoding,
3543 const Py_UNICODE *unicode, Py_ssize_t size,
3544 Py_ssize_t startpos, Py_ssize_t endpos,
3545 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003546{
3547 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003548 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003549 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003550 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003551}
3552
3553/* error handling callback helper:
3554 build arguments, call the callback and check the arguments,
3555 put the result into newpos and return the replacement string, which
3556 has to be freed by the caller */
3557static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003558 PyObject **errorHandler,
3559 const char *encoding, const char *reason,
3560 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3561 Py_ssize_t startpos, Py_ssize_t endpos,
3562 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003563{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003564 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003565
3566 PyObject *restuple;
3567 PyObject *resunicode;
3568
3569 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003570 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003571 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003572 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003573 }
3574
3575 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003576 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003577 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003578 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003579
3580 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003581 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003582 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003583 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003584 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00003585 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003586 Py_DECREF(restuple);
3587 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003588 }
3589 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003590 &resunicode, newpos)) {
3591 Py_DECREF(restuple);
3592 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003593 }
3594 if (*newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003595 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003596 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003597 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3598 Py_DECREF(restuple);
3599 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003600 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003601 Py_INCREF(resunicode);
3602 Py_DECREF(restuple);
3603 return resunicode;
3604}
3605
3606static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003607 Py_ssize_t size,
3608 const char *errors,
3609 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003610{
3611 /* output object */
3612 PyObject *res;
3613 /* pointers to the beginning and end+1 of input */
3614 const Py_UNICODE *startp = p;
3615 const Py_UNICODE *endp = p + size;
3616 /* pointer to the beginning of the unencodable characters */
3617 /* const Py_UNICODE *badp = NULL; */
3618 /* pointer into the output */
3619 char *str;
3620 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003621 Py_ssize_t respos = 0;
3622 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00003623 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3624 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003625 PyObject *errorHandler = NULL;
3626 PyObject *exc = NULL;
3627 /* the following variable is used for caching string comparisons
3628 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3629 int known_errorHandler = -1;
3630
3631 /* allocate enough for a simple encoding without
3632 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003633 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003634 if (res == NULL)
3635 goto onError;
3636 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003637 return res;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003638 str = PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003639 ressize = size;
3640
3641 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003642 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003643
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003644 /* can we encode this? */
3645 if (c<limit) {
3646 /* no overflow check, because we know that the space is enough */
3647 *str++ = (char)c;
3648 ++p;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003649 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003650 else {
3651 Py_ssize_t unicodepos = p-startp;
3652 Py_ssize_t requiredsize;
3653 PyObject *repunicode;
3654 Py_ssize_t repsize;
3655 Py_ssize_t newpos;
3656 Py_ssize_t respos;
3657 Py_UNICODE *uni2;
3658 /* startpos for collecting unencodable chars */
3659 const Py_UNICODE *collstart = p;
3660 const Py_UNICODE *collend = p;
3661 /* find all unecodable characters */
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003662 while ((collend < endp) && ((*collend) >= limit))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003663 ++collend;
3664 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3665 if (known_errorHandler==-1) {
3666 if ((errors==NULL) || (!strcmp(errors, "strict")))
3667 known_errorHandler = 1;
3668 else if (!strcmp(errors, "replace"))
3669 known_errorHandler = 2;
3670 else if (!strcmp(errors, "ignore"))
3671 known_errorHandler = 3;
3672 else if (!strcmp(errors, "xmlcharrefreplace"))
3673 known_errorHandler = 4;
3674 else
3675 known_errorHandler = 0;
3676 }
3677 switch (known_errorHandler) {
3678 case 1: /* strict */
3679 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3680 goto onError;
3681 case 2: /* replace */
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003682 while (collstart++ < collend)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003683 *str++ = '?'; /* fall through */
3684 case 3: /* ignore */
3685 p = collend;
3686 break;
3687 case 4: /* xmlcharrefreplace */
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003688 respos = str - PyString_AS_STRING(res);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003689 /* determine replacement size (temporarily (mis)uses p) */
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003690 requiredsize = respos;
3691 for (p = collstart; p < collend;) {
Serhiy Storchakae822b032013-08-06 16:56:26 +03003692 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003693 Py_ssize_t incr;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003694 if (ch < 10)
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003695 incr = 2+1+1;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003696 else if (ch < 100)
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003697 incr = 2+2+1;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003698 else if (ch < 1000)
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003699 incr = 2+3+1;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003700 else if (ch < 10000)
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003701 incr = 2+4+1;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003702 else if (ch < 100000)
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003703 incr = 2+5+1;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003704 else if (ch < 1000000)
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003705 incr = 2+6+1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003706 else
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003707 incr = 2+7+1;
3708 if (requiredsize > PY_SSIZE_T_MAX - incr)
3709 goto overflow;
3710 requiredsize += incr;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003711 }
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003712 if (requiredsize > PY_SSIZE_T_MAX - (endp - collend))
3713 goto overflow;
3714 requiredsize += endp - collend;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003715 if (requiredsize > ressize) {
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003716 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003717 requiredsize = 2*ressize;
3718 if (_PyString_Resize(&res, requiredsize))
3719 goto onError;
3720 str = PyString_AS_STRING(res) + respos;
3721 ressize = requiredsize;
3722 }
3723 /* generate replacement (temporarily (mis)uses p) */
Serhiy Storchakae822b032013-08-06 16:56:26 +03003724 for (p = collstart; p < collend;) {
3725 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
3726 str += sprintf(str, "&#%d;", (int)ch);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003727 }
3728 p = collend;
3729 break;
3730 default:
3731 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3732 encoding, reason, startp, size, &exc,
3733 collstart-startp, collend-startp, &newpos);
3734 if (repunicode == NULL)
3735 goto onError;
3736 /* need more space? (at least enough for what we have+the
3737 replacement+the rest of the string, so we won't have to
3738 check space for encodable characters) */
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003739 respos = str - PyString_AS_STRING(res);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003740 repsize = PyUnicode_GET_SIZE(repunicode);
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003741 if (respos > PY_SSIZE_T_MAX - repsize)
3742 goto overflow;
3743 requiredsize = respos + repsize;
3744 if (requiredsize > PY_SSIZE_T_MAX - (endp - collend))
3745 goto overflow;
3746 requiredsize += endp - collend;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003747 if (requiredsize > ressize) {
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003748 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003749 requiredsize = 2*ressize;
3750 if (_PyString_Resize(&res, requiredsize)) {
3751 Py_DECREF(repunicode);
3752 goto onError;
3753 }
3754 str = PyString_AS_STRING(res) + respos;
3755 ressize = requiredsize;
3756 }
3757 /* check if there is anything unencodable in the replacement
3758 and copy it to the output */
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003759 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2, ++str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003760 c = *uni2;
3761 if (c >= limit) {
3762 raise_encode_exception(&exc, encoding, startp, size,
3763 unicodepos, unicodepos+1, reason);
3764 Py_DECREF(repunicode);
3765 goto onError;
3766 }
3767 *str = (char)c;
3768 }
3769 p = startp + newpos;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003770 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00003771 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00003772 }
3773 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003774 /* Resize if we allocated to much */
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003775 respos = str - PyString_AS_STRING(res);
3776 if (respos < ressize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003777 /* If this falls res will be NULL */
3778 _PyString_Resize(&res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003779 Py_XDECREF(errorHandler);
3780 Py_XDECREF(exc);
3781 return res;
3782
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003783 overflow:
3784 PyErr_SetString(PyExc_OverflowError,
3785 "encoded result is too long for a Python string");
3786
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003787 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003788 Py_XDECREF(res);
3789 Py_XDECREF(errorHandler);
3790 Py_XDECREF(exc);
3791 return NULL;
3792}
3793
Guido van Rossumd57fd912000-03-10 22:53:23 +00003794PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003795 Py_ssize_t size,
3796 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003797{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003798 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003799}
3800
3801PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3802{
3803 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003804 PyErr_BadArgument();
3805 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003806 }
3807 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003808 PyUnicode_GET_SIZE(unicode),
3809 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003810}
3811
3812/* --- 7-bit ASCII Codec -------------------------------------------------- */
3813
Guido van Rossumd57fd912000-03-10 22:53:23 +00003814PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003815 Py_ssize_t size,
3816 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003817{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003818 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003819 PyUnicodeObject *v;
3820 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003821 Py_ssize_t startinpos;
3822 Py_ssize_t endinpos;
3823 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003824 const char *e;
3825 PyObject *errorHandler = NULL;
3826 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003827
Guido van Rossumd57fd912000-03-10 22:53:23 +00003828 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003829 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003830 Py_UNICODE r = *(unsigned char*)s;
3831 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003832 }
Tim Petersced69f82003-09-16 20:30:58 +00003833
Guido van Rossumd57fd912000-03-10 22:53:23 +00003834 v = _PyUnicode_New(size);
3835 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003836 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003837 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003838 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003839 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003840 e = s + size;
3841 while (s < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003842 register unsigned char c = (unsigned char)*s;
3843 if (c < 128) {
3844 *p++ = c;
3845 ++s;
3846 }
3847 else {
3848 startinpos = s-starts;
3849 endinpos = startinpos + 1;
3850 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3851 if (unicode_decode_call_errorhandler(
3852 errors, &errorHandler,
3853 "ascii", "ordinal not in range(128)",
3854 starts, size, &startinpos, &endinpos, &exc, &s,
3855 &v, &outpos, &p))
3856 goto onError;
3857 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003858 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003859 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003860 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3861 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003862 Py_XDECREF(errorHandler);
3863 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003864 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003865
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003866 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003867 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003868 Py_XDECREF(errorHandler);
3869 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003870 return NULL;
3871}
3872
Guido van Rossumd57fd912000-03-10 22:53:23 +00003873PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003874 Py_ssize_t size,
3875 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003876{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003877 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003878}
3879
3880PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3881{
3882 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003883 PyErr_BadArgument();
3884 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003885 }
3886 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003887 PyUnicode_GET_SIZE(unicode),
3888 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003889}
3890
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003891#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003892
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003893/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003894
Hirokazu Yamamoto52a34922009-03-21 10:32:52 +00003895#if SIZEOF_INT < SIZEOF_SIZE_T
Martin v. Löwisd8251432006-06-14 05:21:04 +00003896#define NEED_RETRY
3897#endif
3898
3899/* XXX This code is limited to "true" double-byte encodings, as
3900 a) it assumes an incomplete character consists of a single byte, and
3901 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003902 encodings, see IsDBCSLeadByteEx documentation. */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003903
/* Return 1 if the byte at s[offset] is treated as a DBCS lead byte that
   starts a character, 0 otherwise.  The byte must satisfy IsDBCSLeadByte();
   CharPrev() is then used to look at the preceding character so that a
   trail byte which merely looks like a lead byte is rejected. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        /* nothing precedes curr */
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    /* previous character is a complete two-byte character */
    return curr - prev == 2;
}
3914
3915/*
3916 * Decode MBCS string into unicode object. If 'final' is set, converts
3917 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3918 */
3919static int decode_mbcs(PyUnicodeObject **v,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003920 const char *s, /* MBCS string */
3921 int size, /* sizeof MBCS string */
3922 int final)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003923{
3924 Py_UNICODE *p;
3925 Py_ssize_t n = 0;
3926 int usize = 0;
3927
3928 assert(size >= 0);
3929
3930 /* Skip trailing lead-byte unless 'final' is set */
3931 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003932 --size;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003933
3934 /* First get the size of the result */
3935 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003936 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3937 if (usize == 0) {
3938 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3939 return -1;
3940 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003941 }
3942
3943 if (*v == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003944 /* Create unicode object */
3945 *v = _PyUnicode_New(usize);
3946 if (*v == NULL)
3947 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003948 }
3949 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003950 /* Extend unicode object */
3951 n = PyUnicode_GET_SIZE(*v);
3952 if (_PyUnicode_Resize(v, n + usize) < 0)
3953 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003954 }
3955
3956 /* Do the conversion */
3957 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003958 p = PyUnicode_AS_UNICODE(*v) + n;
3959 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3960 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3961 return -1;
3962 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003963 }
3964
3965 return size;
3966}
3967
3968PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003969 Py_ssize_t size,
3970 const char *errors,
3971 Py_ssize_t *consumed)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003972{
3973 PyUnicodeObject *v = NULL;
3974 int done;
3975
3976 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003977 *consumed = 0;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003978
3979#ifdef NEED_RETRY
3980 retry:
3981 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003982 done = decode_mbcs(&v, s, INT_MAX, 0);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003983 else
3984#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003985 done = decode_mbcs(&v, s, (int)size, !consumed);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003986
3987 if (done < 0) {
3988 Py_XDECREF(v);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003989 return NULL;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003990 }
3991
3992 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003993 *consumed += done;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003994
3995#ifdef NEED_RETRY
3996 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003997 s += done;
3998 size -= done;
3999 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004000 }
4001#endif
4002
4003 return (PyObject *)v;
4004}
4005
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004006PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004007 Py_ssize_t size,
4008 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004009{
Martin v. Löwisd8251432006-06-14 05:21:04 +00004010 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4011}
4012
4013/*
4014 * Convert unicode into string object (MBCS).
4015 * Returns 0 if succeed, -1 otherwise.
4016 */
4017static int encode_mbcs(PyObject **repr,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004018 const Py_UNICODE *p, /* unicode */
4019 int size) /* size of unicode */
Martin v. Löwisd8251432006-06-14 05:21:04 +00004020{
4021 int mbcssize = 0;
4022 Py_ssize_t n = 0;
4023
4024 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004025
4026 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00004027 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004028 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4029 if (mbcssize == 0) {
4030 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4031 return -1;
4032 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004033 }
4034
Martin v. Löwisd8251432006-06-14 05:21:04 +00004035 if (*repr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004036 /* Create string object */
4037 *repr = PyString_FromStringAndSize(NULL, mbcssize);
4038 if (*repr == NULL)
4039 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004040 }
4041 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004042 /* Extend string object */
4043 n = PyString_Size(*repr);
4044 if (_PyString_Resize(repr, n + mbcssize) < 0)
4045 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004046 }
4047
4048 /* Do the conversion */
4049 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004050 char *s = PyString_AS_STRING(*repr) + n;
4051 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4052 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4053 return -1;
4054 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00004055 }
4056
4057 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004058}
4059
4060PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004061 Py_ssize_t size,
4062 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004063{
Martin v. Löwisd8251432006-06-14 05:21:04 +00004064 PyObject *repr = NULL;
4065 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004066
Martin v. Löwisd8251432006-06-14 05:21:04 +00004067#ifdef NEED_RETRY
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004068 retry:
Martin v. Löwisd8251432006-06-14 05:21:04 +00004069 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004070 ret = encode_mbcs(&repr, p, INT_MAX);
Martin v. Löwisd8251432006-06-14 05:21:04 +00004071 else
4072#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004073 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004074
Martin v. Löwisd8251432006-06-14 05:21:04 +00004075 if (ret < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004076 Py_XDECREF(repr);
4077 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004078 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00004079
4080#ifdef NEED_RETRY
4081 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004082 p += INT_MAX;
4083 size -= INT_MAX;
4084 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004085 }
4086#endif
4087
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004088 return repr;
4089}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004090
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004091PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4092{
4093 if (!PyUnicode_Check(unicode)) {
4094 PyErr_BadArgument();
4095 return NULL;
4096 }
4097 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004098 PyUnicode_GET_SIZE(unicode),
4099 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004100}
4101
Martin v. Löwisd8251432006-06-14 05:21:04 +00004102#undef NEED_RETRY
4103
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004104#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004105
Guido van Rossumd57fd912000-03-10 22:53:23 +00004106/* --- Character Mapping Codec -------------------------------------------- */
4107
Guido van Rossumd57fd912000-03-10 22:53:23 +00004108PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004109 Py_ssize_t size,
4110 PyObject *mapping,
4111 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004112{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004113 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004114 Py_ssize_t startinpos;
4115 Py_ssize_t endinpos;
4116 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004117 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004118 PyUnicodeObject *v;
4119 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004120 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004121 PyObject *errorHandler = NULL;
4122 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004123 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004124 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004125
Guido van Rossumd57fd912000-03-10 22:53:23 +00004126 /* Default to Latin-1 */
4127 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004128 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004129
4130 v = _PyUnicode_New(size);
4131 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004132 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004133 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004134 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004135 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004136 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004137 if (PyUnicode_CheckExact(mapping)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004138 mapstring = PyUnicode_AS_UNICODE(mapping);
4139 maplen = PyUnicode_GET_SIZE(mapping);
4140 while (s < e) {
4141 unsigned char ch = *s;
4142 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004143
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004144 if (ch < maplen)
4145 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004146
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004147 if (x == 0xfffe) {
4148 /* undefined mapping */
4149 outpos = p-PyUnicode_AS_UNICODE(v);
4150 startinpos = s-starts;
4151 endinpos = startinpos+1;
4152 if (unicode_decode_call_errorhandler(
4153 errors, &errorHandler,
4154 "charmap", "character maps to <undefined>",
4155 starts, size, &startinpos, &endinpos, &exc, &s,
4156 &v, &outpos, &p)) {
4157 goto onError;
4158 }
4159 continue;
4160 }
4161 *p++ = x;
4162 ++s;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004163 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004164 }
4165 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004166 while (s < e) {
4167 unsigned char ch = *s;
4168 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004169
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004170 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4171 w = PyInt_FromLong((long)ch);
4172 if (w == NULL)
4173 goto onError;
4174 x = PyObject_GetItem(mapping, w);
4175 Py_DECREF(w);
4176 if (x == NULL) {
4177 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4178 /* No mapping found means: mapping is undefined. */
4179 PyErr_Clear();
Serhiy Storchaka95997452013-01-15 14:42:59 +02004180 goto Undefined;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004181 } else
4182 goto onError;
4183 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004184
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004185 /* Apply mapping */
Serhiy Storchaka95997452013-01-15 14:42:59 +02004186 if (x == Py_None)
4187 goto Undefined;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004188 if (PyInt_Check(x)) {
4189 long value = PyInt_AS_LONG(x);
Serhiy Storchaka95997452013-01-15 14:42:59 +02004190 if (value == 0xFFFE)
4191 goto Undefined;
Antoine Pitroue3ae3212012-11-17 21:14:58 +01004192 if (value < 0 || value > 0x10FFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004193 PyErr_SetString(PyExc_TypeError,
Antoine Pitroue3ae3212012-11-17 21:14:58 +01004194 "character mapping must be in range(0x110000)");
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004195 Py_DECREF(x);
4196 goto onError;
4197 }
Antoine Pitroue3ae3212012-11-17 21:14:58 +01004198
4199#ifndef Py_UNICODE_WIDE
4200 if (value > 0xFFFF) {
4201 /* see the code for 1-n mapping below */
4202 if (extrachars < 2) {
4203 /* resize first */
4204 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4205 Py_ssize_t needed = 10 - extrachars;
4206 extrachars += needed;
4207 /* XXX overflow detection missing */
4208 if (_PyUnicode_Resize(&v,
4209 PyUnicode_GET_SIZE(v) + needed) < 0) {
4210 Py_DECREF(x);
4211 goto onError;
4212 }
4213 p = PyUnicode_AS_UNICODE(v) + oldpos;
4214 }
4215 value -= 0x10000;
4216 *p++ = 0xD800 | (value >> 10);
4217 *p++ = 0xDC00 | (value & 0x3FF);
4218 extrachars -= 2;
4219 }
4220 else
4221#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004222 *p++ = (Py_UNICODE)value;
4223 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004224 else if (PyUnicode_Check(x)) {
4225 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004226
Serhiy Storchaka95997452013-01-15 14:42:59 +02004227 if (targetsize == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004228 /* 1-1 mapping */
Serhiy Storchaka95997452013-01-15 14:42:59 +02004229 Py_UNICODE value = *PyUnicode_AS_UNICODE(x);
4230 if (value == 0xFFFE)
4231 goto Undefined;
4232 *p++ = value;
4233 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004234 else if (targetsize > 1) {
4235 /* 1-n mapping */
4236 if (targetsize > extrachars) {
4237 /* resize first */
4238 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4239 Py_ssize_t needed = (targetsize - extrachars) + \
4240 (targetsize << 2);
4241 extrachars += needed;
4242 /* XXX overflow detection missing */
4243 if (_PyUnicode_Resize(&v,
4244 PyUnicode_GET_SIZE(v) + needed) < 0) {
4245 Py_DECREF(x);
4246 goto onError;
4247 }
4248 p = PyUnicode_AS_UNICODE(v) + oldpos;
4249 }
4250 Py_UNICODE_COPY(p,
4251 PyUnicode_AS_UNICODE(x),
4252 targetsize);
4253 p += targetsize;
4254 extrachars -= targetsize;
4255 }
4256 /* 1-0 mapping: skip the character */
4257 }
4258 else {
4259 /* wrong return value */
4260 PyErr_SetString(PyExc_TypeError,
4261 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004262 Py_DECREF(x);
4263 goto onError;
4264 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004265 Py_DECREF(x);
4266 ++s;
Serhiy Storchaka95997452013-01-15 14:42:59 +02004267 continue;
4268Undefined:
4269 /* undefined mapping */
4270 Py_XDECREF(x);
4271 outpos = p-PyUnicode_AS_UNICODE(v);
4272 startinpos = s-starts;
4273 endinpos = startinpos+1;
4274 if (unicode_decode_call_errorhandler(
4275 errors, &errorHandler,
4276 "charmap", "character maps to <undefined>",
4277 starts, size, &startinpos, &endinpos, &exc, &s,
4278 &v, &outpos, &p)) {
4279 goto onError;
4280 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004281 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004282 }
4283 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004284 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4285 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004286 Py_XDECREF(errorHandler);
4287 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004288 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004289
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004290 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004291 Py_XDECREF(errorHandler);
4292 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004293 Py_XDECREF(v);
4294 return NULL;
4295}
4296
Martin v. Löwis3f767792006-06-04 19:36:28 +00004297/* Charmap encoding: the lookup table */
4298
4299struct encoding_map{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004300 PyObject_HEAD
4301 unsigned char level1[32];
4302 int count2, count3;
4303 unsigned char level23[1];
Martin v. Löwis3f767792006-06-04 19:36:28 +00004304};
4305
4306static PyObject*
4307encoding_map_size(PyObject *obj, PyObject* args)
4308{
4309 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004310 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Martin v. Löwis3f767792006-06-04 19:36:28 +00004311 128*map->count3);
4312}
4313
4314static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004315 {"size", encoding_map_size, METH_NOARGS,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004316 PyDoc_STR("Return the size (in bytes) of this object") },
4317 { 0 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004318};
4319
4320static void
4321encoding_map_dealloc(PyObject* o)
4322{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004323 PyObject_FREE(o);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004324}
4325
4326static PyTypeObject EncodingMapType = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004327 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004328 "EncodingMap", /*tp_name*/
4329 sizeof(struct encoding_map), /*tp_basicsize*/
4330 0, /*tp_itemsize*/
4331 /* methods */
4332 encoding_map_dealloc, /*tp_dealloc*/
4333 0, /*tp_print*/
4334 0, /*tp_getattr*/
4335 0, /*tp_setattr*/
4336 0, /*tp_compare*/
4337 0, /*tp_repr*/
4338 0, /*tp_as_number*/
4339 0, /*tp_as_sequence*/
4340 0, /*tp_as_mapping*/
4341 0, /*tp_hash*/
4342 0, /*tp_call*/
4343 0, /*tp_str*/
4344 0, /*tp_getattro*/
4345 0, /*tp_setattro*/
4346 0, /*tp_as_buffer*/
4347 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4348 0, /*tp_doc*/
4349 0, /*tp_traverse*/
4350 0, /*tp_clear*/
4351 0, /*tp_richcompare*/
4352 0, /*tp_weaklistoffset*/
4353 0, /*tp_iter*/
4354 0, /*tp_iternext*/
4355 encoding_map_methods, /*tp_methods*/
4356 0, /*tp_members*/
4357 0, /*tp_getset*/
4358 0, /*tp_base*/
4359 0, /*tp_dict*/
4360 0, /*tp_descr_get*/
4361 0, /*tp_descr_set*/
4362 0, /*tp_dictoffset*/
4363 0, /*tp_init*/
4364 0, /*tp_alloc*/
4365 0, /*tp_new*/
4366 0, /*tp_free*/
4367 0, /*tp_is_gc*/
Martin v. Löwis3f767792006-06-04 19:36:28 +00004368};
4369
4370PyObject*
4371PyUnicode_BuildEncodingMap(PyObject* string)
4372{
4373 Py_UNICODE *decode;
4374 PyObject *result;
4375 struct encoding_map *mresult;
4376 int i;
4377 int need_dict = 0;
4378 unsigned char level1[32];
4379 unsigned char level2[512];
4380 unsigned char *mlevel1, *mlevel2, *mlevel3;
4381 int count2 = 0, count3 = 0;
4382
4383 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4384 PyErr_BadArgument();
4385 return NULL;
4386 }
4387 decode = PyUnicode_AS_UNICODE(string);
4388 memset(level1, 0xFF, sizeof level1);
4389 memset(level2, 0xFF, sizeof level2);
4390
4391 /* If there isn't a one-to-one mapping of NULL to \0,
4392 or if there are non-BMP characters, we need to use
4393 a mapping dictionary. */
4394 if (decode[0] != 0)
4395 need_dict = 1;
4396 for (i = 1; i < 256; i++) {
4397 int l1, l2;
4398 if (decode[i] == 0
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004399#ifdef Py_UNICODE_WIDE
Martin v. Löwis3f767792006-06-04 19:36:28 +00004400 || decode[i] > 0xFFFF
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004401#endif
4402 ) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004403 need_dict = 1;
4404 break;
4405 }
4406 if (decode[i] == 0xFFFE)
4407 /* unmapped character */
4408 continue;
4409 l1 = decode[i] >> 11;
4410 l2 = decode[i] >> 7;
4411 if (level1[l1] == 0xFF)
4412 level1[l1] = count2++;
4413 if (level2[l2] == 0xFF)
Benjamin Peterson857ce152009-01-31 16:29:18 +00004414 level2[l2] = count3++;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004415 }
4416
4417 if (count2 >= 0xFF || count3 >= 0xFF)
4418 need_dict = 1;
4419
4420 if (need_dict) {
4421 PyObject *result = PyDict_New();
4422 PyObject *key, *value;
4423 if (!result)
4424 return NULL;
4425 for (i = 0; i < 256; i++) {
Brett Cannona7f13ee2010-05-04 01:16:51 +00004426 value = NULL;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004427 key = PyInt_FromLong(decode[i]);
4428 value = PyInt_FromLong(i);
4429 if (!key || !value)
4430 goto failed1;
4431 if (PyDict_SetItem(result, key, value) == -1)
4432 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00004433 Py_DECREF(key);
4434 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004435 }
4436 return result;
4437 failed1:
4438 Py_XDECREF(key);
4439 Py_XDECREF(value);
4440 Py_DECREF(result);
4441 return NULL;
4442 }
4443
4444 /* Create a three-level trie */
4445 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4446 16*count2 + 128*count3 - 1);
4447 if (!result)
4448 return PyErr_NoMemory();
4449 PyObject_Init(result, &EncodingMapType);
4450 mresult = (struct encoding_map*)result;
4451 mresult->count2 = count2;
4452 mresult->count3 = count3;
4453 mlevel1 = mresult->level1;
4454 mlevel2 = mresult->level23;
4455 mlevel3 = mresult->level23 + 16*count2;
4456 memcpy(mlevel1, level1, 32);
4457 memset(mlevel2, 0xFF, 16*count2);
4458 memset(mlevel3, 0, 128*count3);
4459 count3 = 0;
4460 for (i = 1; i < 256; i++) {
4461 int o1, o2, o3, i2, i3;
4462 if (decode[i] == 0xFFFE)
4463 /* unmapped character */
4464 continue;
4465 o1 = decode[i]>>11;
4466 o2 = (decode[i]>>7) & 0xF;
4467 i2 = 16*mlevel1[o1] + o2;
4468 if (mlevel2[i2] == 0xFF)
4469 mlevel2[i2] = count3++;
4470 o3 = decode[i] & 0x7F;
4471 i3 = 128*mlevel2[i2] + o3;
4472 mlevel3[i3] = i;
4473 }
4474 return result;
4475}
4476
4477static int
4478encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4479{
4480 struct encoding_map *map = (struct encoding_map*)mapping;
4481 int l1 = c>>11;
4482 int l2 = (c>>7) & 0xF;
4483 int l3 = c & 0x7F;
4484 int i;
4485
4486#ifdef Py_UNICODE_WIDE
4487 if (c > 0xFFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004488 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004489 }
4490#endif
4491 if (c == 0)
4492 return 0;
4493 /* level 1*/
4494 i = map->level1[l1];
4495 if (i == 0xFF) {
4496 return -1;
4497 }
4498 /* level 2*/
4499 i = map->level23[16*i+l2];
4500 if (i == 0xFF) {
4501 return -1;
4502 }
4503 /* level 3 */
4504 i = map->level23[16*map->count2 + 128*i + l3];
4505 if (i == 0) {
4506 return -1;
4507 }
4508 return i;
4509}
4510
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004511/* Lookup the character ch in the mapping. If the character
4512 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004513 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004514static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004515{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004516 PyObject *w = PyInt_FromLong((long)c);
4517 PyObject *x;
4518
4519 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004520 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004521 x = PyObject_GetItem(mapping, w);
4522 Py_DECREF(w);
4523 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004524 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4525 /* No mapping found means: mapping is undefined. */
4526 PyErr_Clear();
4527 x = Py_None;
4528 Py_INCREF(x);
4529 return x;
4530 } else
4531 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004532 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004533 else if (x == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004534 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004535 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004536 long value = PyInt_AS_LONG(x);
4537 if (value < 0 || value > 255) {
4538 PyErr_SetString(PyExc_TypeError,
4539 "character mapping must be in range(256)");
4540 Py_DECREF(x);
4541 return NULL;
4542 }
4543 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004544 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004545 else if (PyString_Check(x))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004546 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004547 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004548 /* wrong return value */
4549 PyErr_SetString(PyExc_TypeError,
4550 "character mapping must return integer, None or str");
4551 Py_DECREF(x);
4552 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004553 }
4554}
4555
Martin v. Löwis3f767792006-06-04 19:36:28 +00004556static int
4557charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4558{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004559 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4560 /* exponentially overallocate to minimize reallocations */
4561 if (requiredsize < 2*outsize)
4562 requiredsize = 2*outsize;
4563 if (_PyString_Resize(outobj, requiredsize)) {
4564 return 0;
4565 }
4566 return 1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004567}
4568
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was written to the output */
    enc_FAILED,         /* the mapping has no entry for the character */
    enc_EXCEPTION       /* the lookup or the output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004572/* lookup the character, put the result in the output string and adjust
4573 various state variables. Reallocate the output string if not enough
4574 space is available. Return a new reference to the object that
4575 was put in the output buffer, or Py_None, if the mapping was undefined
4576 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004577 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004578static
Martin v. Löwis3f767792006-06-04 19:36:28 +00004579charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004580 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004581{
Martin v. Löwis3f767792006-06-04 19:36:28 +00004582 PyObject *rep;
4583 char *outstart;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004584 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004585
Christian Heimese93237d2007-12-19 02:37:44 +00004586 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004587 int res = encoding_map_lookup(c, mapping);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004588 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004589 if (res == -1)
4590 return enc_FAILED;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004591 if (outsize<requiredsize)
4592 if (!charmapencode_resize(outobj, outpos, requiredsize))
4593 return enc_EXCEPTION;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004594 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004595 outstart[(*outpos)++] = (char)res;
4596 return enc_SUCCESS;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004597 }
4598
4599 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004600 if (rep==NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004601 return enc_EXCEPTION;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004602 else if (rep==Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004603 Py_DECREF(rep);
4604 return enc_FAILED;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004605 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004606 if (PyInt_Check(rep)) {
4607 Py_ssize_t requiredsize = *outpos+1;
4608 if (outsize<requiredsize)
4609 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4610 Py_DECREF(rep);
4611 return enc_EXCEPTION;
4612 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004613 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004614 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004615 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004616 else {
4617 const char *repchars = PyString_AS_STRING(rep);
4618 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4619 Py_ssize_t requiredsize = *outpos+repsize;
4620 if (outsize<requiredsize)
4621 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4622 Py_DECREF(rep);
4623 return enc_EXCEPTION;
4624 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004625 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004626 memcpy(outstart + *outpos, repchars, repsize);
4627 *outpos += repsize;
4628 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004629 }
Georg Brandl9f167602006-06-04 21:46:16 +00004630 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004631 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004632}
4633
4634/* handle an error in PyUnicode_EncodeCharmap
4635 Return 0 on success, -1 on error */
4636static
4637int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004638 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004639 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004640 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004641 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004642{
4643 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004644 Py_ssize_t repsize;
4645 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004646 Py_UNICODE *uni2;
4647 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004648 Py_ssize_t collstartpos = *inpos;
4649 Py_ssize_t collendpos = *inpos+1;
4650 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004651 char *encoding = "charmap";
4652 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00004653 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004654
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004655 /* find all unencodable characters */
4656 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004657 PyObject *rep;
Christian Heimese93237d2007-12-19 02:37:44 +00004658 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004659 int res = encoding_map_lookup(p[collendpos], mapping);
4660 if (res != -1)
4661 break;
4662 ++collendpos;
4663 continue;
4664 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004665
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004666 rep = charmapencode_lookup(p[collendpos], mapping);
4667 if (rep==NULL)
4668 return -1;
4669 else if (rep!=Py_None) {
4670 Py_DECREF(rep);
4671 break;
4672 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004673 Py_DECREF(rep);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004674 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004675 }
4676 /* cache callback name lookup
4677 * (if not done yet, i.e. it's the first error) */
4678 if (*known_errorHandler==-1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004679 if ((errors==NULL) || (!strcmp(errors, "strict")))
4680 *known_errorHandler = 1;
4681 else if (!strcmp(errors, "replace"))
4682 *known_errorHandler = 2;
4683 else if (!strcmp(errors, "ignore"))
4684 *known_errorHandler = 3;
4685 else if (!strcmp(errors, "xmlcharrefreplace"))
4686 *known_errorHandler = 4;
4687 else
4688 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004689 }
4690 switch (*known_errorHandler) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004691 case 1: /* strict */
4692 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4693 return -1;
4694 case 2: /* replace */
4695 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004696 x = charmapencode_output('?', mapping, res, respos);
4697 if (x==enc_EXCEPTION) {
4698 return -1;
4699 }
4700 else if (x==enc_FAILED) {
4701 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4702 return -1;
4703 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004704 }
4705 /* fall through */
4706 case 3: /* ignore */
4707 *inpos = collendpos;
4708 break;
4709 case 4: /* xmlcharrefreplace */
Serhiy Storchakae822b032013-08-06 16:56:26 +03004710 /* generate replacement */
4711 for (collpos = collstartpos; collpos < collendpos;) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004712 char buffer[2+29+1+1];
4713 char *cp;
Serhiy Storchakae822b032013-08-06 16:56:26 +03004714 Py_UCS4 ch = p[collpos++];
4715#ifndef Py_UNICODE_WIDE
4716 if ((0xD800 <= ch && ch <= 0xDBFF) &&
4717 (collpos < collendpos) &&
4718 (0xDC00 <= p[collpos] && p[collpos] <= 0xDFFF)) {
4719 ch = ((((ch & 0x03FF) << 10) |
4720 ((Py_UCS4)p[collpos++] & 0x03FF)) + 0x10000);
4721 }
4722#endif
4723 sprintf(buffer, "&#%d;", (int)ch);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004724 for (cp = buffer; *cp; ++cp) {
4725 x = charmapencode_output(*cp, mapping, res, respos);
4726 if (x==enc_EXCEPTION)
4727 return -1;
4728 else if (x==enc_FAILED) {
4729 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4730 return -1;
4731 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004732 }
4733 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004734 *inpos = collendpos;
4735 break;
4736 default:
4737 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004738 encoding, reason, p, size, exceptionObject,
4739 collstartpos, collendpos, &newpos);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004740 if (repunicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004741 return -1;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004742 /* generate replacement */
4743 repsize = PyUnicode_GET_SIZE(repunicode);
4744 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004745 x = charmapencode_output(*uni2, mapping, res, respos);
4746 if (x==enc_EXCEPTION) {
4747 return -1;
4748 }
4749 else if (x==enc_FAILED) {
4750 Py_DECREF(repunicode);
4751 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4752 return -1;
4753 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004754 }
4755 *inpos = newpos;
4756 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004757 }
4758 return 0;
4759}
4760
Guido van Rossumd57fd912000-03-10 22:53:23 +00004761PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004762 Py_ssize_t size,
4763 PyObject *mapping,
4764 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004765{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004766 /* output object */
4767 PyObject *res = NULL;
4768 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004769 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004770 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004771 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004772 PyObject *errorHandler = NULL;
4773 PyObject *exc = NULL;
4774 /* the following variable is used for caching string comparisons
4775 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4776 * 3=ignore, 4=xmlcharrefreplace */
4777 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004778
4779 /* Default to Latin-1 */
4780 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004781 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004782
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004783 /* allocate enough for a simple encoding without
4784 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004785 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004786 if (res == NULL)
4787 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004788 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004789 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004790
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004791 while (inpos<size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004792 /* try to encode it */
4793 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4794 if (x==enc_EXCEPTION) /* error */
4795 goto onError;
4796 if (x==enc_FAILED) { /* unencodable character */
4797 if (charmap_encoding_error(p, size, &inpos, mapping,
4798 &exc,
4799 &known_errorHandler, &errorHandler, errors,
4800 &res, &respos)) {
4801 goto onError;
4802 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004803 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004804 else
4805 /* done with this character => adjust input position */
4806 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004807 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004808
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004809 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004810 if (respos<PyString_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004811 if (_PyString_Resize(&res, respos))
4812 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004813 }
4814 Py_XDECREF(exc);
4815 Py_XDECREF(errorHandler);
4816 return res;
4817
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004818 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004819 Py_XDECREF(res);
4820 Py_XDECREF(exc);
4821 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004822 return NULL;
4823}
4824
4825PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004826 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004827{
4828 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004829 PyErr_BadArgument();
4830 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004831 }
4832 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004833 PyUnicode_GET_SIZE(unicode),
4834 mapping,
4835 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004836}
4837
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004838/* create or adjust a UnicodeTranslateError */
4839static void make_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004840 const Py_UNICODE *unicode, Py_ssize_t size,
4841 Py_ssize_t startpos, Py_ssize_t endpos,
4842 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004843{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004844 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004845 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004846 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004847 }
4848 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004849 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4850 goto onError;
4851 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4852 goto onError;
4853 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4854 goto onError;
4855 return;
4856 onError:
Serhiy Storchaka98a97222014-02-09 13:14:04 +02004857 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004858 }
4859}
4860
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004861/* raises a UnicodeTranslateError */
4862static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004863 const Py_UNICODE *unicode, Py_ssize_t size,
4864 Py_ssize_t startpos, Py_ssize_t endpos,
4865 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004866{
4867 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004868 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004869 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004870 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004871}
4872
4873/* error handling callback helper:
4874 build arguments, call the callback and check the arguments,
4875 put the result into newpos and return the replacement string, which
4876 has to be freed by the caller */
4877static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004878 PyObject **errorHandler,
4879 const char *reason,
4880 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4881 Py_ssize_t startpos, Py_ssize_t endpos,
4882 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004883{
Martin v. Löwis412fb672006-04-13 06:34:32 +00004884 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004885
Martin v. Löwis412fb672006-04-13 06:34:32 +00004886 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004887 PyObject *restuple;
4888 PyObject *resunicode;
4889
4890 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004891 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004892 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004893 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004894 }
4895
4896 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004897 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004898 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004899 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004900
4901 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004902 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004903 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004904 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004905 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00004906 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004907 Py_DECREF(restuple);
4908 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004909 }
4910 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004911 &resunicode, &i_newpos)) {
4912 Py_DECREF(restuple);
4913 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004914 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004915 if (i_newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004916 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004917 else
4918 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004919 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004920 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4921 Py_DECREF(restuple);
4922 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004923 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004924 Py_INCREF(resunicode);
4925 Py_DECREF(restuple);
4926 return resunicode;
4927}
4928
4929/* Lookup the character ch in the mapping and put the result in result,
4930 which must be decrefed by the caller.
4931 Return 0 on success, -1 on error */
4932static
4933int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4934{
4935 PyObject *w = PyInt_FromLong((long)c);
4936 PyObject *x;
4937
4938 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004939 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004940 x = PyObject_GetItem(mapping, w);
4941 Py_DECREF(w);
4942 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004943 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4944 /* No mapping found means: use 1:1 mapping. */
4945 PyErr_Clear();
4946 *result = NULL;
4947 return 0;
4948 } else
4949 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004950 }
4951 else if (x == Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004952 *result = x;
4953 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004954 }
4955 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004956 long value = PyInt_AS_LONG(x);
4957 long max = PyUnicode_GetMax();
4958 if (value < 0 || value > max) {
4959 PyErr_Format(PyExc_TypeError,
4960 "character mapping must be in range(0x%lx)", max+1);
4961 Py_DECREF(x);
4962 return -1;
4963 }
4964 *result = x;
4965 return 0;
4966 }
4967 else if (PyUnicode_Check(x)) {
4968 *result = x;
4969 return 0;
4970 }
4971 else {
4972 /* wrong return value */
4973 PyErr_SetString(PyExc_TypeError,
4974 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004975 Py_DECREF(x);
4976 return -1;
4977 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004978}
4979/* ensure that *outobj is at least requiredsize characters long,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004980 if not reallocate and adjust various state variables.
4981 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004982static
Walter Dörwald4894c302003-10-24 14:25:28 +00004983int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004984 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004985{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004986 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004987 if (requiredsize > oldsize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004988 /* remember old output position */
4989 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4990 /* exponentially overallocate to minimize reallocations */
4991 if (requiredsize < 2 * oldsize)
4992 requiredsize = 2 * oldsize;
4993 if (PyUnicode_Resize(outobj, requiredsize) < 0)
4994 return -1;
4995 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004996 }
4997 return 0;
4998}
4999/* lookup the character, put the result in the output string and adjust
5000 various state variables. Return a new reference to the object that
5001 was put in the output buffer in *result, or Py_None, if the mapping was
5002 undefined (in which case no character was written).
5003 The called must decref result.
5004 Return 0 on success, -1 on error. */
5005static
Walter Dörwald4894c302003-10-24 14:25:28 +00005006int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005007 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5008 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005009{
Walter Dörwald4894c302003-10-24 14:25:28 +00005010 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005011 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005012 if (*res==NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005013 /* not found => default to 1:1 mapping */
5014 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005015 }
5016 else if (*res==Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005017 ;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005018 else if (PyInt_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005019 /* no overflow check, because we know that the space is enough */
5020 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005021 }
5022 else if (PyUnicode_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005023 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5024 if (repsize==1) {
5025 /* no overflow check, because we know that the space is enough */
5026 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5027 }
5028 else if (repsize!=0) {
5029 /* more than one character */
5030 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5031 (insize - (curinp-startinp)) +
5032 repsize - 1;
5033 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5034 return -1;
5035 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5036 *outp += repsize;
5037 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005038 }
5039 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005040 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005041 return 0;
5042}
5043
5044PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005045 Py_ssize_t size,
5046 PyObject *mapping,
5047 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005048{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005049 /* output object */
5050 PyObject *res = NULL;
5051 /* pointers to the beginning and end+1 of input */
5052 const Py_UNICODE *startp = p;
5053 const Py_UNICODE *endp = p + size;
5054 /* pointer into the output */
5055 Py_UNICODE *str;
5056 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005057 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005058 char *reason = "character maps to <undefined>";
5059 PyObject *errorHandler = NULL;
5060 PyObject *exc = NULL;
5061 /* the following variable is used for caching string comparisons
5062 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5063 * 3=ignore, 4=xmlcharrefreplace */
5064 int known_errorHandler = -1;
5065
Guido van Rossumd57fd912000-03-10 22:53:23 +00005066 if (mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005067 PyErr_BadArgument();
5068 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005069 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005070
5071 /* allocate enough for a simple 1:1 translation without
5072 replacements, if we need more, we'll resize */
5073 res = PyUnicode_FromUnicode(NULL, size);
5074 if (res == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005075 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005076 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005077 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005078 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005079
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005080 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005081 /* try to encode it */
5082 PyObject *x = NULL;
5083 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5084 Py_XDECREF(x);
5085 goto onError;
5086 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005087 Py_XDECREF(x);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005088 if (x!=Py_None) /* it worked => adjust input pointer */
5089 ++p;
5090 else { /* untranslatable character */
5091 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5092 Py_ssize_t repsize;
5093 Py_ssize_t newpos;
5094 Py_UNICODE *uni2;
5095 /* startpos for collecting untranslatable chars */
5096 const Py_UNICODE *collstart = p;
5097 const Py_UNICODE *collend = p+1;
5098 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005099
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005100 /* find all untranslatable characters */
5101 while (collend < endp) {
5102 if (charmaptranslate_lookup(*collend, mapping, &x))
5103 goto onError;
5104 Py_XDECREF(x);
5105 if (x!=Py_None)
5106 break;
5107 ++collend;
5108 }
5109 /* cache callback name lookup
5110 * (if not done yet, i.e. it's the first error) */
5111 if (known_errorHandler==-1) {
5112 if ((errors==NULL) || (!strcmp(errors, "strict")))
5113 known_errorHandler = 1;
5114 else if (!strcmp(errors, "replace"))
5115 known_errorHandler = 2;
5116 else if (!strcmp(errors, "ignore"))
5117 known_errorHandler = 3;
5118 else if (!strcmp(errors, "xmlcharrefreplace"))
5119 known_errorHandler = 4;
5120 else
5121 known_errorHandler = 0;
5122 }
5123 switch (known_errorHandler) {
5124 case 1: /* strict */
5125 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson857ce152009-01-31 16:29:18 +00005126 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005127 case 2: /* replace */
5128 /* No need to check for space, this is a 1:1 replacement */
5129 for (coll = collstart; coll<collend; ++coll)
5130 *str++ = '?';
5131 /* fall through */
5132 case 3: /* ignore */
5133 p = collend;
5134 break;
5135 case 4: /* xmlcharrefreplace */
5136 /* generate replacement (temporarily (mis)uses p) */
Serhiy Storchakae822b032013-08-06 16:56:26 +03005137 for (p = collstart; p < collend;) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005138 char buffer[2+29+1+1];
5139 char *cp;
Serhiy Storchakae822b032013-08-06 16:56:26 +03005140 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
5141 sprintf(buffer, "&#%d;", (int)ch);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005142 if (charmaptranslate_makespace(&res, &str,
5143 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5144 goto onError;
5145 for (cp = buffer; *cp; ++cp)
5146 *str++ = *cp;
5147 }
5148 p = collend;
5149 break;
5150 default:
5151 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5152 reason, startp, size, &exc,
5153 collstart-startp, collend-startp, &newpos);
5154 if (repunicode == NULL)
5155 goto onError;
5156 /* generate replacement */
5157 repsize = PyUnicode_GET_SIZE(repunicode);
5158 if (charmaptranslate_makespace(&res, &str,
5159 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5160 Py_DECREF(repunicode);
5161 goto onError;
5162 }
5163 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5164 *str++ = *uni2;
5165 p = startp + newpos;
5166 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00005167 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005168 }
5169 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005170 /* Resize if we allocated to much */
5171 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005172 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005173 if (PyUnicode_Resize(&res, respos) < 0)
5174 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005175 }
5176 Py_XDECREF(exc);
5177 Py_XDECREF(errorHandler);
5178 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005179
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005180 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005181 Py_XDECREF(res);
5182 Py_XDECREF(exc);
5183 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005184 return NULL;
5185}
5186
5187PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005188 PyObject *mapping,
5189 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005190{
5191 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005192
Guido van Rossumd57fd912000-03-10 22:53:23 +00005193 str = PyUnicode_FromObject(str);
5194 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005195 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005196 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005197 PyUnicode_GET_SIZE(str),
5198 mapping,
5199 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005200 Py_DECREF(str);
5201 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005202
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005203 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005204 Py_XDECREF(str);
5205 return NULL;
5206}
Tim Petersced69f82003-09-16 20:30:58 +00005207
Guido van Rossum9e896b32000-04-05 20:11:21 +00005208/* --- Decimal Encoder ---------------------------------------------------- */
5209
5210int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005211 Py_ssize_t length,
5212 char *output,
5213 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005214{
5215 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005216 PyObject *errorHandler = NULL;
5217 PyObject *exc = NULL;
5218 const char *encoding = "decimal";
5219 const char *reason = "invalid decimal Unicode string";
5220 /* the following variable is used for caching string comparisons
5221 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5222 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005223
5224 if (output == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005225 PyErr_BadArgument();
5226 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005227 }
5228
5229 p = s;
5230 end = s + length;
5231 while (p < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005232 register Py_UNICODE ch = *p;
5233 int decimal;
5234 PyObject *repunicode;
5235 Py_ssize_t repsize;
5236 Py_ssize_t newpos;
5237 Py_UNICODE *uni2;
5238 Py_UNICODE *collstart;
5239 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005240
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005241 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005242 *output++ = ' ';
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005243 ++p;
5244 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005245 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005246 decimal = Py_UNICODE_TODECIMAL(ch);
5247 if (decimal >= 0) {
5248 *output++ = '0' + decimal;
5249 ++p;
5250 continue;
5251 }
5252 if (0 < ch && ch < 256) {
5253 *output++ = (char)ch;
5254 ++p;
5255 continue;
5256 }
5257 /* All other characters are considered unencodable */
5258 collstart = p;
Victor Stinner975134e2011-11-22 01:54:19 +01005259 for (collend = p+1; collend < end; collend++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005260 if ((0 < *collend && *collend < 256) ||
Victor Stinner975134e2011-11-22 01:54:19 +01005261 Py_UNICODE_ISSPACE(*collend) ||
5262 0 <= Py_UNICODE_TODECIMAL(*collend))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005263 break;
5264 }
5265 /* cache callback name lookup
5266 * (if not done yet, i.e. it's the first error) */
5267 if (known_errorHandler==-1) {
5268 if ((errors==NULL) || (!strcmp(errors, "strict")))
5269 known_errorHandler = 1;
5270 else if (!strcmp(errors, "replace"))
5271 known_errorHandler = 2;
5272 else if (!strcmp(errors, "ignore"))
5273 known_errorHandler = 3;
5274 else if (!strcmp(errors, "xmlcharrefreplace"))
5275 known_errorHandler = 4;
5276 else
5277 known_errorHandler = 0;
5278 }
5279 switch (known_errorHandler) {
5280 case 1: /* strict */
5281 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5282 goto onError;
5283 case 2: /* replace */
5284 for (p = collstart; p < collend; ++p)
5285 *output++ = '?';
5286 /* fall through */
5287 case 3: /* ignore */
5288 p = collend;
5289 break;
5290 case 4: /* xmlcharrefreplace */
5291 /* generate replacement (temporarily (mis)uses p) */
Serhiy Storchakae822b032013-08-06 16:56:26 +03005292 for (p = collstart; p < collend;) {
5293 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
5294 output += sprintf(output, "&#%d;", ch);
5295 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005296 p = collend;
5297 break;
5298 default:
5299 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5300 encoding, reason, s, length, &exc,
5301 collstart-s, collend-s, &newpos);
5302 if (repunicode == NULL)
5303 goto onError;
5304 /* generate replacement */
5305 repsize = PyUnicode_GET_SIZE(repunicode);
5306 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5307 Py_UNICODE ch = *uni2;
5308 if (Py_UNICODE_ISSPACE(ch))
5309 *output++ = ' ';
5310 else {
5311 decimal = Py_UNICODE_TODECIMAL(ch);
5312 if (decimal >= 0)
5313 *output++ = '0' + decimal;
5314 else if (0 < ch && ch < 256)
5315 *output++ = (char)ch;
5316 else {
5317 Py_DECREF(repunicode);
5318 raise_encode_exception(&exc, encoding,
5319 s, length, collstart-s, collend-s, reason);
5320 goto onError;
5321 }
5322 }
5323 }
5324 p = s + newpos;
5325 Py_DECREF(repunicode);
5326 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005327 }
5328 /* 0-terminate the output string */
5329 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005330 Py_XDECREF(exc);
5331 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005332 return 0;
5333
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005334 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005335 Py_XDECREF(exc);
5336 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005337 return -1;
5338}
5339
Guido van Rossumd57fd912000-03-10 22:53:23 +00005340/* --- Helpers ------------------------------------------------------------ */
5341
Eric Smitha9f7d622008-02-17 19:46:49 +00005342#include "stringlib/unicodedefs.h"
Fredrik Lundha50d2012006-05-26 17:04:58 +00005343#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005344
5345#include "stringlib/count.h"
5346#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005347#include "stringlib/partition.h"
Antoine Pitrou64672132010-01-13 07:55:48 +00005348#include "stringlib/split.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005349
/* Helper macro to fix up start/end slice values, in place, for a
   sequence of `len` items:
     - `end` larger than `len` is clamped to `len`;
     - negative `end` / `start` count from the end and are clamped to 0
       if still negative.
   `start` is not clamped against `len`, so callers must cope with
   start > end.  `start` and `end` must be modifiable lvalues.

   Wrapped in do { } while (0) so that an invocation followed by ';'
   is exactly one statement and is safe in unbraced if/else bodies. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Fredrik Lundhc8162812006-05-26 19:33:03 +00005364
Martin v. Löwis18e16552006-02-15 17:27:45 +00005365Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005366 PyObject *substr,
5367 Py_ssize_t start,
5368 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005369{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005370 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005371 PyUnicodeObject* str_obj;
5372 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005373
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005374 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5375 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005376 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005377 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5378 if (!sub_obj) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005379 Py_DECREF(str_obj);
5380 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005381 }
Tim Petersced69f82003-09-16 20:30:58 +00005382
Antoine Pitrou64672132010-01-13 07:55:48 +00005383 ADJUST_INDICES(start, end, str_obj->length);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005384 result = stringlib_count(
Antoine Pitrou64672132010-01-13 07:55:48 +00005385 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
5386 PY_SSIZE_T_MAX
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005387 );
5388
5389 Py_DECREF(sub_obj);
5390 Py_DECREF(str_obj);
5391
Guido van Rossumd57fd912000-03-10 22:53:23 +00005392 return result;
5393}
5394
Martin v. Löwis18e16552006-02-15 17:27:45 +00005395Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005396 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005397 Py_ssize_t start,
5398 Py_ssize_t end,
5399 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005400{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005401 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005402
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005403 str = PyUnicode_FromObject(str);
5404 if (!str)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005405 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005406 sub = PyUnicode_FromObject(sub);
5407 if (!sub) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005408 Py_DECREF(str);
5409 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005410 }
Tim Petersced69f82003-09-16 20:30:58 +00005411
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005412 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005413 result = stringlib_find_slice(
5414 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5415 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5416 start, end
5417 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005418 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005419 result = stringlib_rfind_slice(
5420 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5421 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5422 start, end
5423 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005424
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005425 Py_DECREF(str);
5426 Py_DECREF(sub);
5427
Guido van Rossumd57fd912000-03-10 22:53:23 +00005428 return result;
5429}
5430
Tim Petersced69f82003-09-16 20:30:58 +00005431static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005432int tailmatch(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005433 PyUnicodeObject *substring,
5434 Py_ssize_t start,
5435 Py_ssize_t end,
5436 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005437{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005438 if (substring->length == 0)
5439 return 1;
5440
Antoine Pitrou64672132010-01-13 07:55:48 +00005441 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005442 end -= substring->length;
5443 if (end < start)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005444 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005445
5446 if (direction > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005447 if (Py_UNICODE_MATCH(self, end, substring))
5448 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005449 } else {
5450 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005451 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005452 }
5453
5454 return 0;
5455}
5456
Martin v. Löwis18e16552006-02-15 17:27:45 +00005457Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005458 PyObject *substr,
5459 Py_ssize_t start,
5460 Py_ssize_t end,
5461 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005462{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005463 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005464
Guido van Rossumd57fd912000-03-10 22:53:23 +00005465 str = PyUnicode_FromObject(str);
5466 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005467 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005468 substr = PyUnicode_FromObject(substr);
5469 if (substr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005470 Py_DECREF(str);
5471 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005472 }
Tim Petersced69f82003-09-16 20:30:58 +00005473
Guido van Rossumd57fd912000-03-10 22:53:23 +00005474 result = tailmatch((PyUnicodeObject *)str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005475 (PyUnicodeObject *)substr,
5476 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005477 Py_DECREF(str);
5478 Py_DECREF(substr);
5479 return result;
5480}
5481
Guido van Rossumd57fd912000-03-10 22:53:23 +00005482/* Apply fixfct filter to the Unicode object self and return a
5483 reference to the modified object */
5484
Tim Petersced69f82003-09-16 20:30:58 +00005485static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005486PyObject *fixup(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005487 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005488{
5489
5490 PyUnicodeObject *u;
5491
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005492 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005493 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005494 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005495
5496 Py_UNICODE_COPY(u->str, self->str, self->length);
5497
Tim Peters7a29bd52001-09-12 03:03:31 +00005498 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005499 /* fixfct should return TRUE if it modified the buffer. If
5500 FALSE, return a reference to the original buffer instead
5501 (to save space, not time) */
5502 Py_INCREF(self);
5503 Py_DECREF(u);
5504 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005505 }
5506 return (PyObject*) u;
5507}
5508
Tim Petersced69f82003-09-16 20:30:58 +00005509static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005510int fixupper(PyUnicodeObject *self)
5511{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005512 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005513 Py_UNICODE *s = self->str;
5514 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005515
Guido van Rossumd57fd912000-03-10 22:53:23 +00005516 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005517 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005518
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005519 ch = Py_UNICODE_TOUPPER(*s);
5520 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005521 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005522 *s = ch;
5523 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005524 s++;
5525 }
5526
5527 return status;
5528}
5529
Tim Petersced69f82003-09-16 20:30:58 +00005530static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005531int fixlower(PyUnicodeObject *self)
5532{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005533 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005534 Py_UNICODE *s = self->str;
5535 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005536
Guido van Rossumd57fd912000-03-10 22:53:23 +00005537 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005538 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005539
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005540 ch = Py_UNICODE_TOLOWER(*s);
5541 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005542 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005543 *s = ch;
5544 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005545 s++;
5546 }
5547
5548 return status;
5549}
5550
Tim Petersced69f82003-09-16 20:30:58 +00005551static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005552int fixswapcase(PyUnicodeObject *self)
5553{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005554 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005555 Py_UNICODE *s = self->str;
5556 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005557
Guido van Rossumd57fd912000-03-10 22:53:23 +00005558 while (len-- > 0) {
5559 if (Py_UNICODE_ISUPPER(*s)) {
5560 *s = Py_UNICODE_TOLOWER(*s);
5561 status = 1;
5562 } else if (Py_UNICODE_ISLOWER(*s)) {
5563 *s = Py_UNICODE_TOUPPER(*s);
5564 status = 1;
5565 }
5566 s++;
5567 }
5568
5569 return status;
5570}
5571
Tim Petersced69f82003-09-16 20:30:58 +00005572static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005573int fixcapitalize(PyUnicodeObject *self)
5574{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005575 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005576 Py_UNICODE *s = self->str;
5577 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005578
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005579 if (len == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005580 return 0;
Ezio Melotti15d6b652011-08-15 09:22:24 +03005581 if (!Py_UNICODE_ISUPPER(*s)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005582 *s = Py_UNICODE_TOUPPER(*s);
5583 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005584 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005585 s++;
5586 while (--len > 0) {
Ezio Melotti15d6b652011-08-15 09:22:24 +03005587 if (!Py_UNICODE_ISLOWER(*s)) {
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005588 *s = Py_UNICODE_TOLOWER(*s);
5589 status = 1;
5590 }
5591 s++;
5592 }
5593 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005594}
5595
5596static
5597int fixtitle(PyUnicodeObject *self)
5598{
5599 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5600 register Py_UNICODE *e;
5601 int previous_is_cased;
5602
5603 /* Shortcut for single character strings */
5604 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005605 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5606 if (*p != ch) {
5607 *p = ch;
5608 return 1;
5609 }
5610 else
5611 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005612 }
Tim Petersced69f82003-09-16 20:30:58 +00005613
Guido van Rossumd57fd912000-03-10 22:53:23 +00005614 e = p + PyUnicode_GET_SIZE(self);
5615 previous_is_cased = 0;
5616 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005617 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005618
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005619 if (previous_is_cased)
5620 *p = Py_UNICODE_TOLOWER(ch);
5621 else
5622 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005623
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005624 if (Py_UNICODE_ISLOWER(ch) ||
5625 Py_UNICODE_ISUPPER(ch) ||
5626 Py_UNICODE_ISTITLE(ch))
5627 previous_is_cased = 1;
5628 else
5629 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005630 }
5631 return 1;
5632}
5633
Tim Peters8ce9f162004-08-27 01:49:32 +00005634PyObject *
5635PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005636{
Tim Peters8ce9f162004-08-27 01:49:32 +00005637 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005638 const Py_UNICODE blank = ' ';
5639 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00005640 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005641 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00005642 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5643 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005644 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5645 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005646 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005647 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005648 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005649
Benjamin Peterson7a91bf82014-02-15 13:02:52 -05005650 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00005651 if (fseq == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005652 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005653 }
5654
Tim Peters91879ab2004-08-27 22:35:44 +00005655 /* Grrrr. A codec may be invoked to convert str objects to
5656 * Unicode, and so it's possible to call back into Python code
5657 * during PyUnicode_FromObject(), and so it's possible for a sick
5658 * codec to change the size of fseq (if seq is a list). Therefore
5659 * we have to keep refetching the size -- can't assume seqlen
5660 * is invariant.
5661 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005662 seqlen = PySequence_Fast_GET_SIZE(fseq);
5663 /* If empty sequence, return u"". */
5664 if (seqlen == 0) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005665 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5666 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00005667 }
5668 /* If singleton sequence with an exact Unicode, return that. */
5669 if (seqlen == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005670 item = PySequence_Fast_GET_ITEM(fseq, 0);
5671 if (PyUnicode_CheckExact(item)) {
5672 Py_INCREF(item);
5673 res = (PyUnicodeObject *)item;
5674 goto Done;
5675 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005676 }
5677
Tim Peters05eba1f2004-08-27 21:32:02 +00005678 /* At least two items to join, or one that isn't exact Unicode. */
5679 if (seqlen > 1) {
5680 /* Set up sep and seplen -- they're needed. */
Benjamin Peterson857ce152009-01-31 16:29:18 +00005681 if (separator == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005682 sep = &blank;
5683 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005684 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005685 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005686 internal_separator = PyUnicode_FromObject(separator);
5687 if (internal_separator == NULL)
5688 goto onError;
5689 sep = PyUnicode_AS_UNICODE(internal_separator);
5690 seplen = PyUnicode_GET_SIZE(internal_separator);
5691 /* In case PyUnicode_FromObject() mutated seq. */
5692 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005693 }
5694 }
5695
5696 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005697 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005698 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005699 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005700 res_p = PyUnicode_AS_UNICODE(res);
5701 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005702
Tim Peters05eba1f2004-08-27 21:32:02 +00005703 for (i = 0; i < seqlen; ++i) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005704 Py_ssize_t itemlen;
5705 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005706
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005707 item = PySequence_Fast_GET_ITEM(fseq, i);
5708 /* Convert item to Unicode. */
5709 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5710 PyErr_Format(PyExc_TypeError,
5711 "sequence item %zd: expected string or Unicode,"
5712 " %.80s found",
5713 i, Py_TYPE(item)->tp_name);
5714 goto onError;
5715 }
5716 item = PyUnicode_FromObject(item);
5717 if (item == NULL)
5718 goto onError;
5719 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005720
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005721 /* In case PyUnicode_FromObject() mutated seq. */
5722 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters91879ab2004-08-27 22:35:44 +00005723
Tim Peters8ce9f162004-08-27 01:49:32 +00005724 /* Make sure we have enough space for the separator and the item. */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005725 itemlen = PyUnicode_GET_SIZE(item);
5726 new_res_used = res_used + itemlen;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005727 if (new_res_used < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005728 goto Overflow;
5729 if (i < seqlen - 1) {
5730 new_res_used += seplen;
5731 if (new_res_used < 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00005732 goto Overflow;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005733 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005734 if (new_res_used > res_alloc) {
5735 /* double allocated size until it's big enough */
5736 do {
5737 res_alloc += res_alloc;
5738 if (res_alloc <= 0)
5739 goto Overflow;
5740 } while (new_res_used > res_alloc);
5741 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5742 Py_DECREF(item);
5743 goto onError;
5744 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005745 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005746 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005747
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005748 /* Copy item, and maybe the separator. */
5749 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5750 res_p += itemlen;
5751 if (i < seqlen - 1) {
5752 Py_UNICODE_COPY(res_p, sep, seplen);
5753 res_p += seplen;
5754 }
5755 Py_DECREF(item);
5756 res_used = new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005757 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005758
Tim Peters05eba1f2004-08-27 21:32:02 +00005759 /* Shrink res to match the used area; this probably can't fail,
5760 * but it's cheap to check.
5761 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005762 if (_PyUnicode_Resize(&res, res_used) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005763 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005764
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005765 Done:
Tim Peters8ce9f162004-08-27 01:49:32 +00005766 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005767 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005768 return (PyObject *)res;
5769
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005770 Overflow:
Tim Peters8ce9f162004-08-27 01:49:32 +00005771 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00005772 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005773 Py_DECREF(item);
5774 /* fall through */
5775
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005776 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005777 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005778 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005779 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005780 return NULL;
5781}
5782
Tim Petersced69f82003-09-16 20:30:58 +00005783static
5784PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005785 Py_ssize_t left,
5786 Py_ssize_t right,
5787 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005788{
5789 PyUnicodeObject *u;
5790
5791 if (left < 0)
5792 left = 0;
5793 if (right < 0)
5794 right = 0;
5795
Tim Peters7a29bd52001-09-12 03:03:31 +00005796 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005797 Py_INCREF(self);
5798 return self;
5799 }
5800
Neal Norwitze7d8be82008-07-31 17:17:14 +00005801 if (left > PY_SSIZE_T_MAX - self->length ||
5802 right > PY_SSIZE_T_MAX - (left + self->length)) {
5803 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5804 return NULL;
5805 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005806 u = _PyUnicode_New(left + self->length + right);
5807 if (u) {
5808 if (left)
5809 Py_UNICODE_FILL(u->str, fill, left);
5810 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5811 if (right)
5812 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5813 }
5814
5815 return u;
5816}
5817
Antoine Pitrou64672132010-01-13 07:55:48 +00005818PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005819{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005820 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005821
5822 string = PyUnicode_FromObject(string);
5823 if (string == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005824 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005825
Antoine Pitrou64672132010-01-13 07:55:48 +00005826 list = stringlib_splitlines(
5827 (PyObject*) string, PyUnicode_AS_UNICODE(string),
5828 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005829
5830 Py_DECREF(string);
5831 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005832}
5833
Tim Petersced69f82003-09-16 20:30:58 +00005834static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005835PyObject *split(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005836 PyUnicodeObject *substring,
5837 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005838{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005839 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005840 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005841
Guido van Rossumd57fd912000-03-10 22:53:23 +00005842 if (substring == NULL)
Antoine Pitrou64672132010-01-13 07:55:48 +00005843 return stringlib_split_whitespace(
5844 (PyObject*) self, self->str, self->length, maxcount
5845 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005846
Antoine Pitrou64672132010-01-13 07:55:48 +00005847 return stringlib_split(
5848 (PyObject*) self, self->str, self->length,
5849 substring->str, substring->length,
5850 maxcount
5851 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005852}
5853
Tim Petersced69f82003-09-16 20:30:58 +00005854static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005855PyObject *rsplit(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005856 PyUnicodeObject *substring,
5857 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005858{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005859 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005860 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005861
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005862 if (substring == NULL)
Antoine Pitrou64672132010-01-13 07:55:48 +00005863 return stringlib_rsplit_whitespace(
5864 (PyObject*) self, self->str, self->length, maxcount
5865 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005866
Antoine Pitrou64672132010-01-13 07:55:48 +00005867 return stringlib_rsplit(
5868 (PyObject*) self, self->str, self->length,
5869 substring->str, substring->length,
5870 maxcount
5871 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005872}
5873
5874static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005875PyObject *replace(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005876 PyUnicodeObject *str1,
5877 PyUnicodeObject *str2,
5878 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005879{
5880 PyUnicodeObject *u;
5881
5882 if (maxcount < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005883 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrou64672132010-01-13 07:55:48 +00005884 else if (maxcount == 0 || self->length == 0)
5885 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005886
Fredrik Lundh347ee272006-05-24 16:35:18 +00005887 if (str1->length == str2->length) {
Antoine Pitrou5c767c22010-01-13 08:55:20 +00005888 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005889 /* same length */
Antoine Pitrou64672132010-01-13 07:55:48 +00005890 if (str1->length == 0)
5891 goto nothing;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005892 if (str1->length == 1) {
5893 /* replace characters */
5894 Py_UNICODE u1, u2;
5895 if (!findchar(self->str, self->length, str1->str[0]))
5896 goto nothing;
5897 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5898 if (!u)
5899 return NULL;
5900 Py_UNICODE_COPY(u->str, self->str, self->length);
5901 u1 = str1->str[0];
5902 u2 = str2->str[0];
5903 for (i = 0; i < u->length; i++)
5904 if (u->str[i] == u1) {
5905 if (--maxcount < 0)
5906 break;
5907 u->str[i] = u2;
5908 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005909 } else {
Antoine Pitrou64672132010-01-13 07:55:48 +00005910 i = stringlib_find(
5911 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00005912 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005913 if (i < 0)
5914 goto nothing;
5915 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5916 if (!u)
5917 return NULL;
5918 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrou64672132010-01-13 07:55:48 +00005919
5920 /* change everything in-place, starting with this one */
5921 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5922 i += str1->length;
5923
5924 while ( --maxcount > 0) {
5925 i = stringlib_find(self->str+i, self->length-i,
5926 str1->str, str1->length,
5927 i);
5928 if (i == -1)
5929 break;
5930 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5931 i += str1->length;
5932 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005933 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005934 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005935
Brett Cannona7f13ee2010-05-04 01:16:51 +00005936 Py_ssize_t n, i, j;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005937 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005938 Py_UNICODE *p;
5939
5940 /* replace strings */
Antoine Pitrou64672132010-01-13 07:55:48 +00005941 n = stringlib_count(self->str, self->length, str1->str, str1->length,
5942 maxcount);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005943 if (n == 0)
5944 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005945 /* new_size = self->length + n * (str2->length - str1->length)); */
5946 delta = (str2->length - str1->length);
5947 if (delta == 0) {
5948 new_size = self->length;
5949 } else {
5950 product = n * (str2->length - str1->length);
5951 if ((product / (str2->length - str1->length)) != n) {
5952 PyErr_SetString(PyExc_OverflowError,
5953 "replace string is too long");
5954 return NULL;
5955 }
5956 new_size = self->length + product;
5957 if (new_size < 0) {
5958 PyErr_SetString(PyExc_OverflowError,
5959 "replace string is too long");
5960 return NULL;
5961 }
5962 }
5963 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005964 if (!u)
5965 return NULL;
5966 i = 0;
5967 p = u->str;
5968 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005969 while (n-- > 0) {
5970 /* look for next match */
Antoine Pitrou64672132010-01-13 07:55:48 +00005971 j = stringlib_find(self->str+i, self->length-i,
5972 str1->str, str1->length,
5973 i);
5974 if (j == -1)
5975 break;
5976 else if (j > i) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005977 /* copy unchanged part [i:j] */
5978 Py_UNICODE_COPY(p, self->str+i, j-i);
5979 p += j - i;
5980 }
5981 /* copy substitution string */
5982 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005983 Py_UNICODE_COPY(p, str2->str, str2->length);
5984 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005985 }
5986 i = j + str1->length;
5987 }
5988 if (i < self->length)
5989 /* copy tail [i:] */
5990 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005991 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005992 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00005993 while (n > 0) {
5994 Py_UNICODE_COPY(p, str2->str, str2->length);
5995 p += str2->length;
5996 if (--n <= 0)
5997 break;
5998 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00006000 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006001 }
6002 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006003 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00006004
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006005 nothing:
Fredrik Lundh347ee272006-05-24 16:35:18 +00006006 /* nothing to replace; return original string (when possible) */
6007 if (PyUnicode_CheckExact(self)) {
6008 Py_INCREF(self);
6009 return (PyObject *) self;
6010 }
6011 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006012}
6013
6014/* --- Unicode Object Methods --------------------------------------------- */
6015
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006016PyDoc_STRVAR(title__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006017 "S.title() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006018\n\
6019Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006020characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006021
6022static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006023unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006024{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006025 return fixup(self, fixtitle);
6026}
6027
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006028PyDoc_STRVAR(capitalize__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006029 "S.capitalize() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006030\n\
6031Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumaran5261b102010-07-05 12:04:07 +00006032have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006033
6034static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006035unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006036{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006037 return fixup(self, fixcapitalize);
6038}
6039
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Compiled out by the enclosing #if 0.

   Splits `self` with split(self, NULL, -1), replaces every list item by
   the result of fixup(item, fixcapitalize) and joins the list again with
   PyUnicode_Join(NULL, list).  Returns NULL when splitting, any fixup()
   call or the join fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *old_word = PyList_GET_ITEM(words, idx);
        PyObject *new_word = fixup((PyUnicodeObject *)old_word,
                                   fixcapitalize);
        if (new_word == NULL)
            goto done;          /* joined is still NULL */
        Py_DECREF(old_word);
        PyList_SET_ITEM(words, idx, new_word);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

  done:
    Py_DECREF(words);
    return joined;
}
#endif
6077
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006078/* Argument converter. Coerces to a single unicode character */
6079
6080static int
6081convert_uc(PyObject *obj, void *addr)
6082{
Benjamin Peterson857ce152009-01-31 16:29:18 +00006083 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6084 PyObject *uniobj;
6085 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006086
Benjamin Peterson857ce152009-01-31 16:29:18 +00006087 uniobj = PyUnicode_FromObject(obj);
6088 if (uniobj == NULL) {
6089 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006090 "The fill character cannot be converted to Unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00006091 return 0;
6092 }
6093 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6094 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006095 "The fill character must be exactly one character long");
Benjamin Peterson857ce152009-01-31 16:29:18 +00006096 Py_DECREF(uniobj);
6097 return 0;
6098 }
6099 unistr = PyUnicode_AS_UNICODE(uniobj);
6100 *fillcharloc = unistr[0];
6101 Py_DECREF(uniobj);
6102 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006103}
6104
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006105PyDoc_STRVAR(center__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006106 "S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006107\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006108Return S centered in a Unicode string of length width. Padding is\n\
6109done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006110
6111static PyObject *
6112unicode_center(PyUnicodeObject *self, PyObject *args)
6113{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006114 Py_ssize_t marg, left;
6115 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006116 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006117
Thomas Woutersde017742006-02-16 19:34:37 +00006118 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006119 return NULL;
6120
Tim Peters7a29bd52001-09-12 03:03:31 +00006121 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006122 Py_INCREF(self);
6123 return (PyObject*) self;
6124 }
6125
6126 marg = width - self->length;
6127 left = marg / 2 + (marg & width & 1);
6128
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006129 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006130}
6131
Marc-André Lemburge5034372000-08-08 08:04:29 +00006132#if 0
6133
6134/* This code should go into some future Unicode collation support
6135 module. The basic comparison should compare ordinals on a naive
Georg Brandl18187e22009-06-06 18:21:58 +00006136 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006137
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006138/* speedy UTF-16 code point order comparison */
6139/* gleaned from: */
6140/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6141
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006142static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006143{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006144 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006145 0, 0, 0, 0, 0, 0, 0, 0,
6146 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006147 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006148};
6149
Guido van Rossumd57fd912000-03-10 22:53:23 +00006150static int
6151unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6152{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006153 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006154
Guido van Rossumd57fd912000-03-10 22:53:23 +00006155 Py_UNICODE *s1 = str1->str;
6156 Py_UNICODE *s2 = str2->str;
6157
6158 len1 = str1->length;
6159 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006160
Guido van Rossumd57fd912000-03-10 22:53:23 +00006161 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006162 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006163
6164 c1 = *s1++;
6165 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006166
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006167 if (c1 > (1<<11) * 26)
6168 c1 += utf16Fixup[c1>>11];
6169 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006170 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006171 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006172
6173 if (c1 != c2)
6174 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006175
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006176 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006177 }
6178
6179 return (len1 < len2) ? -1 : (len1 != len2);
6180}
6181
Marc-André Lemburge5034372000-08-08 08:04:29 +00006182#else
6183
6184static int
6185unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6186{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006187 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006188
6189 Py_UNICODE *s1 = str1->str;
6190 Py_UNICODE *s2 = str2->str;
6191
6192 len1 = str1->length;
6193 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006194
Marc-André Lemburge5034372000-08-08 08:04:29 +00006195 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006196 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006197
Fredrik Lundh45714e92001-06-26 16:39:36 +00006198 c1 = *s1++;
6199 c2 = *s2++;
6200
6201 if (c1 != c2)
6202 return (c1 < c2) ? -1 : 1;
6203
Marc-André Lemburge5034372000-08-08 08:04:29 +00006204 len1--; len2--;
6205 }
6206
6207 return (len1 < len2) ? -1 : (len1 != len2);
6208}
6209
6210#endif
6211
Guido van Rossumd57fd912000-03-10 22:53:23 +00006212int PyUnicode_Compare(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006213 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006214{
6215 PyUnicodeObject *u = NULL, *v = NULL;
6216 int result;
6217
6218 /* Coerce the two arguments */
6219 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6220 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006221 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006222 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6223 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006224 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006225
Thomas Wouters7e474022000-07-16 12:04:32 +00006226 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006227 if (v == u) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006228 Py_DECREF(u);
6229 Py_DECREF(v);
6230 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006231 }
6232
6233 result = unicode_compare(u, v);
6234
6235 Py_DECREF(u);
6236 Py_DECREF(v);
6237 return result;
6238
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006239 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006240 Py_XDECREF(u);
6241 Py_XDECREF(v);
6242 return -1;
6243}
6244
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006245PyObject *PyUnicode_RichCompare(PyObject *left,
6246 PyObject *right,
6247 int op)
6248{
6249 int result;
6250
6251 result = PyUnicode_Compare(left, right);
6252 if (result == -1 && PyErr_Occurred())
6253 goto onError;
6254
6255 /* Convert the return value to a Boolean */
6256 switch (op) {
6257 case Py_EQ:
6258 result = (result == 0);
6259 break;
6260 case Py_NE:
6261 result = (result != 0);
6262 break;
6263 case Py_LE:
6264 result = (result <= 0);
6265 break;
6266 case Py_GE:
6267 result = (result >= 0);
6268 break;
6269 case Py_LT:
6270 result = (result == -1);
6271 break;
6272 case Py_GT:
6273 result = (result == 1);
6274 break;
6275 }
6276 return PyBool_FromLong(result);
6277
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006278 onError:
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006279
6280 /* Standard case
6281
6282 Type errors mean that PyUnicode_FromObject() could not convert
6283 one of the arguments (usually the right hand side) to Unicode,
6284 ie. we can't handle the comparison request. However, it is
6285 possible that the other object knows a comparison method, which
6286 is why we return Py_NotImplemented to give the other object a
6287 chance.
6288
6289 */
6290 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6291 PyErr_Clear();
6292 Py_INCREF(Py_NotImplemented);
6293 return Py_NotImplemented;
6294 }
6295 if (op != Py_EQ && op != Py_NE)
6296 return NULL;
6297
6298 /* Equality comparison.
6299
6300 This is a special case: we silence any PyExc_UnicodeDecodeError
6301 and instead turn it into a PyErr_UnicodeWarning.
6302
6303 */
6304 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6305 return NULL;
6306 PyErr_Clear();
Benjamin Peterson857ce152009-01-31 16:29:18 +00006307 if (PyErr_Warn(PyExc_UnicodeWarning,
6308 (op == Py_EQ) ?
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006309 "Unicode equal comparison "
6310 "failed to convert both arguments to Unicode - "
6311 "interpreting them as being unequal" :
6312 "Unicode unequal comparison "
6313 "failed to convert both arguments to Unicode - "
6314 "interpreting them as being unequal"
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006315 ) < 0)
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006316 return NULL;
6317 result = (op == Py_NE);
6318 return PyBool_FromLong(result);
6319}
6320
Guido van Rossum403d68b2000-03-13 15:55:09 +00006321int PyUnicode_Contains(PyObject *container,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006322 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006323{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006324 PyObject *str, *sub;
6325 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006326
6327 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006328 sub = PyUnicode_FromObject(element);
6329 if (!sub) {
Fredrik Lundh833bf942006-05-23 10:12:21 +00006330 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006331 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00006332
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006333 str = PyUnicode_FromObject(container);
6334 if (!str) {
6335 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00006336 return -1;
6337 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00006338
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006339 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00006340
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006341 Py_DECREF(str);
6342 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00006343
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006344 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006345}
6346
Guido van Rossumd57fd912000-03-10 22:53:23 +00006347/* Concat to string or Unicode object giving a new Unicode object. */
6348
6349PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006350 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006351{
6352 PyUnicodeObject *u = NULL, *v = NULL, *w;
6353
6354 /* Coerce the two arguments */
6355 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6356 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006357 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006358 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6359 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006360 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006361
6362 /* Shortcuts */
6363 if (v == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006364 Py_DECREF(v);
6365 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006366 }
6367 if (u == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006368 Py_DECREF(u);
6369 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006370 }
6371
6372 /* Concat the two Unicode strings */
6373 w = _PyUnicode_New(u->length + v->length);
6374 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006375 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006376 Py_UNICODE_COPY(w->str, u->str, u->length);
6377 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6378
6379 Py_DECREF(u);
6380 Py_DECREF(v);
6381 return (PyObject *)w;
6382
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006383 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006384 Py_XDECREF(u);
6385 Py_XDECREF(v);
6386 return NULL;
6387}
6388
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006389PyDoc_STRVAR(count__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006390 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006391\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00006392Return the number of non-overlapping occurrences of substring sub in\n\
6393Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006394interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006395
6396static PyObject *
6397unicode_count(PyUnicodeObject *self, PyObject *args)
6398{
6399 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006400 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006401 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006402 PyObject *result;
6403
Jesus Cea44e81682011-04-20 16:39:15 +02006404 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
6405 &start, &end))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006406 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006407
Antoine Pitrou64672132010-01-13 07:55:48 +00006408 ADJUST_INDICES(start, end, self->length);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006409 result = PyInt_FromSsize_t(
6410 stringlib_count(self->str + start, end - start,
Antoine Pitrou64672132010-01-13 07:55:48 +00006411 substring->str, substring->length,
6412 PY_SSIZE_T_MAX)
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006413 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006414
6415 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006416
Guido van Rossumd57fd912000-03-10 22:53:23 +00006417 return result;
6418}
6419
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006420PyDoc_STRVAR(encode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006421 "S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006422\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006423Encodes S using the codec registered for encoding. encoding defaults\n\
6424to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006425handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006426a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6427'xmlcharrefreplace' as well as any other name registered with\n\
6428codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006429
6430static PyObject *
Benjamin Peterson332d7212009-09-18 21:14:55 +00006431unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006432{
Benjamin Peterson332d7212009-09-18 21:14:55 +00006433 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00006434 char *encoding = NULL;
6435 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006436 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006437
Benjamin Peterson332d7212009-09-18 21:14:55 +00006438 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
6439 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006440 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006441 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006442 if (v == NULL)
6443 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006444 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006445 PyErr_Format(PyExc_TypeError,
6446 "encoder did not return a string/unicode object "
6447 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006448 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006449 Py_DECREF(v);
6450 return NULL;
6451 }
6452 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006453
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006454 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006455 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006456}
6457
6458PyDoc_STRVAR(decode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006459 "S.decode([encoding[,errors]]) -> string or unicode\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006460\n\
6461Decodes S using the codec registered for encoding. encoding defaults\n\
6462to the default encoding. errors may be given to set a different error\n\
6463handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6464a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
Éric Araujoa4c81b02012-02-20 02:07:31 +01006465as well as any other name registered with codecs.register_error that is\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006466able to handle UnicodeDecodeErrors.");
6467
6468static PyObject *
Benjamin Peterson332d7212009-09-18 21:14:55 +00006469unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006470{
Benjamin Peterson332d7212009-09-18 21:14:55 +00006471 static char *kwlist[] = {"encoding", "errors", 0};
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006472 char *encoding = NULL;
6473 char *errors = NULL;
6474 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006475
Benjamin Peterson332d7212009-09-18 21:14:55 +00006476 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",
6477 kwlist, &encoding, &errors))
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006478 return NULL;
6479 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006480 if (v == NULL)
6481 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006482 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006483 PyErr_Format(PyExc_TypeError,
6484 "decoder did not return a string/unicode object "
6485 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006486 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006487 Py_DECREF(v);
6488 return NULL;
6489 }
6490 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006491
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006492 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006493 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006494}
6495
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006496PyDoc_STRVAR(expandtabs__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006497 "S.expandtabs([tabsize]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006498\n\
6499Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006500If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006501
6502static PyObject*
6503unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6504{
6505 Py_UNICODE *e;
6506 Py_UNICODE *p;
6507 Py_UNICODE *q;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006508 Py_UNICODE *qe;
6509 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006510 PyUnicodeObject *u;
6511 int tabsize = 8;
6512
6513 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006514 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006515
Thomas Wouters7e474022000-07-16 12:04:32 +00006516 /* First pass: determine size of output string */
Guido van Rossum5bdff602008-03-11 21:18:06 +00006517 i = 0; /* chars up to and including most recent \n or \r */
6518 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6519 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006520 for (p = self->str; p < e; p++)
6521 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006522 if (tabsize > 0) {
6523 incr = tabsize - (j % tabsize); /* cannot overflow */
6524 if (j > PY_SSIZE_T_MAX - incr)
6525 goto overflow1;
6526 j += incr;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006527 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006528 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006529 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006530 if (j > PY_SSIZE_T_MAX - 1)
6531 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006532 j++;
6533 if (*p == '\n' || *p == '\r') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006534 if (i > PY_SSIZE_T_MAX - j)
6535 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006536 i += j;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006537 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006538 }
6539 }
6540
Guido van Rossum5bdff602008-03-11 21:18:06 +00006541 if (i > PY_SSIZE_T_MAX - j)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006542 goto overflow1;
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006543
Guido van Rossumd57fd912000-03-10 22:53:23 +00006544 /* Second pass: create output string and fill it */
6545 u = _PyUnicode_New(i + j);
6546 if (!u)
6547 return NULL;
6548
Guido van Rossum5bdff602008-03-11 21:18:06 +00006549 j = 0; /* same as in first pass */
6550 q = u->str; /* next output char */
6551 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006552
6553 for (p = self->str; p < e; p++)
6554 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006555 if (tabsize > 0) {
6556 i = tabsize - (j % tabsize);
6557 j += i;
6558 while (i--) {
6559 if (q >= qe)
6560 goto overflow2;
6561 *q++ = ' ';
Guido van Rossum5bdff602008-03-11 21:18:06 +00006562 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006563 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00006564 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006565 else {
6566 if (q >= qe)
6567 goto overflow2;
6568 *q++ = *p;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006569 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006570 if (*p == '\n' || *p == '\r')
6571 j = 0;
6572 }
6573
6574 return (PyObject*) u;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006575
6576 overflow2:
6577 Py_DECREF(u);
6578 overflow1:
6579 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6580 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006581}
6582
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006583PyDoc_STRVAR(find__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006584 "S.find(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006585\n\
6586Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran5e3a19d2011-07-27 23:36:51 +08006587such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006588arguments start and end are interpreted as in slice notation.\n\
6589\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006590Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006591
6592static PyObject *
6593unicode_find(PyUnicodeObject *self, PyObject *args)
6594{
Jesus Cea44e81682011-04-20 16:39:15 +02006595 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006596 Py_ssize_t start;
6597 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006598 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006599
Jesus Cea44e81682011-04-20 16:39:15 +02006600 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
6601 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006602 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006603
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006604 result = stringlib_find_slice(
6605 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6606 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6607 start, end
6608 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006609
6610 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006611
6612 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006613}
6614
6615static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006616unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006617{
6618 if (index < 0 || index >= self->length) {
6619 PyErr_SetString(PyExc_IndexError, "string index out of range");
6620 return NULL;
6621 }
6622
6623 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6624}
6625
6626static long
6627unicode_hash(PyUnicodeObject *self)
6628{
Fredrik Lundhdde61642000-07-10 18:27:47 +00006629 /* Since Unicode objects compare equal to their ASCII string
6630 counterparts, they should use the individual character values
6631 as basis for their hash value. This is needed to assure that
6632 strings and Unicode objects behave in the same way as
6633 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006634
Martin v. Löwis18e16552006-02-15 17:27:45 +00006635 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006636 register Py_UNICODE *p;
6637 register long x;
6638
Benjamin Petersonf51c3842012-04-09 14:53:07 -04006639#ifdef Py_DEBUG
Benjamin Peterson26da9202012-02-21 11:08:50 -05006640 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf51c3842012-04-09 14:53:07 -04006641#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006642 if (self->hash != -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006643 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006644 len = PyUnicode_GET_SIZE(self);
Barry Warsaw1e13eb02012-02-20 20:42:21 -05006645 /*
6646 We make the hash of the empty string be 0, rather than using
6647 (prefix ^ suffix), since this slightly obfuscates the hash secret
6648 */
6649 if (len == 0) {
6650 self->hash = 0;
6651 return 0;
6652 }
Fredrik Lundhdde61642000-07-10 18:27:47 +00006653 p = PyUnicode_AS_UNICODE(self);
Barry Warsaw1e13eb02012-02-20 20:42:21 -05006654 x = _Py_HashSecret.prefix;
6655 x ^= *p << 7;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006656 while (--len >= 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006657 x = (1000003*x) ^ *p++;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006658 x ^= PyUnicode_GET_SIZE(self);
Barry Warsaw1e13eb02012-02-20 20:42:21 -05006659 x ^= _Py_HashSecret.suffix;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006660 if (x == -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006661 x = -2;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006662 self->hash = x;
6663 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006664}
6665
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006666PyDoc_STRVAR(index__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006667 "S.index(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006668\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006669Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006670
6671static PyObject *
6672unicode_index(PyUnicodeObject *self, PyObject *args)
6673{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006674 Py_ssize_t result;
Jesus Cea44e81682011-04-20 16:39:15 +02006675 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006676 Py_ssize_t start;
6677 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006678
Jesus Cea44e81682011-04-20 16:39:15 +02006679 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
6680 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006681 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006682
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006683 result = stringlib_find_slice(
6684 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6685 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6686 start, end
6687 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006688
6689 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006690
Guido van Rossumd57fd912000-03-10 22:53:23 +00006691 if (result < 0) {
6692 PyErr_SetString(PyExc_ValueError, "substring not found");
6693 return NULL;
6694 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006695
Martin v. Löwis18e16552006-02-15 17:27:45 +00006696 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006697}
6698
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006699PyDoc_STRVAR(islower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006700 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006701\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006702Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006703at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006704
6705static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006706unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006707{
6708 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6709 register const Py_UNICODE *e;
6710 int cased;
6711
Guido van Rossumd57fd912000-03-10 22:53:23 +00006712 /* Shortcut for single character strings */
6713 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006714 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006715
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006716 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006717 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006718 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006719
Guido van Rossumd57fd912000-03-10 22:53:23 +00006720 e = p + PyUnicode_GET_SIZE(self);
6721 cased = 0;
6722 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006723 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006724
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006725 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6726 return PyBool_FromLong(0);
6727 else if (!cased && Py_UNICODE_ISLOWER(ch))
6728 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006729 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006730 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006731}
6732
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006733PyDoc_STRVAR(isupper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006734 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006735\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006736Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006737at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006738
6739static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006740unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006741{
6742 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6743 register const Py_UNICODE *e;
6744 int cased;
6745
Guido van Rossumd57fd912000-03-10 22:53:23 +00006746 /* Shortcut for single character strings */
6747 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006748 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006749
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006750 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006751 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006752 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006753
Guido van Rossumd57fd912000-03-10 22:53:23 +00006754 e = p + PyUnicode_GET_SIZE(self);
6755 cased = 0;
6756 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006757 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006758
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006759 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6760 return PyBool_FromLong(0);
6761 else if (!cased && Py_UNICODE_ISUPPER(ch))
6762 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006763 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006764 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006765}
6766
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006767PyDoc_STRVAR(istitle__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006768 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006769\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006770Return True if S is a titlecased string and there is at least one\n\
6771character in S, i.e. upper- and titlecase characters may only\n\
6772follow uncased characters and lowercase characters only cased ones.\n\
6773Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006774
6775static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006776unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006777{
6778 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6779 register const Py_UNICODE *e;
6780 int cased, previous_is_cased;
6781
Guido van Rossumd57fd912000-03-10 22:53:23 +00006782 /* Shortcut for single character strings */
6783 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006784 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6785 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006786
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006787 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006788 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006789 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006790
Guido van Rossumd57fd912000-03-10 22:53:23 +00006791 e = p + PyUnicode_GET_SIZE(self);
6792 cased = 0;
6793 previous_is_cased = 0;
6794 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006795 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006796
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006797 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6798 if (previous_is_cased)
6799 return PyBool_FromLong(0);
6800 previous_is_cased = 1;
6801 cased = 1;
6802 }
6803 else if (Py_UNICODE_ISLOWER(ch)) {
6804 if (!previous_is_cased)
6805 return PyBool_FromLong(0);
6806 previous_is_cased = 1;
6807 cased = 1;
6808 }
6809 else
6810 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006811 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006812 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006813}
6814
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006815PyDoc_STRVAR(isspace__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006816 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006817\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006818Return True if all characters in S are whitespace\n\
6819and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006820
6821static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006822unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006823{
6824 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6825 register const Py_UNICODE *e;
6826
Guido van Rossumd57fd912000-03-10 22:53:23 +00006827 /* Shortcut for single character strings */
6828 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006829 Py_UNICODE_ISSPACE(*p))
6830 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006831
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006832 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006833 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006834 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006835
Guido van Rossumd57fd912000-03-10 22:53:23 +00006836 e = p + PyUnicode_GET_SIZE(self);
6837 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006838 if (!Py_UNICODE_ISSPACE(*p))
6839 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006840 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006841 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006842}
6843
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006844PyDoc_STRVAR(isalpha__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006845 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006846\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006847Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006848and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006849
6850static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006851unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006852{
6853 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6854 register const Py_UNICODE *e;
6855
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006856 /* Shortcut for single character strings */
6857 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006858 Py_UNICODE_ISALPHA(*p))
6859 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006860
6861 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006862 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006863 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006864
6865 e = p + PyUnicode_GET_SIZE(self);
6866 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006867 if (!Py_UNICODE_ISALPHA(*p))
6868 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006869 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006870 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006871}
6872
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006873PyDoc_STRVAR(isalnum__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006874 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006875\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006876Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006877and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006878
6879static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006880unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006881{
6882 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6883 register const Py_UNICODE *e;
6884
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006885 /* Shortcut for single character strings */
6886 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006887 Py_UNICODE_ISALNUM(*p))
6888 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006889
6890 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006891 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006892 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006893
6894 e = p + PyUnicode_GET_SIZE(self);
6895 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006896 if (!Py_UNICODE_ISALNUM(*p))
6897 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006898 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006899 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006900}
6901
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006902PyDoc_STRVAR(isdecimal__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006903 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006904\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006905Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006906False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006907
6908static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006909unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006910{
6911 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6912 register const Py_UNICODE *e;
6913
Guido van Rossumd57fd912000-03-10 22:53:23 +00006914 /* Shortcut for single character strings */
6915 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006916 Py_UNICODE_ISDECIMAL(*p))
6917 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006918
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006919 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006920 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006921 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006922
Guido van Rossumd57fd912000-03-10 22:53:23 +00006923 e = p + PyUnicode_GET_SIZE(self);
6924 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006925 if (!Py_UNICODE_ISDECIMAL(*p))
6926 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006927 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006928 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006929}
6930
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006931PyDoc_STRVAR(isdigit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006932 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006933\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006934Return True if all characters in S are digits\n\
6935and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006936
6937static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006938unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006939{
6940 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6941 register const Py_UNICODE *e;
6942
Guido van Rossumd57fd912000-03-10 22:53:23 +00006943 /* Shortcut for single character strings */
6944 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006945 Py_UNICODE_ISDIGIT(*p))
6946 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006947
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006948 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006949 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006950 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006951
Guido van Rossumd57fd912000-03-10 22:53:23 +00006952 e = p + PyUnicode_GET_SIZE(self);
6953 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006954 if (!Py_UNICODE_ISDIGIT(*p))
6955 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006956 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006957 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006958}
6959
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006960PyDoc_STRVAR(isnumeric__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006961 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006962\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006963Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006964False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006965
6966static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006967unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006968{
6969 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6970 register const Py_UNICODE *e;
6971
Guido van Rossumd57fd912000-03-10 22:53:23 +00006972 /* Shortcut for single character strings */
6973 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006974 Py_UNICODE_ISNUMERIC(*p))
6975 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006976
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006977 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006978 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006979 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006980
Guido van Rossumd57fd912000-03-10 22:53:23 +00006981 e = p + PyUnicode_GET_SIZE(self);
6982 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006983 if (!Py_UNICODE_ISNUMERIC(*p))
6984 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006985 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006986 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006987}
6988
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006989PyDoc_STRVAR(join__doc__,
Georg Brandl9b4e5822009-10-14 18:48:32 +00006990 "S.join(iterable) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006991\n\
6992Return a string which is the concatenation of the strings in the\n\
Georg Brandl9b4e5822009-10-14 18:48:32 +00006993iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006994
6995static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006996unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006997{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006998 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006999}
7000
Martin v. Löwis18e16552006-02-15 17:27:45 +00007001static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007002unicode_length(PyUnicodeObject *self)
7003{
7004 return self->length;
7005}
7006
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007007PyDoc_STRVAR(ljust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007008 "S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007009\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007010Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007011done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007012
7013static PyObject *
7014unicode_ljust(PyUnicodeObject *self, PyObject *args)
7015{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007016 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007017 Py_UNICODE fillchar = ' ';
7018
Martin v. Löwis412fb672006-04-13 06:34:32 +00007019 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007020 return NULL;
7021
Tim Peters7a29bd52001-09-12 03:03:31 +00007022 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007023 Py_INCREF(self);
7024 return (PyObject*) self;
7025 }
7026
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007027 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007028}
7029
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007030PyDoc_STRVAR(lower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007031 "S.lower() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007032\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007033Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007034
7035static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007036unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007037{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007038 return fixup(self, fixlower);
7039}
7040
/* Which end(s) of the string a strip operation trims.  The values double
   as indices into stripformat[] below, so they must stay 0, 1, 2. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Bare method name for a strip type: skips the 3-character "|O:" prefix
   of the corresponding format string.  The argument is parenthesized so
   that any expression can be passed safely. */
#define STRIPNAME(i) (stripformat[(i)]+3)
7049
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007050/* externally visible for str.strip(unicode) */
7051PyObject *
7052_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7053{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007054 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7055 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7056 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7057 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7058 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007059
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007060 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007061
Benjamin Peterson857ce152009-01-31 16:29:18 +00007062 i = 0;
7063 if (striptype != RIGHTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007064 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7065 i++;
7066 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00007067 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007068
Benjamin Peterson857ce152009-01-31 16:29:18 +00007069 j = len;
7070 if (striptype != LEFTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007071 do {
7072 j--;
7073 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7074 j++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007075 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007076
Benjamin Peterson857ce152009-01-31 16:29:18 +00007077 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007078 Py_INCREF(self);
7079 return (PyObject*)self;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007080 }
7081 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007082 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007083}
7084
Guido van Rossumd57fd912000-03-10 22:53:23 +00007085
7086static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007087do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007088{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007089 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7090 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007091
Benjamin Peterson857ce152009-01-31 16:29:18 +00007092 i = 0;
7093 if (striptype != RIGHTSTRIP) {
7094 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7095 i++;
7096 }
7097 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007098
Benjamin Peterson857ce152009-01-31 16:29:18 +00007099 j = len;
7100 if (striptype != LEFTSTRIP) {
7101 do {
7102 j--;
7103 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7104 j++;
7105 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007106
Benjamin Peterson857ce152009-01-31 16:29:18 +00007107 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7108 Py_INCREF(self);
7109 return (PyObject*)self;
7110 }
7111 else
7112 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007113}
7114
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007115
7116static PyObject *
7117do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7118{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007119 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007120
Benjamin Peterson857ce152009-01-31 16:29:18 +00007121 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7122 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007123
Benjamin Peterson857ce152009-01-31 16:29:18 +00007124 if (sep != NULL && sep != Py_None) {
7125 if (PyUnicode_Check(sep))
7126 return _PyUnicode_XStrip(self, striptype, sep);
7127 else if (PyString_Check(sep)) {
7128 PyObject *res;
7129 sep = PyUnicode_FromObject(sep);
7130 if (sep==NULL)
7131 return NULL;
7132 res = _PyUnicode_XStrip(self, striptype, sep);
7133 Py_DECREF(sep);
7134 return res;
7135 }
7136 else {
7137 PyErr_Format(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007138 "%s arg must be None, unicode or str",
7139 STRIPNAME(striptype));
Benjamin Peterson857ce152009-01-31 16:29:18 +00007140 return NULL;
7141 }
7142 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007143
Benjamin Peterson857ce152009-01-31 16:29:18 +00007144 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007145}
7146
7147
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007148PyDoc_STRVAR(strip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007149 "S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007150\n\
7151Return a copy of the string S with leading and trailing\n\
7152whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007153If chars is given and not None, remove characters in chars instead.\n\
7154If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007155
7156static PyObject *
7157unicode_strip(PyUnicodeObject *self, PyObject *args)
7158{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007159 if (PyTuple_GET_SIZE(args) == 0)
7160 return do_strip(self, BOTHSTRIP); /* Common case */
7161 else
7162 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007163}
7164
7165
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007166PyDoc_STRVAR(lstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007167 "S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007168\n\
7169Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007170If chars is given and not None, remove characters in chars instead.\n\
7171If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007172
7173static PyObject *
7174unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7175{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007176 if (PyTuple_GET_SIZE(args) == 0)
7177 return do_strip(self, LEFTSTRIP); /* Common case */
7178 else
7179 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007180}
7181
7182
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007183PyDoc_STRVAR(rstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007184 "S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007185\n\
7186Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007187If chars is given and not None, remove characters in chars instead.\n\
7188If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007189
7190static PyObject *
7191unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7192{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007193 if (PyTuple_GET_SIZE(args) == 0)
7194 return do_strip(self, RIGHTSTRIP); /* Common case */
7195 else
7196 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007197}
7198
7199
Guido van Rossumd57fd912000-03-10 22:53:23 +00007200static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007201unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007202{
7203 PyUnicodeObject *u;
7204 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007205 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007206 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007207
7208 if (len < 0)
7209 len = 0;
7210
Tim Peters7a29bd52001-09-12 03:03:31 +00007211 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007212 /* no repeat, return original string */
7213 Py_INCREF(str);
7214 return (PyObject*) str;
7215 }
Tim Peters8f422462000-09-09 06:13:41 +00007216
7217 /* ensure # of chars needed doesn't overflow int and # of bytes
7218 * needed doesn't overflow size_t
7219 */
7220 nchars = len * str->length;
7221 if (len && nchars / len != str->length) {
7222 PyErr_SetString(PyExc_OverflowError,
7223 "repeated string is too long");
7224 return NULL;
7225 }
7226 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7227 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7228 PyErr_SetString(PyExc_OverflowError,
7229 "repeated string is too long");
7230 return NULL;
7231 }
7232 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007233 if (!u)
7234 return NULL;
7235
7236 p = u->str;
7237
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007238 if (str->length == 1 && len > 0) {
7239 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007240 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007241 Py_ssize_t done = 0; /* number of characters copied this far */
7242 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007243 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007244 done = str->length;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007245 }
7246 while (done < nchars) {
Neal Norwitz4677fbf72008-03-25 04:18:18 +00007247 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007248 Py_UNICODE_COPY(p+done, p, n);
7249 done += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007250 }
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007251 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007252
7253 return (PyObject*) u;
7254}
7255
7256PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007257 PyObject *subobj,
7258 PyObject *replobj,
7259 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007260{
7261 PyObject *self;
7262 PyObject *str1;
7263 PyObject *str2;
7264 PyObject *result;
7265
7266 self = PyUnicode_FromObject(obj);
7267 if (self == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007268 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007269 str1 = PyUnicode_FromObject(subobj);
7270 if (str1 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007271 Py_DECREF(self);
7272 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007273 }
7274 str2 = PyUnicode_FromObject(replobj);
7275 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007276 Py_DECREF(self);
7277 Py_DECREF(str1);
7278 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007279 }
Tim Petersced69f82003-09-16 20:30:58 +00007280 result = replace((PyUnicodeObject *)self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007281 (PyUnicodeObject *)str1,
7282 (PyUnicodeObject *)str2,
7283 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007284 Py_DECREF(self);
7285 Py_DECREF(str1);
7286 Py_DECREF(str2);
7287 return result;
7288}
7289
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007290PyDoc_STRVAR(replace__doc__,
Ezio Melotti2f06b782010-06-26 18:44:42 +00007291 "S.replace(old, new[, count]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007292\n\
7293Return a copy of S with all occurrences of substring\n\
Georg Brandl30fadc12008-05-30 07:54:16 +00007294old replaced by new. If the optional argument count is\n\
7295given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007296
7297static PyObject*
7298unicode_replace(PyUnicodeObject *self, PyObject *args)
7299{
7300 PyUnicodeObject *str1;
7301 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007302 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007303 PyObject *result;
7304
Martin v. Löwis18e16552006-02-15 17:27:45 +00007305 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007306 return NULL;
7307 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7308 if (str1 == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007309 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007310 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007311 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007312 Py_DECREF(str1);
7313 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007314 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007315
7316 result = replace(self, str1, str2, maxcount);
7317
7318 Py_DECREF(str1);
7319 Py_DECREF(str2);
7320 return result;
7321}
7322
7323static
7324PyObject *unicode_repr(PyObject *unicode)
7325{
7326 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007327 PyUnicode_GET_SIZE(unicode),
7328 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007329}
7330
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007331PyDoc_STRVAR(rfind__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007332 "S.rfind(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007333\n\
7334Return the highest index in S where substring sub is found,\n\
Senthil Kumaran5e3a19d2011-07-27 23:36:51 +08007335such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007336arguments start and end are interpreted as in slice notation.\n\
7337\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007338Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007339
7340static PyObject *
7341unicode_rfind(PyUnicodeObject *self, PyObject *args)
7342{
Jesus Cea44e81682011-04-20 16:39:15 +02007343 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007344 Py_ssize_t start;
7345 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007346 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007347
Jesus Cea44e81682011-04-20 16:39:15 +02007348 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
7349 &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007350 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007351
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007352 result = stringlib_rfind_slice(
7353 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7354 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7355 start, end
7356 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007357
7358 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007359
7360 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007361}
7362
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007363PyDoc_STRVAR(rindex__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007364 "S.rindex(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007365\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007366Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007367
7368static PyObject *
7369unicode_rindex(PyUnicodeObject *self, PyObject *args)
7370{
Jesus Cea44e81682011-04-20 16:39:15 +02007371 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007372 Py_ssize_t start;
7373 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007374 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007375
Jesus Cea44e81682011-04-20 16:39:15 +02007376 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
7377 &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007378 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007379
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007380 result = stringlib_rfind_slice(
7381 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7382 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7383 start, end
7384 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007385
7386 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007387
Guido van Rossumd57fd912000-03-10 22:53:23 +00007388 if (result < 0) {
7389 PyErr_SetString(PyExc_ValueError, "substring not found");
7390 return NULL;
7391 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007392 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007393}
7394
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007395PyDoc_STRVAR(rjust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007396 "S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007397\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007398Return S right-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007399done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007400
7401static PyObject *
7402unicode_rjust(PyUnicodeObject *self, PyObject *args)
7403{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007404 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007405 Py_UNICODE fillchar = ' ';
7406
Martin v. Löwis412fb672006-04-13 06:34:32 +00007407 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007408 return NULL;
7409
Tim Peters7a29bd52001-09-12 03:03:31 +00007410 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007411 Py_INCREF(self);
7412 return (PyObject*) self;
7413 }
7414
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007415 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007416}
7417
Guido van Rossumd57fd912000-03-10 22:53:23 +00007418static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007419unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007420{
7421 /* standard clamping */
7422 if (start < 0)
7423 start = 0;
7424 if (end < 0)
7425 end = 0;
7426 if (end > self->length)
7427 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007428 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007429 /* full slice, return original string */
7430 Py_INCREF(self);
7431 return (PyObject*) self;
7432 }
7433 if (start > end)
7434 start = end;
7435 /* copy slice */
7436 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007437 end - start);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007438}
7439
7440PyObject *PyUnicode_Split(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007441 PyObject *sep,
7442 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007443{
7444 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007445
Guido van Rossumd57fd912000-03-10 22:53:23 +00007446 s = PyUnicode_FromObject(s);
7447 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007448 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007449 if (sep != NULL) {
7450 sep = PyUnicode_FromObject(sep);
7451 if (sep == NULL) {
7452 Py_DECREF(s);
7453 return NULL;
7454 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007455 }
7456
7457 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7458
7459 Py_DECREF(s);
7460 Py_XDECREF(sep);
7461 return result;
7462}
7463
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007464PyDoc_STRVAR(split__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007465 "S.split([sep [,maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007466\n\
7467Return a list of the words in S, using sep as the\n\
7468delimiter string. If maxsplit is given, at most maxsplit\n\
Georg Brandldfb77db2008-05-11 09:11:40 +00007469splits are done. If sep is not specified or is None, any\n\
Georg Brandlecbbd942008-05-11 20:53:55 +00007470whitespace string is a separator and empty strings are\n\
7471removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007472
7473static PyObject*
7474unicode_split(PyUnicodeObject *self, PyObject *args)
7475{
7476 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007477 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007478
Martin v. Löwis18e16552006-02-15 17:27:45 +00007479 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007480 return NULL;
7481
7482 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007483 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007484 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007485 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007486 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007487 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007488}
7489
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007490PyObject *
7491PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7492{
7493 PyObject* str_obj;
7494 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007495 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00007496
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007497 str_obj = PyUnicode_FromObject(str_in);
7498 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007499 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007500 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00007501 if (!sep_obj) {
7502 Py_DECREF(str_obj);
7503 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007504 }
7505
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007506 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00007507 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7508 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7509 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007510
Fredrik Lundhb9479482006-05-26 17:22:38 +00007511 Py_DECREF(sep_obj);
7512 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007513
7514 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007515}
7516
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007517
7518PyObject *
7519PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7520{
7521 PyObject* str_obj;
7522 PyObject* sep_obj;
7523 PyObject* out;
7524
7525 str_obj = PyUnicode_FromObject(str_in);
7526 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007527 return NULL;
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007528 sep_obj = PyUnicode_FromObject(sep_in);
7529 if (!sep_obj) {
7530 Py_DECREF(str_obj);
7531 return NULL;
7532 }
7533
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007534 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007535 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7536 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7537 );
7538
7539 Py_DECREF(sep_obj);
7540 Py_DECREF(str_obj);
7541
7542 return out;
7543}
7544
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007545PyDoc_STRVAR(partition__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007546 "S.partition(sep) -> (head, sep, tail)\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007547\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007548Search for the separator sep in S, and return the part before it,\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007549the separator itself, and the part after it. If the separator is not\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007550found, return S and two empty strings.");
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007551
7552static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00007553unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007554{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007555 return PyUnicode_Partition((PyObject *)self, separator);
7556}
7557
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007558PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti1fafaab2010-01-25 11:24:37 +00007559 "S.rpartition(sep) -> (head, sep, tail)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007560\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007561Search for the separator sep in S, starting at the end of S, and return\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007562the part before it, the separator itself, and the part after it. If the\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007563separator is not found, return two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007564
7565static PyObject*
7566unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7567{
7568 return PyUnicode_RPartition((PyObject *)self, separator);
7569}
7570
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007571PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007572 PyObject *sep,
7573 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007574{
7575 PyObject *result;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007576
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007577 s = PyUnicode_FromObject(s);
7578 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007579 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007580 if (sep != NULL) {
7581 sep = PyUnicode_FromObject(sep);
7582 if (sep == NULL) {
7583 Py_DECREF(s);
7584 return NULL;
7585 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007586 }
7587
7588 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7589
7590 Py_DECREF(s);
7591 Py_XDECREF(sep);
7592 return result;
7593}
7594
7595PyDoc_STRVAR(rsplit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007596 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007597\n\
7598Return a list of the words in S, using sep as the\n\
7599delimiter string, starting at the end of the string and\n\
7600working to the front. If maxsplit is given, at most maxsplit\n\
7601splits are done. If sep is not specified, any whitespace string\n\
7602is a separator.");
7603
7604static PyObject*
7605unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7606{
7607 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007608 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007609
Martin v. Löwis18e16552006-02-15 17:27:45 +00007610 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007611 return NULL;
7612
7613 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007614 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007615 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007616 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007617 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007618 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007619}
7620
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007621PyDoc_STRVAR(splitlines__doc__,
Raymond Hettingeraad5b022012-06-02 01:42:58 -04007622 "S.splitlines(keepends=False) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007623\n\
7624Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007625Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007626is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007627
7628static PyObject*
7629unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7630{
Guido van Rossum86662912000-04-11 15:38:46 +00007631 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007632
Guido van Rossum86662912000-04-11 15:38:46 +00007633 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007634 return NULL;
7635
Guido van Rossum86662912000-04-11 15:38:46 +00007636 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007637}
7638
7639static
7640PyObject *unicode_str(PyUnicodeObject *self)
7641{
Fred Drakee4315f52000-05-09 19:53:39 +00007642 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007643}
7644
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007645PyDoc_STRVAR(swapcase__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007646 "S.swapcase() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007647\n\
7648Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007649and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007650
7651static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007652unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007653{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007654 return fixup(self, fixswapcase);
7655}
7656
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007657PyDoc_STRVAR(translate__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007658 "S.translate(table) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007659\n\
7660Return a copy of the string S, where all characters have been mapped\n\
7661through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007662Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7663Unmapped characters are left untouched. Characters mapped to None\n\
7664are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007665
7666static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007667unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007668{
Tim Petersced69f82003-09-16 20:30:58 +00007669 return PyUnicode_TranslateCharmap(self->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007670 self->length,
7671 table,
7672 "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007673}
7674
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007675PyDoc_STRVAR(upper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007676 "S.upper() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007677\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007678Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007679
7680static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007681unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007682{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007683 return fixup(self, fixupper);
7684}
7685
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007686PyDoc_STRVAR(zfill__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007687 "S.zfill(width) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007688\n\
Georg Brandl98064072008-09-09 19:26:00 +00007689Pad a numeric string S with zeros on the left, to fill a field\n\
7690of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007691
7692static PyObject *
7693unicode_zfill(PyUnicodeObject *self, PyObject *args)
7694{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007695 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007696 PyUnicodeObject *u;
7697
Martin v. Löwis18e16552006-02-15 17:27:45 +00007698 Py_ssize_t width;
7699 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007700 return NULL;
7701
7702 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007703 if (PyUnicode_CheckExact(self)) {
7704 Py_INCREF(self);
7705 return (PyObject*) self;
7706 }
7707 else
7708 return PyUnicode_FromUnicode(
7709 PyUnicode_AS_UNICODE(self),
7710 PyUnicode_GET_SIZE(self)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007711 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007712 }
7713
7714 fill = width - self->length;
7715
7716 u = pad(self, fill, 0, '0');
7717
Walter Dörwald068325e2002-04-15 13:36:47 +00007718 if (u == NULL)
7719 return NULL;
7720
Guido van Rossumd57fd912000-03-10 22:53:23 +00007721 if (u->str[fill] == '+' || u->str[fill] == '-') {
7722 /* move sign to beginning of string */
7723 u->str[0] = u->str[fill];
7724 u->str[fill] = '0';
7725 }
7726
7727 return (PyObject*) u;
7728}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007729
#if 0
/* Compiled out.  Would report the value of the file-level numfree counter
   as a Python int. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyInt_FromLong(count);
}
#endif
7737
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007738PyDoc_STRVAR(startswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007739 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007740\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007741Return True if S starts with the specified prefix, False otherwise.\n\
7742With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007743With optional end, stop comparing S at that position.\n\
7744prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007745
7746static PyObject *
7747unicode_startswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007748 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007749{
Georg Brandl24250812006-06-09 18:45:48 +00007750 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007751 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007752 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007753 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007754 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007755
Jesus Cea44e81682011-04-20 16:39:15 +02007756 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007757 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007758 if (PyTuple_Check(subobj)) {
7759 Py_ssize_t i;
7760 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7761 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007762 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007763 if (substring == NULL)
7764 return NULL;
7765 result = tailmatch(self, substring, start, end, -1);
7766 Py_DECREF(substring);
7767 if (result) {
7768 Py_RETURN_TRUE;
7769 }
7770 }
7771 /* nothing matched */
7772 Py_RETURN_FALSE;
7773 }
7774 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottie3685f62011-04-26 05:12:51 +03007775 if (substring == NULL) {
7776 if (PyErr_ExceptionMatches(PyExc_TypeError))
7777 PyErr_Format(PyExc_TypeError, "startswith first arg must be str, "
7778 "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007779 return NULL;
Ezio Melottie3685f62011-04-26 05:12:51 +03007780 }
Georg Brandl24250812006-06-09 18:45:48 +00007781 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007782 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007783 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007784}
7785
7786
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007787PyDoc_STRVAR(endswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007788 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007789\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007790Return True if S ends with the specified suffix, False otherwise.\n\
7791With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007792With optional end, stop comparing S at that position.\n\
7793suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007794
7795static PyObject *
7796unicode_endswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007797 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007798{
Georg Brandl24250812006-06-09 18:45:48 +00007799 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007800 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007801 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007802 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007803 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007804
Jesus Cea44e81682011-04-20 16:39:15 +02007805 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007806 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007807 if (PyTuple_Check(subobj)) {
7808 Py_ssize_t i;
7809 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7810 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007811 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007812 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007813 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007814 result = tailmatch(self, substring, start, end, +1);
7815 Py_DECREF(substring);
7816 if (result) {
7817 Py_RETURN_TRUE;
7818 }
7819 }
7820 Py_RETURN_FALSE;
7821 }
7822 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottie3685f62011-04-26 05:12:51 +03007823 if (substring == NULL) {
7824 if (PyErr_ExceptionMatches(PyExc_TypeError))
7825 PyErr_Format(PyExc_TypeError, "endswith first arg must be str, "
7826 "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007827 return NULL;
Ezio Melottie3685f62011-04-26 05:12:51 +03007828 }
Georg Brandl24250812006-06-09 18:45:48 +00007829 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007830 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007831 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007832}
7833
7834
Eric Smitha9f7d622008-02-17 19:46:49 +00007835/* Implements do_string_format, which is unicode because of stringlib */
7836#include "stringlib/string_format.h"
7837
7838PyDoc_STRVAR(format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007839 "S.format(*args, **kwargs) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007840\n\
Eric Smith6c840852010-11-06 19:43:44 +00007841Return a formatted version of S, using substitutions from args and kwargs.\n\
7842The substitutions are identified by braces ('{' and '}').");
Eric Smitha9f7d622008-02-17 19:46:49 +00007843
Eric Smithdc13b792008-05-30 18:10:04 +00007844static PyObject *
7845unicode__format__(PyObject *self, PyObject *args)
7846{
7847 PyObject *format_spec;
7848 PyObject *result = NULL;
7849 PyObject *tmp = NULL;
7850
7851 /* If 2.x, convert format_spec to the same type as value */
7852 /* This is to allow things like u''.format('') */
7853 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7854 goto done;
7855 if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7856 PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007857 "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
Eric Smithdc13b792008-05-30 18:10:04 +00007858 goto done;
7859 }
7860 tmp = PyObject_Unicode(format_spec);
7861 if (tmp == NULL)
7862 goto done;
7863 format_spec = tmp;
7864
7865 result = _PyUnicode_FormatAdvanced(self,
7866 PyUnicode_AS_UNICODE(format_spec),
7867 PyUnicode_GET_SIZE(format_spec));
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007868 done:
Eric Smithdc13b792008-05-30 18:10:04 +00007869 Py_XDECREF(tmp);
7870 return result;
7871}
7872
Eric Smitha9f7d622008-02-17 19:46:49 +00007873PyDoc_STRVAR(p_format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007874 "S.__format__(format_spec) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007875\n\
Eric Smith6c840852010-11-06 19:43:44 +00007876Return a formatted version of S as described by format_spec.");
Eric Smitha9f7d622008-02-17 19:46:49 +00007877
Robert Schuppenies901c9972008-06-10 10:10:31 +00007878static PyObject *
7879unicode__sizeof__(PyUnicodeObject *v)
7880{
Robert Schuppenies9be2ec12008-07-10 15:24:04 +00007881 return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7882 sizeof(Py_UNICODE) * (v->length + 1));
Robert Schuppenies901c9972008-06-10 10:10:31 +00007883}
7884
7885PyDoc_STRVAR(sizeof__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007886 "S.__sizeof__() -> size of S in memory, in bytes\n\
Robert Schuppenies901c9972008-06-10 10:10:31 +00007887\n\
7888");
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007889
7890static PyObject *
7891unicode_getnewargs(PyUnicodeObject *v)
7892{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007893 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007894}
7895
7896
Guido van Rossumd57fd912000-03-10 22:53:23 +00007897static PyMethodDef unicode_methods[] = {
Benjamin Peterson332d7212009-09-18 21:14:55 +00007898 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007899 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7900 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007901 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007902 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7903 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7904 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7905 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7906 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7907 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7908 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00007909 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007910 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7911 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7912 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007913 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Benjamin Peterson332d7212009-09-18 21:14:55 +00007914 {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007915/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7916 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7917 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7918 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007919 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007920 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007921 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007922 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007923 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7924 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7925 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7926 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7927 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7928 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7929 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7930 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7931 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7932 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7933 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7934 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7935 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7936 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007937 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smitha9f7d622008-02-17 19:46:49 +00007938 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
7939 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
7940 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
7941 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Robert Schuppenies901c9972008-06-10 10:10:31 +00007942 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007943#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007944 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007945#endif
7946
7947#if 0
7948 /* This one is just used for debugging the implementation. */
Christian Heimes5b970ad2008-02-06 13:33:44 +00007949 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007950#endif
7951
Benjamin Peterson857ce152009-01-31 16:29:18 +00007952 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007953 {NULL, NULL}
7954};
7955
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007956static PyObject *
7957unicode_mod(PyObject *v, PyObject *w)
7958{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007959 if (!PyUnicode_Check(v)) {
7960 Py_INCREF(Py_NotImplemented);
7961 return Py_NotImplemented;
7962 }
7963 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007964}
7965
7966static PyNumberMethods unicode_as_number = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007967 0, /*nb_add*/
7968 0, /*nb_subtract*/
7969 0, /*nb_multiply*/
7970 0, /*nb_divide*/
7971 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007972};
7973
Guido van Rossumd57fd912000-03-10 22:53:23 +00007974static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007975 (lenfunc) unicode_length, /* sq_length */
7976 PyUnicode_Concat, /* sq_concat */
7977 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7978 (ssizeargfunc) unicode_getitem, /* sq_item */
7979 (ssizessizeargfunc) unicode_slice, /* sq_slice */
7980 0, /* sq_ass_item */
7981 0, /* sq_ass_slice */
7982 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007983};
7984
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007985static PyObject*
7986unicode_subscript(PyUnicodeObject* self, PyObject* item)
7987{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00007988 if (PyIndex_Check(item)) {
7989 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007990 if (i == -1 && PyErr_Occurred())
7991 return NULL;
7992 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007993 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007994 return unicode_getitem(self, i);
7995 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007996 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007997 Py_UNICODE* source_buf;
7998 Py_UNICODE* result_buf;
7999 PyObject* result;
8000
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008001 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008002 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008003 return NULL;
8004 }
8005
8006 if (slicelength <= 0) {
8007 return PyUnicode_FromUnicode(NULL, 0);
Thomas Wouters3ccec682007-08-28 15:28:19 +00008008 } else if (start == 0 && step == 1 && slicelength == self->length &&
8009 PyUnicode_CheckExact(self)) {
8010 Py_INCREF(self);
8011 return (PyObject *)self;
8012 } else if (step == 1) {
8013 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008014 } else {
8015 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Neal Norwitz419fd492008-03-17 20:22:43 +00008016 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8017 sizeof(Py_UNICODE));
Benjamin Peterson857ce152009-01-31 16:29:18 +00008018
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008019 if (result_buf == NULL)
8020 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008021
8022 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8023 result_buf[i] = source_buf[cur];
8024 }
Tim Petersced69f82003-09-16 20:30:58 +00008025
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008026 result = PyUnicode_FromUnicode(result_buf, slicelength);
Neal Norwitz419fd492008-03-17 20:22:43 +00008027 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008028 return result;
8029 }
8030 } else {
8031 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8032 return NULL;
8033 }
8034}
8035
8036static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00008037 (lenfunc)unicode_length, /* mp_length */
8038 (binaryfunc)unicode_subscript, /* mp_subscript */
8039 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008040};
8041
Martin v. Löwis18e16552006-02-15 17:27:45 +00008042static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008043unicode_buffer_getreadbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008044 Py_ssize_t index,
8045 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008046{
8047 if (index != 0) {
8048 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008049 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008050 return -1;
8051 }
8052 *ptr = (void *) self->str;
8053 return PyUnicode_GET_DATA_SIZE(self);
8054}
8055
Martin v. Löwis18e16552006-02-15 17:27:45 +00008056static Py_ssize_t
8057unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008058 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008059{
8060 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008061 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008062 return -1;
8063}
8064
8065static int
8066unicode_buffer_getsegcount(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008067 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008068{
8069 if (lenp)
8070 *lenp = PyUnicode_GET_DATA_SIZE(self);
8071 return 1;
8072}
8073
Martin v. Löwiseb079f12006-02-16 14:32:27 +00008074static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008075unicode_buffer_getcharbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008076 Py_ssize_t index,
8077 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008078{
8079 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00008080
Guido van Rossumd57fd912000-03-10 22:53:23 +00008081 if (index != 0) {
8082 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008083 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008084 return -1;
8085 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008086 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008087 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008088 return -1;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008089 *ptr = (void *) PyString_AS_STRING(str);
8090 return PyString_GET_SIZE(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008091}
8092
8093/* Helpers for PyUnicode_Format() */
8094
8095static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008096getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008097{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008098 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008099 if (argidx < arglen) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008100 (*p_argidx)++;
8101 if (arglen < 0)
8102 return args;
8103 else
8104 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008105 }
8106 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008107 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008108 return NULL;
8109}
8110
/* Bit flags describing the options seen in a format specifier. */
#define F_LJUST 0x01
#define F_SIGN  0x02
#define F_BLANK 0x04
#define F_ALT   0x08
#define F_ZERO  0x10
Guido van Rossumd57fd912000-03-10 22:53:23 +00008116
Martin v. Löwis18e16552006-02-15 17:27:45 +00008117static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008118strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008119{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008120 register Py_ssize_t i;
8121 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008122 for (i = len - 1; i >= 0; i--)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008123 buffer[i] = (Py_UNICODE) charbuffer[i];
Guido van Rossumd57fd912000-03-10 22:53:23 +00008124
Guido van Rossumd57fd912000-03-10 22:53:23 +00008125 return len;
8126}
8127
Neal Norwitzfc76d632006-01-10 06:03:13 +00008128static int
Neal Norwitzfc76d632006-01-10 06:03:13 +00008129longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8130{
Tim Peters15231542006-02-16 01:08:01 +00008131 Py_ssize_t result;
8132
Neal Norwitzfc76d632006-01-10 06:03:13 +00008133 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008134 result = strtounicode(buffer, (char *)buffer);
8135 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008136}
8137
Guido van Rossum078151d2002-08-11 04:24:12 +00008138/* XXX To save some code duplication, formatfloat/long/int could have been
8139 shared with stringobject.c, converting from 8-bit to Unicode after the
8140 formatting is done. */
8141
Mark Dickinson18cfada2009-11-23 18:46:41 +00008142/* Returns a new reference to a PyUnicode object, or NULL on failure. */
8143
8144static PyObject *
8145formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008146{
Mark Dickinson18cfada2009-11-23 18:46:41 +00008147 char *p;
8148 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008149 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008150
Guido van Rossumd57fd912000-03-10 22:53:23 +00008151 x = PyFloat_AsDouble(v);
8152 if (x == -1.0 && PyErr_Occurred())
Mark Dickinson18cfada2009-11-23 18:46:41 +00008153 return NULL;
8154
Guido van Rossumd57fd912000-03-10 22:53:23 +00008155 if (prec < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008156 prec = 6;
Mark Dickinsond4814bf2009-03-29 16:24:29 +00008157
Mark Dickinson18cfada2009-11-23 18:46:41 +00008158 p = PyOS_double_to_string(x, type, prec,
8159 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
8160 if (p == NULL)
8161 return NULL;
8162 result = PyUnicode_FromStringAndSize(p, strlen(p));
8163 PyMem_Free(p);
8164 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008165}
8166
Tim Peters38fd5b62000-09-21 05:43:11 +00008167static PyObject*
8168formatlong(PyObject *val, int flags, int prec, int type)
8169{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008170 char *buf;
8171 int i, len;
8172 PyObject *str; /* temporary string object. */
8173 PyUnicodeObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008174
Benjamin Peterson857ce152009-01-31 16:29:18 +00008175 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8176 if (!str)
8177 return NULL;
8178 result = _PyUnicode_New(len);
8179 if (!result) {
8180 Py_DECREF(str);
8181 return NULL;
8182 }
8183 for (i = 0; i < len; i++)
8184 result->str[i] = buf[i];
8185 result->str[len] = 0;
8186 Py_DECREF(str);
8187 return (PyObject*)result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008188}
8189
Guido van Rossumd57fd912000-03-10 22:53:23 +00008190static int
8191formatint(Py_UNICODE *buf,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008192 size_t buflen,
8193 int flags,
8194 int prec,
8195 int type,
8196 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008197{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008198 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008199 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8200 * + 1 + 1
8201 * = 24
8202 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008203 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008204 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008205 long x;
8206
8207 x = PyInt_AsLong(v);
8208 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008209 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008210 if (x < 0 && type == 'u') {
8211 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008212 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008213 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8214 sign = "-";
8215 else
8216 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008217 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008218 prec = 1;
8219
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008220 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8221 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008222 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008223 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008224 PyErr_SetString(PyExc_OverflowError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008225 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008226 return -1;
8227 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008228
8229 if ((flags & F_ALT) &&
8230 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00008231 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008232 * of issues that cause pain:
8233 * - when 0 is being converted, the C standard leaves off
8234 * the '0x' or '0X', which is inconsistent with other
8235 * %#x/%#X conversions and inconsistent with Python's
8236 * hex() function
8237 * - there are platforms that violate the standard and
8238 * convert 0 with the '0x' or '0X'
8239 * (Metrowerks, Compaq Tru64)
8240 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008241 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008242 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008243 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008244 * We can achieve the desired consistency by inserting our
8245 * own '0x' or '0X' prefix, and substituting %x/%X in place
8246 * of %#x/%#X.
8247 *
8248 * Note that this is the same approach as used in
8249 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008250 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008251 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8252 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008253 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008254 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008255 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8256 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008257 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008258 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008259 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008260 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008261 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008262 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008263}
8264
8265static int
8266formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008267 size_t buflen,
8268 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008269{
Ezio Melotti32125152010-02-25 17:36:04 +00008270 PyObject *unistr;
8271 char *str;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008272 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008273 if (PyUnicode_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008274 if (PyUnicode_GET_SIZE(v) != 1)
8275 goto onError;
8276 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008277 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008278
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008279 else if (PyString_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008280 if (PyString_GET_SIZE(v) != 1)
8281 goto onError;
Ezio Melotti32125152010-02-25 17:36:04 +00008282 /* #7649: "u'%c' % char" should behave like "u'%s' % char" and fail
8283 with a UnicodeDecodeError if 'char' is not decodable with the
8284 default encoding (usually ASCII, but it might be something else) */
8285 str = PyString_AS_STRING(v);
8286 if ((unsigned char)str[0] > 0x7F) {
8287 /* the char is not ASCII; try to decode the string using the
8288 default encoding and return -1 to let the UnicodeDecodeError
8289 be raised if the string can't be decoded */
8290 unistr = PyUnicode_Decode(str, 1, NULL, "strict");
8291 if (unistr == NULL)
8292 return -1;
8293 buf[0] = PyUnicode_AS_UNICODE(unistr)[0];
8294 Py_DECREF(unistr);
8295 }
8296 else
8297 buf[0] = (Py_UNICODE)str[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008298 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008299
8300 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008301 /* Integer input truncated to a character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008302 long x;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008303 x = PyInt_AsLong(v);
8304 if (x == -1 && PyErr_Occurred())
8305 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008306#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008307 if (x < 0 || x > 0x10ffff) {
8308 PyErr_SetString(PyExc_OverflowError,
8309 "%c arg not in range(0x110000) "
8310 "(wide Python build)");
8311 return -1;
8312 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008313#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008314 if (x < 0 || x > 0xffff) {
8315 PyErr_SetString(PyExc_OverflowError,
8316 "%c arg not in range(0x10000) "
8317 "(narrow Python build)");
8318 return -1;
8319 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008320#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008321 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008322 }
8323 buf[1] = '\0';
8324 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008325
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008326 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008327 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008328 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008329 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008330}
8331
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the ints &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the macro always behaves
   as a single operand (for example under `sizeof FORMATBUFLEN`).
*/
#define FORMATBUFLEN ((size_t)120)
8341
Guido van Rossumd57fd912000-03-10 22:53:23 +00008342PyObject *PyUnicode_Format(PyObject *format,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008343 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008344{
8345 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008346 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008347 int args_owned = 0;
8348 PyUnicodeObject *result = NULL;
8349 PyObject *dict = NULL;
8350 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008351
Guido van Rossumd57fd912000-03-10 22:53:23 +00008352 if (format == NULL || args == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008353 PyErr_BadInternalCall();
8354 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008355 }
8356 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008357 if (uformat == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008358 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008359 fmt = PyUnicode_AS_UNICODE(uformat);
8360 fmtcnt = PyUnicode_GET_SIZE(uformat);
8361
8362 reslen = rescnt = fmtcnt + 100;
8363 result = _PyUnicode_New(reslen);
8364 if (result == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008365 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008366 res = PyUnicode_AS_UNICODE(result);
8367
8368 if (PyTuple_Check(args)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008369 arglen = PyTuple_Size(args);
8370 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008371 }
8372 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008373 arglen = -1;
8374 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008375 }
Benjamin Petersonda2c7eb2013-03-23 22:32:00 -05008376 if (Py_TYPE(args)->tp_as_mapping && Py_TYPE(args)->tp_as_mapping->mp_subscript &&
8377 !PyTuple_Check(args) && !PyObject_TypeCheck(args, &PyBaseString_Type))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008378 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008379
8380 while (--fmtcnt >= 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008381 if (*fmt != '%') {
8382 if (--rescnt < 0) {
8383 rescnt = fmtcnt + 100;
8384 reslen += rescnt;
8385 if (_PyUnicode_Resize(&result, reslen) < 0)
8386 goto onError;
8387 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8388 --rescnt;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008389 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008390 *res++ = *fmt++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008391 }
8392 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008393 /* Got a format specifier */
8394 int flags = 0;
8395 Py_ssize_t width = -1;
8396 int prec = -1;
8397 Py_UNICODE c = '\0';
8398 Py_UNICODE fill;
8399 int isnumok;
8400 PyObject *v = NULL;
8401 PyObject *temp = NULL;
8402 Py_UNICODE *pbuf;
8403 Py_UNICODE sign;
8404 Py_ssize_t len;
Mark Dickinson18cfada2009-11-23 18:46:41 +00008405 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008406
8407 fmt++;
8408 if (*fmt == '(') {
8409 Py_UNICODE *keystart;
8410 Py_ssize_t keylen;
8411 PyObject *key;
8412 int pcount = 1;
8413
8414 if (dict == NULL) {
8415 PyErr_SetString(PyExc_TypeError,
8416 "format requires a mapping");
8417 goto onError;
8418 }
8419 ++fmt;
8420 --fmtcnt;
8421 keystart = fmt;
8422 /* Skip over balanced parentheses */
8423 while (pcount > 0 && --fmtcnt >= 0) {
8424 if (*fmt == ')')
8425 --pcount;
8426 else if (*fmt == '(')
8427 ++pcount;
8428 fmt++;
8429 }
8430 keylen = fmt - keystart - 1;
8431 if (fmtcnt < 0 || pcount > 0) {
8432 PyErr_SetString(PyExc_ValueError,
8433 "incomplete format key");
8434 goto onError;
8435 }
8436#if 0
8437 /* keys are converted to strings using UTF-8 and
8438 then looked up since Python uses strings to hold
8439 variables names etc. in its namespaces and we
8440 wouldn't want to break common idioms. */
8441 key = PyUnicode_EncodeUTF8(keystart,
8442 keylen,
8443 NULL);
8444#else
8445 key = PyUnicode_FromUnicode(keystart, keylen);
8446#endif
8447 if (key == NULL)
8448 goto onError;
8449 if (args_owned) {
8450 Py_DECREF(args);
8451 args_owned = 0;
8452 }
8453 args = PyObject_GetItem(dict, key);
8454 Py_DECREF(key);
8455 if (args == NULL) {
8456 goto onError;
8457 }
8458 args_owned = 1;
8459 arglen = -1;
8460 argidx = -2;
8461 }
8462 while (--fmtcnt >= 0) {
8463 switch (c = *fmt++) {
8464 case '-': flags |= F_LJUST; continue;
8465 case '+': flags |= F_SIGN; continue;
8466 case ' ': flags |= F_BLANK; continue;
8467 case '#': flags |= F_ALT; continue;
8468 case '0': flags |= F_ZERO; continue;
8469 }
8470 break;
8471 }
8472 if (c == '*') {
8473 v = getnextarg(args, arglen, &argidx);
8474 if (v == NULL)
8475 goto onError;
8476 if (!PyInt_Check(v)) {
8477 PyErr_SetString(PyExc_TypeError,
8478 "* wants int");
8479 goto onError;
8480 }
Serhiy Storchaka74f49ab2013-01-19 12:55:39 +02008481 width = PyInt_AsSsize_t(v);
8482 if (width == -1 && PyErr_Occurred())
8483 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008484 if (width < 0) {
8485 flags |= F_LJUST;
8486 width = -width;
8487 }
8488 if (--fmtcnt >= 0)
8489 c = *fmt++;
8490 }
8491 else if (c >= '0' && c <= '9') {
8492 width = c - '0';
8493 while (--fmtcnt >= 0) {
8494 c = *fmt++;
8495 if (c < '0' || c > '9')
8496 break;
Mark Dickinson75d36002012-10-28 10:00:46 +00008497 if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008498 PyErr_SetString(PyExc_ValueError,
8499 "width too big");
8500 goto onError;
8501 }
8502 width = width*10 + (c - '0');
8503 }
8504 }
8505 if (c == '.') {
8506 prec = 0;
8507 if (--fmtcnt >= 0)
8508 c = *fmt++;
8509 if (c == '*') {
8510 v = getnextarg(args, arglen, &argidx);
8511 if (v == NULL)
8512 goto onError;
8513 if (!PyInt_Check(v)) {
8514 PyErr_SetString(PyExc_TypeError,
8515 "* wants int");
8516 goto onError;
8517 }
Serhiy Storchaka74f49ab2013-01-19 12:55:39 +02008518 prec = _PyInt_AsInt(v);
8519 if (prec == -1 && PyErr_Occurred())
8520 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008521 if (prec < 0)
8522 prec = 0;
8523 if (--fmtcnt >= 0)
8524 c = *fmt++;
8525 }
8526 else if (c >= '0' && c <= '9') {
8527 prec = c - '0';
8528 while (--fmtcnt >= 0) {
Stefan Krah0b9201f2010-07-19 18:06:46 +00008529 c = *fmt++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008530 if (c < '0' || c > '9')
8531 break;
Mark Dickinson75d36002012-10-28 10:00:46 +00008532 if (prec > (INT_MAX - ((int)c - '0')) / 10) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008533 PyErr_SetString(PyExc_ValueError,
8534 "prec too big");
8535 goto onError;
8536 }
8537 prec = prec*10 + (c - '0');
8538 }
8539 }
8540 } /* prec */
8541 if (fmtcnt >= 0) {
8542 if (c == 'h' || c == 'l' || c == 'L') {
8543 if (--fmtcnt >= 0)
8544 c = *fmt++;
8545 }
8546 }
8547 if (fmtcnt < 0) {
8548 PyErr_SetString(PyExc_ValueError,
8549 "incomplete format");
8550 goto onError;
8551 }
8552 if (c != '%') {
8553 v = getnextarg(args, arglen, &argidx);
8554 if (v == NULL)
8555 goto onError;
8556 }
8557 sign = 0;
8558 fill = ' ';
8559 switch (c) {
8560
8561 case '%':
8562 pbuf = formatbuf;
8563 /* presume that buffer length is at least 1 */
8564 pbuf[0] = '%';
8565 len = 1;
8566 break;
8567
8568 case 's':
8569 case 'r':
Victor Stinner95affc42010-03-22 12:24:37 +00008570 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008571 temp = v;
8572 Py_INCREF(temp);
8573 }
8574 else {
8575 PyObject *unicode;
8576 if (c == 's')
8577 temp = PyObject_Unicode(v);
8578 else
8579 temp = PyObject_Repr(v);
8580 if (temp == NULL)
8581 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008582 if (PyUnicode_Check(temp))
8583 /* nothing to do */;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008584 else if (PyString_Check(temp)) {
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008585 /* convert to string to Unicode */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008586 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8587 PyString_GET_SIZE(temp),
8588 NULL,
8589 "strict");
8590 Py_DECREF(temp);
8591 temp = unicode;
8592 if (temp == NULL)
8593 goto onError;
8594 }
8595 else {
8596 Py_DECREF(temp);
8597 PyErr_SetString(PyExc_TypeError,
8598 "%s argument has non-string str()");
8599 goto onError;
8600 }
8601 }
8602 pbuf = PyUnicode_AS_UNICODE(temp);
8603 len = PyUnicode_GET_SIZE(temp);
8604 if (prec >= 0 && len > prec)
8605 len = prec;
8606 break;
8607
8608 case 'i':
8609 case 'd':
8610 case 'u':
8611 case 'o':
8612 case 'x':
8613 case 'X':
8614 if (c == 'i')
8615 c = 'd';
8616 isnumok = 0;
8617 if (PyNumber_Check(v)) {
8618 PyObject *iobj=NULL;
8619
8620 if (PyInt_Check(v) || (PyLong_Check(v))) {
8621 iobj = v;
8622 Py_INCREF(iobj);
8623 }
8624 else {
8625 iobj = PyNumber_Int(v);
8626 if (iobj==NULL) iobj = PyNumber_Long(v);
8627 }
8628 if (iobj!=NULL) {
8629 if (PyInt_Check(iobj)) {
8630 isnumok = 1;
8631 pbuf = formatbuf;
8632 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8633 flags, prec, c, iobj);
8634 Py_DECREF(iobj);
8635 if (len < 0)
8636 goto onError;
8637 sign = 1;
8638 }
8639 else if (PyLong_Check(iobj)) {
8640 isnumok = 1;
8641 temp = formatlong(iobj, flags, prec, c);
8642 Py_DECREF(iobj);
8643 if (!temp)
8644 goto onError;
8645 pbuf = PyUnicode_AS_UNICODE(temp);
8646 len = PyUnicode_GET_SIZE(temp);
8647 sign = 1;
8648 }
8649 else {
8650 Py_DECREF(iobj);
8651 }
8652 }
8653 }
8654 if (!isnumok) {
8655 PyErr_Format(PyExc_TypeError,
8656 "%%%c format: a number is required, "
8657 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8658 goto onError;
8659 }
8660 if (flags & F_ZERO)
8661 fill = '0';
8662 break;
8663
8664 case 'e':
8665 case 'E':
8666 case 'f':
8667 case 'F':
8668 case 'g':
8669 case 'G':
Mark Dickinson18cfada2009-11-23 18:46:41 +00008670 temp = formatfloat(v, flags, prec, c);
8671 if (temp == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008672 goto onError;
Mark Dickinson18cfada2009-11-23 18:46:41 +00008673 pbuf = PyUnicode_AS_UNICODE(temp);
8674 len = PyUnicode_GET_SIZE(temp);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008675 sign = 1;
8676 if (flags & F_ZERO)
8677 fill = '0';
8678 break;
8679
8680 case 'c':
8681 pbuf = formatbuf;
8682 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8683 if (len < 0)
8684 goto onError;
8685 break;
8686
8687 default:
8688 PyErr_Format(PyExc_ValueError,
8689 "unsupported format character '%c' (0x%x) "
8690 "at index %zd",
8691 (31<=c && c<=126) ? (char)c : '?',
8692 (int)c,
8693 (Py_ssize_t)(fmt - 1 -
8694 PyUnicode_AS_UNICODE(uformat)));
Benjamin Peterson857ce152009-01-31 16:29:18 +00008695 goto onError;
8696 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008697 if (sign) {
8698 if (*pbuf == '-' || *pbuf == '+') {
8699 sign = *pbuf++;
8700 len--;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008701 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008702 else if (flags & F_SIGN)
8703 sign = '+';
8704 else if (flags & F_BLANK)
8705 sign = ' ';
8706 else
8707 sign = 0;
8708 }
8709 if (width < len)
8710 width = len;
8711 if (rescnt - (sign != 0) < width) {
8712 reslen -= rescnt;
8713 rescnt = width + fmtcnt + 100;
8714 reslen += rescnt;
8715 if (reslen < 0) {
8716 Py_XDECREF(temp);
8717 PyErr_NoMemory();
8718 goto onError;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008719 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008720 if (_PyUnicode_Resize(&result, reslen) < 0) {
8721 Py_XDECREF(temp);
8722 goto onError;
8723 }
8724 res = PyUnicode_AS_UNICODE(result)
8725 + reslen - rescnt;
8726 }
8727 if (sign) {
8728 if (fill != ' ')
8729 *res++ = sign;
8730 rescnt--;
8731 if (width > len)
8732 width--;
8733 }
8734 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8735 assert(pbuf[0] == '0');
8736 assert(pbuf[1] == c);
8737 if (fill != ' ') {
8738 *res++ = *pbuf++;
8739 *res++ = *pbuf++;
8740 }
8741 rescnt -= 2;
8742 width -= 2;
8743 if (width < 0)
8744 width = 0;
8745 len -= 2;
8746 }
8747 if (width > len && !(flags & F_LJUST)) {
8748 do {
8749 --rescnt;
8750 *res++ = fill;
8751 } while (--width > len);
8752 }
8753 if (fill == ' ') {
8754 if (sign)
8755 *res++ = sign;
8756 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8757 assert(pbuf[0] == '0');
8758 assert(pbuf[1] == c);
8759 *res++ = *pbuf++;
8760 *res++ = *pbuf++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008761 }
8762 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008763 Py_UNICODE_COPY(res, pbuf, len);
8764 res += len;
8765 rescnt -= len;
8766 while (--width >= len) {
8767 --rescnt;
8768 *res++ = ' ';
8769 }
8770 if (dict && (argidx < arglen) && c != '%') {
8771 PyErr_SetString(PyExc_TypeError,
8772 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008773 Py_XDECREF(temp);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008774 goto onError;
8775 }
8776 Py_XDECREF(temp);
8777 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008778 } /* until end */
8779 if (argidx < arglen && !dict) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008780 PyErr_SetString(PyExc_TypeError,
8781 "not all arguments converted during string formatting");
8782 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008783 }
8784
Thomas Woutersa96affe2006-03-12 00:29:36 +00008785 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008786 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008787 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008788 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008789 }
8790 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008791 return (PyObject *)result;
8792
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008793 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008794 Py_XDECREF(result);
8795 Py_DECREF(uformat);
8796 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008797 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008798 }
8799 return NULL;
8800}
8801
8802static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008803 (readbufferproc) unicode_buffer_getreadbuf,
8804 (writebufferproc) unicode_buffer_getwritebuf,
8805 (segcountproc) unicode_buffer_getsegcount,
8806 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008807};
8808
Jeremy Hylton938ace62002-07-17 16:30:39 +00008809static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008810unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8811
Tim Peters6d6c1a32001-08-02 04:15:00 +00008812static PyObject *
8813unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8814{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008815 PyObject *x = NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008816 static char *kwlist[] = {"string", "encoding", "errors", 0};
8817 char *encoding = NULL;
8818 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00008819
Benjamin Peterson857ce152009-01-31 16:29:18 +00008820 if (type != &PyUnicode_Type)
8821 return unicode_subtype_new(type, args, kwds);
8822 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008823 kwlist, &x, &encoding, &errors))
Benjamin Peterson857ce152009-01-31 16:29:18 +00008824 return NULL;
8825 if (x == NULL)
8826 return (PyObject *)_PyUnicode_New(0);
8827 if (encoding == NULL && errors == NULL)
8828 return PyObject_Unicode(x);
8829 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008830 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008831}
8832
Guido van Rossume023fe02001-08-30 03:12:59 +00008833static PyObject *
8834unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8835{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008836 PyUnicodeObject *tmp, *pnew;
8837 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008838
Benjamin Peterson857ce152009-01-31 16:29:18 +00008839 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8840 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8841 if (tmp == NULL)
8842 return NULL;
8843 assert(PyUnicode_Check(tmp));
8844 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8845 if (pnew == NULL) {
8846 Py_DECREF(tmp);
8847 return NULL;
8848 }
8849 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8850 if (pnew->str == NULL) {
8851 _Py_ForgetReference((PyObject *)pnew);
8852 PyObject_Del(pnew);
8853 Py_DECREF(tmp);
8854 return PyErr_NoMemory();
8855 }
8856 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8857 pnew->length = n;
8858 pnew->hash = tmp->hash;
8859 Py_DECREF(tmp);
8860 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008861}
8862
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008863PyDoc_STRVAR(unicode_doc,
Chris Jerdonekad4b0002012-10-07 20:37:54 -07008864 "unicode(object='') -> unicode object\n\
8865unicode(string[, encoding[, errors]]) -> unicode object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00008866\n\
8867Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008868encoding defaults to the current default string encoding.\n\
8869errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008870
Guido van Rossumd57fd912000-03-10 22:53:23 +00008871PyTypeObject PyUnicode_Type = {
Martin v. Löwis68192102007-07-21 06:55:02 +00008872 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00008873 "unicode", /* tp_name */
8874 sizeof(PyUnicodeObject), /* tp_size */
8875 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008876 /* Slots */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008877 (destructor)unicode_dealloc, /* tp_dealloc */
8878 0, /* tp_print */
8879 0, /* tp_getattr */
8880 0, /* tp_setattr */
8881 0, /* tp_compare */
8882 unicode_repr, /* tp_repr */
8883 &unicode_as_number, /* tp_as_number */
8884 &unicode_as_sequence, /* tp_as_sequence */
8885 &unicode_as_mapping, /* tp_as_mapping */
8886 (hashfunc) unicode_hash, /* tp_hash*/
8887 0, /* tp_call*/
8888 (reprfunc) unicode_str, /* tp_str */
8889 PyObject_GenericGetAttr, /* tp_getattro */
8890 0, /* tp_setattro */
8891 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008892 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008893 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008894 unicode_doc, /* tp_doc */
8895 0, /* tp_traverse */
8896 0, /* tp_clear */
8897 PyUnicode_RichCompare, /* tp_richcompare */
8898 0, /* tp_weaklistoffset */
8899 0, /* tp_iter */
8900 0, /* tp_iternext */
8901 unicode_methods, /* tp_methods */
8902 0, /* tp_members */
8903 0, /* tp_getset */
8904 &PyBaseString_Type, /* tp_base */
8905 0, /* tp_dict */
8906 0, /* tp_descr_get */
8907 0, /* tp_descr_set */
8908 0, /* tp_dictoffset */
8909 0, /* tp_init */
8910 0, /* tp_alloc */
8911 unicode_new, /* tp_new */
8912 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008913};
8914
8915/* Initialize the Unicode implementation */
8916
Thomas Wouters78890102000-07-22 19:25:51 +00008917void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008918{
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008919 /* XXX - move this array to unicodectype.c ? */
8920 Py_UNICODE linebreak[] = {
8921 0x000A, /* LINE FEED */
8922 0x000D, /* CARRIAGE RETURN */
8923 0x001C, /* FILE SEPARATOR */
8924 0x001D, /* GROUP SEPARATOR */
8925 0x001E, /* RECORD SEPARATOR */
8926 0x0085, /* NEXT LINE */
8927 0x2028, /* LINE SEPARATOR */
8928 0x2029, /* PARAGRAPH SEPARATOR */
8929 };
8930
Fred Drakee4315f52000-05-09 19:53:39 +00008931 /* Init the implementation */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02008932 if (!unicode_empty) {
8933 unicode_empty = _PyUnicode_New(0);
8934 if (!unicode_empty)
8935 return;
8936 }
Neal Norwitze1fdb322006-07-21 05:32:28 +00008937
Guido van Rossumcacfc072002-05-24 19:01:59 +00008938 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008939 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008940
8941 /* initialize the linebreak bloom filter */
8942 bloom_linebreak = make_bloom_mask(
8943 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8944 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00008945
8946 PyType_Ready(&EncodingMapType);
Benjamin Peterson6da3ed62012-10-30 23:21:10 -04008947
8948 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
8949 Py_FatalError("Can't initialize field name iterator type");
8950
8951 if (PyType_Ready(&PyFormatterIter_Type) < 0)
8952 Py_FatalError("Can't initialize formatter iter type");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008953}
8954
8955/* Finalize the Unicode implementation */
8956
Christian Heimes3b718a72008-02-14 12:47:33 +00008957int
8958PyUnicode_ClearFreeList(void)
8959{
8960 int freelist_size = numfree;
8961 PyUnicodeObject *u;
8962
8963 for (u = free_list; u != NULL;) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008964 PyUnicodeObject *v = u;
8965 u = *(PyUnicodeObject **)u;
8966 if (v->str)
8967 PyObject_DEL(v->str);
8968 Py_XDECREF(v->defenc);
8969 PyObject_Del(v);
8970 numfree--;
Christian Heimes3b718a72008-02-14 12:47:33 +00008971 }
8972 free_list = NULL;
8973 assert(numfree == 0);
8974 return freelist_size;
8975}
8976
Guido van Rossumd57fd912000-03-10 22:53:23 +00008977void
Thomas Wouters78890102000-07-22 19:25:51 +00008978_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008979{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008980 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008981
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02008982 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008983
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02008984 for (i = 0; i < 256; i++)
8985 Py_CLEAR(unicode_latin1[i]);
8986
Christian Heimes3b718a72008-02-14 12:47:33 +00008987 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00008988}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008989
Anthony Baxterac6bd462006-04-13 02:06:09 +00008990#ifdef __cplusplus
8991}
8992#endif