blob: 2e5f5fd848cdd40938679c0443e4b2a7807d9d01 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson1c5d21d2009-01-31 22:33:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
/* Upper bound on the number of objects kept on the Unicode free list */

#define PyUnicode_MAXFREELIST 1024

/* Threshold for the free list "stay alive" optimization.

   An object that is put on the free list keeps its character buffer
   allocated as long as its length is below this limit.  This saves
   malloc() calls when small Unicode objects are created.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature!  If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switches; little endian unless WORDS_BIGENDIAN is defined */

#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
Serhiy Storchakac59c85c2013-01-26 12:13:40 +020085NOTE: In the interpreter's initialization phase, some globals are currently
86 initialized dynamically as needed. In the process Unicode objects may
87 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000088
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Anthony Baxterac6bd462006-04-13 02:06:09 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Guido van Rossumd57fd912000-03-10 22:53:23 +000096/* Free list for Unicode objects */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +020097static PyUnicodeObject *free_list = NULL;
98static int numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000099
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000100/* The empty Unicode object is shared to improve performance. */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200101static PyUnicodeObject *unicode_empty = NULL;
102
103#define _Py_RETURN_UNICODE_EMPTY() \
104 do { \
105 if (unicode_empty != NULL) \
106 Py_INCREF(unicode_empty); \
107 else { \
108 unicode_empty = _PyUnicode_New(0); \
109 if (unicode_empty != NULL) \
110 Py_INCREF(unicode_empty); \
111 } \
112 return (PyObject *)unicode_empty; \
113 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000114
115/* Single character Unicode strings in the Latin-1 range are being
116 shared as well. */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200117static PyUnicodeObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000118
Fred Drakee4315f52000-05-09 19:53:39 +0000119/* Default encoding to use and assume when NULL is passed as encoding
120 parameter; it is initialized by _PyUnicode_Init().
121
122 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000123 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000124
125*/
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200126static char unicode_default_encoding[100 + 1] = "ascii";
Fred Drakee4315f52000-05-09 19:53:39 +0000127
/* Fast detection of the most frequent whitespace characters.
   One entry per ASCII code (0x00-0x7F): 1 for whitespace, 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x7F: no whitespace */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
158
/* Same for linebreaks.
   One entry per ASCII code (0x00-0x7F): 1 for a line break, 0 otherwise. */
static unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x7F: no line breaks */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
186
187
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000188Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000189PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000190{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000191#ifdef Py_UNICODE_WIDE
Benjamin Peterson857ce152009-01-31 16:29:18 +0000192 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000193#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000194 /* This is actually an illegal character, so it should
195 not be passed to unichr. */
196 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000197#endif
198}
199
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000200/* --- Bloom Filters ----------------------------------------------------- */
201
202/* stuff to implement simple "bloom filters" for Unicode characters.
203 to keep things simple, we use a single bitmask, using the least 5
204 bits from each unicode characters as the bit index. */
205
206/* the linebreak mask is set up by Unicode_Init below */
207
Antoine Pitrou10042922010-01-13 14:01:26 +0000208#if LONG_BIT >= 128
209#define BLOOM_WIDTH 128
210#elif LONG_BIT >= 64
211#define BLOOM_WIDTH 64
212#elif LONG_BIT >= 32
213#define BLOOM_WIDTH 32
214#else
215#error "LONG_BIT is smaller than 32"
216#endif
217
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000218#define BLOOM_MASK unsigned long
219
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200220static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000221
Antoine Pitrou10042922010-01-13 14:01:26 +0000222#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
223#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000224
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000225#define BLOOM_LINEBREAK(ch) \
226 ((ch) < 128U ? ascii_linebreak[(ch)] : \
227 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000228
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000229Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000230{
231 /* calculate simple bloom-style bitmask for a given unicode string */
232
Antoine Pitrou10042922010-01-13 14:01:26 +0000233 BLOOM_MASK mask;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000234 Py_ssize_t i;
235
236 mask = 0;
237 for (i = 0; i < len; i++)
Antoine Pitrou64672132010-01-13 07:55:48 +0000238 BLOOM_ADD(mask, ptr[i]);
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000239
240 return mask;
241}
242
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000243Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000244{
245 Py_ssize_t i;
246
247 for (i = 0; i < setlen; i++)
248 if (set[i] == chr)
249 return 1;
250
Fredrik Lundh77633512006-05-23 19:47:35 +0000251 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000252}
253
/* Non-zero when chr is found in set[0..setlen-1].  The bloom mask is
   tested first so that the linear scan in unicode_member() only runs
   for characters whose bit is set in mask (presumably the mask built
   from the same set -- confirm at the call sites).
   The whole expansion is parenthesized so the macro can be negated or
   combined with other operators safely.  chr is evaluated twice. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
256
Guido van Rossumd57fd912000-03-10 22:53:23 +0000257/* --- Unicode Object ----------------------------------------------------- */
258
259static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000260int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000261 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262{
263 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000264
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000265 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 if (unicode->length == length)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000267 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000268
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000269 /* Resizing shared object (unicode_empty or single character
270 objects) in-place is not allowed. Use PyUnicode_Resize()
271 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000272
Benjamin Peterson857ce152009-01-31 16:29:18 +0000273 if (unicode == unicode_empty ||
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000274 (unicode->length == 1 &&
275 unicode->str[0] < 256U &&
276 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000278 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279 return -1;
280 }
281
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000282 /* We allocate one more byte to make sure the string is Ux0000 terminated.
283 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000284 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000285 it contains). */
286
Guido van Rossumd57fd912000-03-10 22:53:23 +0000287 oldstr = unicode->str;
Neal Norwitz419fd492008-03-17 20:22:43 +0000288 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000289 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000291 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000292 PyErr_NoMemory();
293 return -1;
294 }
295 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000296 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000298 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000300 if (unicode->defenc) {
Georg Brandld070cc52010-08-01 21:06:46 +0000301 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000302 }
303 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000304
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305 return 0;
306}
307
308/* We allocate one more byte to make sure the string is
Georg Brandle27d0442010-08-01 20:54:30 +0000309 Ux0000 terminated; some code relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000310
311 XXX This allocator could further be enhanced by assuring that the
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000312 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000313
314*/
315
316static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000317PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000318{
319 register PyUnicodeObject *unicode;
320
Andrew Dalkee0df7622006-05-27 11:04:36 +0000321 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000322 if (length == 0 && unicode_empty != NULL) {
323 Py_INCREF(unicode_empty);
324 return unicode_empty;
325 }
326
Neal Norwitze7d8be82008-07-31 17:17:14 +0000327 /* Ensure we won't overflow the size. */
328 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
329 return (PyUnicodeObject *)PyErr_NoMemory();
330 }
331
Guido van Rossumd57fd912000-03-10 22:53:23 +0000332 /* Unicode freelist & memory allocation */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000333 if (free_list) {
334 unicode = free_list;
335 free_list = *(PyUnicodeObject **)unicode;
336 numfree--;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000337 if (unicode->str) {
338 /* Keep-Alive optimization: we only upsize the buffer,
339 never downsize it. */
340 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000341 unicode_resize(unicode, length) < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000342 PyObject_DEL(unicode->str);
343 unicode->str = NULL;
344 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000345 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000346 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000347 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
348 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000349 }
350 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000351 }
352 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000353 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000354 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000355 if (unicode == NULL)
356 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000357 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
358 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000359 }
360
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000361 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000362 PyErr_NoMemory();
363 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000364 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000365 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000366 * the caller fails before initializing str -- unicode_resize()
367 * reads str[0], and the Keep-Alive optimization can keep memory
368 * allocated for str alive across a call to unicode_dealloc(unicode).
369 * We don't want unicode_resize to read uninitialized memory in
370 * that case.
371 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000372 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000373 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000374 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000375 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000376 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000377 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000378
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000379 onError:
Amaury Forgeot d'Arc06847b12008-07-31 23:39:05 +0000380 /* XXX UNREF/NEWREF interface should be more symmetrical */
381 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000382 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000383 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000384 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000385}
386
387static
Guido van Rossum9475a232001-10-05 20:51:39 +0000388void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000390 if (PyUnicode_CheckExact(unicode) &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000391 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000392 /* Keep-Alive optimization */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000393 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
394 PyObject_DEL(unicode->str);
395 unicode->str = NULL;
396 unicode->length = 0;
397 }
398 if (unicode->defenc) {
Georg Brandld070cc52010-08-01 21:06:46 +0000399 Py_CLEAR(unicode->defenc);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000400 }
401 /* Add to free list */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000402 *(PyUnicodeObject **)unicode = free_list;
403 free_list = unicode;
404 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000405 }
406 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000407 PyObject_DEL(unicode->str);
408 Py_XDECREF(unicode->defenc);
409 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000410 }
411}
412
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000413static
414int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000415{
416 register PyUnicodeObject *v;
417
418 /* Argument checks */
419 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000420 PyErr_BadInternalCall();
421 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000422 }
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000423 v = *unicode;
Christian Heimese93237d2007-12-19 02:37:44 +0000424 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000425 PyErr_BadInternalCall();
426 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000427 }
428
429 /* Resizing unicode_empty and single character objects is not
430 possible since these are being shared. We simply return a fresh
431 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000432 if (v->length != length &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000433 (v == unicode_empty || v->length == 1)) {
434 PyUnicodeObject *w = _PyUnicode_New(length);
435 if (w == NULL)
436 return -1;
437 Py_UNICODE_COPY(w->str, v->str,
438 length < v->length ? length : v->length);
439 Py_DECREF(*unicode);
440 *unicode = w;
441 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000442 }
443
444 /* Note that we don't have to modify *unicode for unshared Unicode
445 objects, since we can modify them in-place. */
446 return unicode_resize(v, length);
447}
448
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000449int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
450{
451 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
452}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000453
Guido van Rossumd57fd912000-03-10 22:53:23 +0000454PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000455 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000456{
457 PyUnicodeObject *unicode;
458
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000459 /* If the Unicode data is known at construction time, we can apply
460 some optimizations which share commonly used objects. */
461 if (u != NULL) {
462
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000463 /* Optimization for empty strings */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200464 if (size == 0)
465 _Py_RETURN_UNICODE_EMPTY();
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000466
467 /* Single character Unicode objects in the Latin-1 range are
468 shared when using this constructor */
469 if (size == 1 && *u < 256) {
470 unicode = unicode_latin1[*u];
471 if (!unicode) {
472 unicode = _PyUnicode_New(1);
473 if (!unicode)
474 return NULL;
475 unicode->str[0] = *u;
476 unicode_latin1[*u] = unicode;
477 }
478 Py_INCREF(unicode);
479 return (PyObject *)unicode;
480 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000481 }
Tim Petersced69f82003-09-16 20:30:58 +0000482
Guido van Rossumd57fd912000-03-10 22:53:23 +0000483 unicode = _PyUnicode_New(size);
484 if (!unicode)
485 return NULL;
486
487 /* Copy the Unicode data into the new object */
488 if (u != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000489 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000490
491 return (PyObject *)unicode;
492}
493
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000494PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
495{
496 PyUnicodeObject *unicode;
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000497
Benjamin Peterson857ce152009-01-31 16:29:18 +0000498 if (size < 0) {
499 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000500 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson857ce152009-01-31 16:29:18 +0000501 return NULL;
502 }
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000503
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000504 /* If the Unicode data is known at construction time, we can apply
505 some optimizations which share commonly used objects.
506 Also, this means the input must be UTF-8, so fall back to the
507 UTF-8 decoder at the end. */
508 if (u != NULL) {
509
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000510 /* Optimization for empty strings */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200511 if (size == 0)
512 _Py_RETURN_UNICODE_EMPTY();
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000513
514 /* Single characters are shared when using this constructor.
515 Restrict to ASCII, since the input must be UTF-8. */
516 if (size == 1 && Py_CHARMASK(*u) < 128) {
517 unicode = unicode_latin1[Py_CHARMASK(*u)];
518 if (!unicode) {
519 unicode = _PyUnicode_New(1);
520 if (!unicode)
521 return NULL;
522 unicode->str[0] = Py_CHARMASK(*u);
523 unicode_latin1[Py_CHARMASK(*u)] = unicode;
524 }
525 Py_INCREF(unicode);
526 return (PyObject *)unicode;
527 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000528
529 return PyUnicode_DecodeUTF8(u, size, NULL);
530 }
531
532 unicode = _PyUnicode_New(size);
533 if (!unicode)
534 return NULL;
535
536 return (PyObject *)unicode;
537}
538
539PyObject *PyUnicode_FromString(const char *u)
540{
541 size_t size = strlen(u);
542 if (size > PY_SSIZE_T_MAX) {
543 PyErr_SetString(PyExc_OverflowError, "input too long");
544 return NULL;
545 }
546
547 return PyUnicode_FromStringAndSize(u, size);
548}
549
/* _Py_UNICODE_NEXT is a private macro used to retrieve the character pointed
 * by 'ptr', possibly combining surrogate pairs on narrow builds.
 * 'ptr' and 'end' must be Py_UNICODE*, with 'ptr' pointing at the character
 * that should be returned and 'end' pointing to the end of the buffer.
 * ('end' is used on narrow builds to detect a lone surrogate at the
 * end of the buffer that should be returned unchanged: a pair is only
 * combined when both of its halves lie before 'end'.)
 * The ptr and end arguments should be side-effect free and ptr must be an
 * lvalue.  'ptr' must point before 'end' when the macro is used.
 * The type of the returned char is always Py_UCS4.
 *
 * Note: the macro advances ptr to next char, so it might have side-effects
 * (especially if used with other macros).
 */

/* helper macros used by _Py_UNICODE_NEXT; 'ch' is evaluated twice */
#define _Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define _Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value. */
#define _Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)

#ifdef Py_UNICODE_WIDE
#define _Py_UNICODE_NEXT(ptr, end) (*(ptr)++)
#else
/* The bounds test comes first so that (ptr)[1] is only examined when
   it lies inside the buffer. */
#define _Py_UNICODE_NEXT(ptr, end)                                      \
     ((((ptr) + 1 < (end)) &&                                           \
       _Py_UNICODE_IS_HIGH_SURROGATE(*(ptr)) &&                         \
       _Py_UNICODE_IS_LOW_SURROGATE((ptr)[1])) ?                        \
      ((ptr) += 2,_Py_UNICODE_JOIN_SURROGATES((ptr)[-2], (ptr)[-1])) :  \
      (Py_UCS4)*(ptr)++)
#endif
580
Guido van Rossumd57fd912000-03-10 22:53:23 +0000581#ifdef HAVE_WCHAR_H
582
Mark Dickinson6b265f12009-03-18 16:07:26 +0000583#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
584# define CONVERT_WCHAR_TO_SURROGATES
585#endif
586
587#ifdef CONVERT_WCHAR_TO_SURROGATES
588
589/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
590 to convert from UTF32 to UTF16. */
591
592PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
593 Py_ssize_t size)
594{
595 PyUnicodeObject *unicode;
596 register Py_ssize_t i;
597 Py_ssize_t alloc;
598 const wchar_t *orig_w;
599
600 if (w == NULL) {
601 PyErr_BadInternalCall();
602 return NULL;
603 }
604
605 alloc = size;
606 orig_w = w;
607 for (i = size; i > 0; i--) {
608 if (*w > 0xFFFF)
609 alloc++;
610 w++;
611 }
612 w = orig_w;
613 unicode = _PyUnicode_New(alloc);
614 if (!unicode)
615 return NULL;
616
617 /* Copy the wchar_t data into the new object */
618 {
619 register Py_UNICODE *u;
620 u = PyUnicode_AS_UNICODE(unicode);
621 for (i = size; i > 0; i--) {
622 if (*w > 0xFFFF) {
623 wchar_t ordinal = *w++;
624 ordinal -= 0x10000;
625 *u++ = 0xD800 | (ordinal >> 10);
626 *u++ = 0xDC00 | (ordinal & 0x3FF);
627 }
628 else
629 *u++ = *w++;
630 }
631 }
632 return (PyObject *)unicode;
633}
634
635#else
636
Guido van Rossumd57fd912000-03-10 22:53:23 +0000637PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000638 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000639{
640 PyUnicodeObject *unicode;
641
642 if (w == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000643 PyErr_BadInternalCall();
644 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000645 }
646
647 unicode = _PyUnicode_New(size);
648 if (!unicode)
649 return NULL;
650
651 /* Copy the wchar_t data into the new object */
652#ifdef HAVE_USABLE_WCHAR_T
653 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000654#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000655 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000656 register Py_UNICODE *u;
657 register Py_ssize_t i;
658 u = PyUnicode_AS_UNICODE(unicode);
659 for (i = size; i > 0; i--)
660 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000661 }
662#endif
663
664 return (PyObject *)unicode;
665}
666
Mark Dickinson6b265f12009-03-18 16:07:26 +0000667#endif /* CONVERT_WCHAR_TO_SURROGATES */
668
669#undef CONVERT_WCHAR_TO_SURROGATES
670
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000671static void
672makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
673{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000674 *fmt++ = '%';
675 if (width) {
676 if (zeropad)
677 *fmt++ = '0';
678 fmt += sprintf(fmt, "%d", width);
679 }
680 if (precision)
681 fmt += sprintf(fmt, ".%d", precision);
682 if (longflag)
683 *fmt++ = 'l';
684 else if (size_tflag) {
685 char *f = PY_FORMAT_SIZE_T;
686 while (*f)
687 *fmt++ = *f++;
688 }
689 *fmt++ = c;
690 *fmt = '\0';
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000691}
692
/* Append the characters of the NUL-terminated C string 'string' to the
   output position 's', one output element per byte (each byte is taken
   as an unsigned char).  Relies on the variables 'copy' and 's' of the
   enclosing function; afterwards 'copy' points at the terminating NUL
   and 's' just past the last element written. */
#define appendstring(string)                    \
    do {                                        \
        copy = (string);                        \
        while (*copy) {                         \
            *s++ = (unsigned char)*copy;        \
            copy++;                             \
        }                                       \
    } while (0)
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000699
700PyObject *
701PyUnicode_FromFormatV(const char *format, va_list vargs)
702{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000703 va_list count;
704 Py_ssize_t callcount = 0;
705 PyObject **callresults = NULL;
706 PyObject **callresult = NULL;
707 Py_ssize_t n = 0;
708 int width = 0;
709 int precision = 0;
710 int zeropad;
711 const char* f;
712 Py_UNICODE *s;
713 PyObject *string;
714 /* used by sprintf */
715 char buffer[21];
716 /* use abuffer instead of buffer, if we need more space
717 * (which can happen if there's a format specifier with width). */
718 char *abuffer = NULL;
719 char *realbuffer;
720 Py_ssize_t abuffersize = 0;
721 char fmt[60]; /* should be enough for %0width.precisionld */
722 const char *copy;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000723
724#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson857ce152009-01-31 16:29:18 +0000725 Py_MEMCPY(count, vargs, sizeof(va_list));
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000726#else
727#ifdef __va_copy
Benjamin Peterson857ce152009-01-31 16:29:18 +0000728 __va_copy(count, vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000729#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000730 count = vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000731#endif
732#endif
Walter Dörwalded960ac2009-05-03 22:36:33 +0000733 /* step 1: count the number of %S/%R/%s format specifications
734 * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
735 * objects once during step 3 and put the result in an array) */
Benjamin Peterson857ce152009-01-31 16:29:18 +0000736 for (f = format; *f; f++) {
Walter Dörwalded960ac2009-05-03 22:36:33 +0000737 if (*f == '%') {
Serhiy Storchaka0e0282e2015-01-27 22:17:56 +0200738 f++;
739 while (*f && *f != '%' && !isalpha((unsigned)*f))
740 f++;
741 if (*f == 's' || *f=='S' || *f=='R')
Walter Dörwalded960ac2009-05-03 22:36:33 +0000742 ++callcount;
743 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000744 }
745 /* step 2: allocate memory for the results of
Walter Dörwalded960ac2009-05-03 22:36:33 +0000746 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson857ce152009-01-31 16:29:18 +0000747 if (callcount) {
748 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
749 if (!callresults) {
750 PyErr_NoMemory();
751 return NULL;
752 }
753 callresult = callresults;
754 }
755 /* step 3: figure out how large a buffer we need */
756 for (f = format; *f; f++) {
757 if (*f == '%') {
Serhiy Storchaka0e0282e2015-01-27 22:17:56 +0200758 const char* p = f++;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000759 width = 0;
760 while (isdigit((unsigned)*f))
761 width = (width*10) + *f++ - '0';
Serhiy Storchaka0e0282e2015-01-27 22:17:56 +0200762 precision = 0;
763 if (*f == '.') {
764 f++;
765 while (isdigit((unsigned)*f))
766 precision = (precision*10) + *f++ - '0';
767 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000768
Benjamin Peterson857ce152009-01-31 16:29:18 +0000769 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
770 * they don't affect the amount of space we reserve.
771 */
772 if ((*f == 'l' || *f == 'z') &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000773 (f[1] == 'd' || f[1] == 'u'))
774 ++f;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000775
Benjamin Peterson857ce152009-01-31 16:29:18 +0000776 switch (*f) {
777 case 'c':
Serhiy Storchakaba908c72013-06-23 20:22:09 +0300778 {
779 int ordinal = va_arg(count, int);
780#ifdef Py_UNICODE_WIDE
781 if (ordinal < 0 || ordinal > 0x10ffff) {
782 PyErr_SetString(PyExc_OverflowError,
783 "%c arg not in range(0x110000) "
784 "(wide Python build)");
785 goto fail;
786 }
787#else
788 if (ordinal < 0 || ordinal > 0xffff) {
789 PyErr_SetString(PyExc_OverflowError,
790 "%c arg not in range(0x10000) "
791 "(narrow Python build)");
792 goto fail;
793 }
794#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +0000795 /* fall through... */
Serhiy Storchakaba908c72013-06-23 20:22:09 +0300796 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000797 case '%':
798 n++;
799 break;
800 case 'd': case 'u': case 'i': case 'x':
801 (void) va_arg(count, int);
Serhiy Storchaka0e0282e2015-01-27 22:17:56 +0200802 if (width < precision)
803 width = precision;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000804 /* 20 bytes is enough to hold a 64-bit
805 integer. Decimal takes the most space.
806 This isn't enough for octal.
807 If a width is specified we need more
808 (which we allocate later). */
809 if (width < 20)
810 width = 20;
811 n += width;
812 if (abuffersize < width)
813 abuffersize = width;
814 break;
815 case 's':
816 {
817 /* UTF-8 */
Georg Brandlba68a992009-05-05 09:19:43 +0000818 const char *s = va_arg(count, const char*);
Walter Dörwalded960ac2009-05-03 22:36:33 +0000819 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
820 if (!str)
821 goto fail;
822 n += PyUnicode_GET_SIZE(str);
823 /* Remember the str and switch to the next slot */
824 *callresult++ = str;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000825 break;
826 }
827 case 'U':
828 {
829 PyObject *obj = va_arg(count, PyObject *);
830 assert(obj && PyUnicode_Check(obj));
831 n += PyUnicode_GET_SIZE(obj);
832 break;
833 }
834 case 'V':
835 {
836 PyObject *obj = va_arg(count, PyObject *);
837 const char *str = va_arg(count, const char *);
838 assert(obj || str);
839 assert(!obj || PyUnicode_Check(obj));
840 if (obj)
841 n += PyUnicode_GET_SIZE(obj);
842 else
843 n += strlen(str);
844 break;
845 }
846 case 'S':
847 {
848 PyObject *obj = va_arg(count, PyObject *);
849 PyObject *str;
850 assert(obj);
851 str = PyObject_Str(obj);
852 if (!str)
853 goto fail;
Victor Stinner2af8d2f2014-07-30 00:39:05 +0200854 n += PyString_GET_SIZE(str);
Benjamin Peterson857ce152009-01-31 16:29:18 +0000855 /* Remember the str and switch to the next slot */
856 *callresult++ = str;
857 break;
858 }
859 case 'R':
860 {
861 PyObject *obj = va_arg(count, PyObject *);
862 PyObject *repr;
863 assert(obj);
864 repr = PyObject_Repr(obj);
865 if (!repr)
866 goto fail;
867 n += PyUnicode_GET_SIZE(repr);
868 /* Remember the repr and switch to the next slot */
869 *callresult++ = repr;
870 break;
871 }
872 case 'p':
873 (void) va_arg(count, int);
874 /* maximum 64-bit pointer representation:
875 * 0xffffffffffffffff
876 * so 19 characters is enough.
877 * XXX I count 18 -- what's the extra for?
878 */
879 n += 19;
880 break;
881 default:
882 /* if we stumble upon an unknown
883 formatting code, copy the rest of
884 the format string to the output
885 string. (we cannot just skip the
886 code, since there's no way to know
887 what's in the argument list) */
888 n += strlen(p);
889 goto expand;
890 }
891 } else
892 n++;
893 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000894 expand:
Benjamin Peterson857ce152009-01-31 16:29:18 +0000895 if (abuffersize > 20) {
896 abuffer = PyObject_Malloc(abuffersize);
897 if (!abuffer) {
898 PyErr_NoMemory();
899 goto fail;
900 }
901 realbuffer = abuffer;
902 }
903 else
904 realbuffer = buffer;
905 /* step 4: fill the buffer */
906 /* Since we've analyzed how much space we need for the worst case,
907 we don't have to resize the string.
908 There can be no errors beyond this point. */
909 string = PyUnicode_FromUnicode(NULL, n);
910 if (!string)
911 goto fail;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000912
Benjamin Peterson857ce152009-01-31 16:29:18 +0000913 s = PyUnicode_AS_UNICODE(string);
914 callresult = callresults;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000915
Benjamin Peterson857ce152009-01-31 16:29:18 +0000916 for (f = format; *f; f++) {
917 if (*f == '%') {
918 const char* p = f++;
919 int longflag = 0;
920 int size_tflag = 0;
921 zeropad = (*f == '0');
922 /* parse the width.precision part */
923 width = 0;
924 while (isdigit((unsigned)*f))
925 width = (width*10) + *f++ - '0';
926 precision = 0;
927 if (*f == '.') {
928 f++;
929 while (isdigit((unsigned)*f))
930 precision = (precision*10) + *f++ - '0';
931 }
932 /* handle the long flag, but only for %ld and %lu.
933 others can be added when necessary. */
934 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
935 longflag = 1;
936 ++f;
937 }
938 /* handle the size_t flag. */
939 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
940 size_tflag = 1;
941 ++f;
942 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000943
Benjamin Peterson857ce152009-01-31 16:29:18 +0000944 switch (*f) {
945 case 'c':
946 *s++ = va_arg(vargs, int);
947 break;
948 case 'd':
949 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
950 if (longflag)
951 sprintf(realbuffer, fmt, va_arg(vargs, long));
952 else if (size_tflag)
953 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
954 else
955 sprintf(realbuffer, fmt, va_arg(vargs, int));
956 appendstring(realbuffer);
957 break;
958 case 'u':
959 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
960 if (longflag)
961 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
962 else if (size_tflag)
963 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
964 else
965 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
966 appendstring(realbuffer);
967 break;
968 case 'i':
969 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
970 sprintf(realbuffer, fmt, va_arg(vargs, int));
971 appendstring(realbuffer);
972 break;
973 case 'x':
974 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
975 sprintf(realbuffer, fmt, va_arg(vargs, int));
976 appendstring(realbuffer);
977 break;
978 case 's':
979 {
Walter Dörwalded960ac2009-05-03 22:36:33 +0000980 /* unused, since we already have the result */
981 (void) va_arg(vargs, char *);
982 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
983 PyUnicode_GET_SIZE(*callresult));
984 s += PyUnicode_GET_SIZE(*callresult);
985 /* We're done with the unicode()/repr() => forget it */
986 Py_DECREF(*callresult);
987 /* switch to next unicode()/repr() result */
988 ++callresult;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000989 break;
990 }
991 case 'U':
992 {
993 PyObject *obj = va_arg(vargs, PyObject *);
994 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
995 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
996 s += size;
997 break;
998 }
999 case 'V':
1000 {
1001 PyObject *obj = va_arg(vargs, PyObject *);
1002 const char *str = va_arg(vargs, const char *);
1003 if (obj) {
1004 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1005 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1006 s += size;
1007 } else {
1008 appendstring(str);
1009 }
1010 break;
1011 }
1012 case 'S':
1013 case 'R':
1014 {
Victor Stinner2af8d2f2014-07-30 00:39:05 +02001015 const char *str = PyString_AS_STRING(*callresult);
Benjamin Peterson857ce152009-01-31 16:29:18 +00001016 /* unused, since we already have the result */
1017 (void) va_arg(vargs, PyObject *);
Victor Stinner2af8d2f2014-07-30 00:39:05 +02001018 appendstring(str);
Benjamin Peterson857ce152009-01-31 16:29:18 +00001019 /* We're done with the unicode()/repr() => forget it */
1020 Py_DECREF(*callresult);
1021 /* switch to next unicode()/repr() result */
1022 ++callresult;
1023 break;
1024 }
1025 case 'p':
1026 sprintf(buffer, "%p", va_arg(vargs, void*));
1027 /* %p is ill-defined: ensure leading 0x. */
1028 if (buffer[1] == 'X')
1029 buffer[1] = 'x';
1030 else if (buffer[1] != 'x') {
1031 memmove(buffer+2, buffer, strlen(buffer)+1);
1032 buffer[0] = '0';
1033 buffer[1] = 'x';
1034 }
1035 appendstring(buffer);
1036 break;
1037 case '%':
1038 *s++ = '%';
1039 break;
1040 default:
1041 appendstring(p);
1042 goto end;
1043 }
1044 } else
1045 *s++ = *f;
1046 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001047
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001048 end:
Benjamin Peterson857ce152009-01-31 16:29:18 +00001049 if (callresults)
1050 PyObject_Free(callresults);
1051 if (abuffer)
1052 PyObject_Free(abuffer);
1053 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1054 return string;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001055 fail:
Benjamin Peterson857ce152009-01-31 16:29:18 +00001056 if (callresults) {
1057 PyObject **callresult2 = callresults;
1058 while (callresult2 < callresult) {
1059 Py_DECREF(*callresult2);
1060 ++callresult2;
1061 }
1062 PyObject_Free(callresults);
1063 }
1064 if (abuffer)
1065 PyObject_Free(abuffer);
1066 return NULL;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001067}
1068
1069#undef appendstring
1070
1071PyObject *
1072PyUnicode_FromFormat(const char *format, ...)
1073{
Benjamin Peterson857ce152009-01-31 16:29:18 +00001074 PyObject* ret;
1075 va_list vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001076
1077#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson857ce152009-01-31 16:29:18 +00001078 va_start(vargs, format);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001079#else
Benjamin Peterson857ce152009-01-31 16:29:18 +00001080 va_start(vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001081#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00001082 ret = PyUnicode_FromFormatV(format, vargs);
1083 va_end(vargs);
1084 return ret;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001085}
1086
Martin v. Löwis18e16552006-02-15 17:27:45 +00001087Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001088 wchar_t *w,
1089 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001090{
1091 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001092 PyErr_BadInternalCall();
1093 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001094 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001095
1096 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001097 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001098 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001099
Guido van Rossumd57fd912000-03-10 22:53:23 +00001100#ifdef HAVE_USABLE_WCHAR_T
1101 memcpy(w, unicode->str, size * sizeof(wchar_t));
1102#else
1103 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001104 register Py_UNICODE *u;
1105 register Py_ssize_t i;
1106 u = PyUnicode_AS_UNICODE(unicode);
1107 for (i = size; i > 0; i--)
1108 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001109 }
1110#endif
1111
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001112 if (size > PyUnicode_GET_SIZE(unicode))
1113 return PyUnicode_GET_SIZE(unicode);
1114 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001115 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001116}
1117
1118#endif
1119
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001120PyObject *PyUnicode_FromOrdinal(int ordinal)
1121{
Hye-Shik Chang40574832004-04-06 07:24:51 +00001122 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001123
1124#ifdef Py_UNICODE_WIDE
1125 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001126 PyErr_SetString(PyExc_ValueError,
1127 "unichr() arg not in range(0x110000) "
1128 "(wide Python build)");
1129 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001130 }
1131#else
1132 if (ordinal < 0 || ordinal > 0xffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001133 PyErr_SetString(PyExc_ValueError,
1134 "unichr() arg not in range(0x10000) "
1135 "(narrow Python build)");
1136 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001137 }
1138#endif
1139
Hye-Shik Chang40574832004-04-06 07:24:51 +00001140 s[0] = (Py_UNICODE)ordinal;
1141 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001142}
1143
Guido van Rossumd57fd912000-03-10 22:53:23 +00001144PyObject *PyUnicode_FromObject(register PyObject *obj)
1145{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001146 /* XXX Perhaps we should make this API an alias of
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001147 PyObject_Unicode() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001148 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001149 Py_INCREF(obj);
1150 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001151 }
1152 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001153 /* For a Unicode subtype that's not a Unicode object,
1154 return a true Unicode object with the same data. */
1155 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1156 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001157 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001158 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1159}
1160
1161PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001162 const char *encoding,
1163 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001164{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001165 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001166 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001167 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001168
Guido van Rossumd57fd912000-03-10 22:53:23 +00001169 if (obj == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001170 PyErr_BadInternalCall();
1171 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001172 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001173
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001174#if 0
1175 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001176 that no encodings is given and then redirect to
1177 PyObject_Unicode() which then applies the additional logic for
1178 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001179
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001180 NOTE: This API should really only be used for object which
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001181 represent *encoded* Unicode !
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001182
1183 */
Benjamin Peterson857ce152009-01-31 16:29:18 +00001184 if (PyUnicode_Check(obj)) {
1185 if (encoding) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001186 PyErr_SetString(PyExc_TypeError,
1187 "decoding Unicode is not supported");
1188 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001189 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001190 return PyObject_Unicode(obj);
1191 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001192#else
1193 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001194 PyErr_SetString(PyExc_TypeError,
1195 "decoding Unicode is not supported");
1196 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001197 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001198#endif
1199
1200 /* Coerce object */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001201 if (PyString_Check(obj)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001202 s = PyString_AS_STRING(obj);
1203 len = PyString_GET_SIZE(obj);
Christian Heimes1a6387e2008-03-26 12:49:49 +00001204 }
Christian Heimes3497f942008-05-26 12:29:14 +00001205 else if (PyByteArray_Check(obj)) {
Christian Heimes1a6387e2008-03-26 12:49:49 +00001206 /* Python 2.x specific */
1207 PyErr_Format(PyExc_TypeError,
1208 "decoding bytearray is not supported");
1209 return NULL;
1210 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001211 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001212 /* Overwrite the error message with something more useful in
1213 case of a TypeError. */
1214 if (PyErr_ExceptionMatches(PyExc_TypeError))
1215 PyErr_Format(PyExc_TypeError,
1216 "coercing to Unicode: need string or buffer, "
1217 "%.80s found",
1218 Py_TYPE(obj)->tp_name);
1219 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001220 }
Tim Petersced69f82003-09-16 20:30:58 +00001221
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001222 /* Convert to Unicode */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02001223 if (len == 0)
1224 _Py_RETURN_UNICODE_EMPTY();
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001225
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02001226 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001227 return v;
1228
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001229 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001230 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001231}
1232
1233PyObject *PyUnicode_Decode(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001234 Py_ssize_t size,
1235 const char *encoding,
1236 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001237{
1238 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001239
1240 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001241 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001242
1243 /* Shortcuts for common default encodings */
1244 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001245 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001246 else if (strcmp(encoding, "latin-1") == 0)
1247 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001248#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1249 else if (strcmp(encoding, "mbcs") == 0)
1250 return PyUnicode_DecodeMBCS(s, size, errors);
1251#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001252 else if (strcmp(encoding, "ascii") == 0)
1253 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001254
1255 /* Decode via the codec registry */
1256 buffer = PyBuffer_FromMemory((void *)s, size);
1257 if (buffer == NULL)
1258 goto onError;
1259 unicode = PyCodec_Decode(buffer, encoding, errors);
1260 if (unicode == NULL)
1261 goto onError;
1262 if (!PyUnicode_Check(unicode)) {
1263 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001264 "decoder did not return an unicode object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001265 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001266 Py_DECREF(unicode);
1267 goto onError;
1268 }
1269 Py_DECREF(buffer);
1270 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001271
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001272 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001273 Py_XDECREF(buffer);
1274 return NULL;
1275}
1276
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001277PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1278 const char *encoding,
1279 const char *errors)
1280{
1281 PyObject *v;
1282
1283 if (!PyUnicode_Check(unicode)) {
1284 PyErr_BadArgument();
1285 goto onError;
1286 }
1287
1288 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001289 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001290
1291 /* Decode via the codec registry */
1292 v = PyCodec_Decode(unicode, encoding, errors);
1293 if (v == NULL)
1294 goto onError;
1295 return v;
1296
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001297 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001298 return NULL;
1299}
1300
Guido van Rossumd57fd912000-03-10 22:53:23 +00001301PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001302 Py_ssize_t size,
1303 const char *encoding,
1304 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001305{
1306 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001307
Guido van Rossumd57fd912000-03-10 22:53:23 +00001308 unicode = PyUnicode_FromUnicode(s, size);
1309 if (unicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001310 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001311 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1312 Py_DECREF(unicode);
1313 return v;
1314}
1315
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001316PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1317 const char *encoding,
1318 const char *errors)
1319{
1320 PyObject *v;
1321
1322 if (!PyUnicode_Check(unicode)) {
1323 PyErr_BadArgument();
1324 goto onError;
1325 }
1326
1327 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001328 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001329
1330 /* Encode via the codec registry */
1331 v = PyCodec_Encode(unicode, encoding, errors);
1332 if (v == NULL)
1333 goto onError;
1334 return v;
1335
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001336 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001337 return NULL;
1338}
1339
Guido van Rossumd57fd912000-03-10 22:53:23 +00001340PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1341 const char *encoding,
1342 const char *errors)
1343{
1344 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001345
Guido van Rossumd57fd912000-03-10 22:53:23 +00001346 if (!PyUnicode_Check(unicode)) {
1347 PyErr_BadArgument();
1348 goto onError;
1349 }
Fred Drakee4315f52000-05-09 19:53:39 +00001350
Tim Petersced69f82003-09-16 20:30:58 +00001351 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001352 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001353
1354 /* Shortcuts for common default encodings */
1355 if (errors == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001356 if (strcmp(encoding, "utf-8") == 0)
1357 return PyUnicode_AsUTF8String(unicode);
1358 else if (strcmp(encoding, "latin-1") == 0)
1359 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001360#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001361 else if (strcmp(encoding, "mbcs") == 0)
1362 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001363#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001364 else if (strcmp(encoding, "ascii") == 0)
1365 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001366 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001367
1368 /* Encode via the codec registry */
1369 v = PyCodec_Encode(unicode, encoding, errors);
1370 if (v == NULL)
1371 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001372 if (!PyString_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001373 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001374 "encoder did not return a string object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001375 Py_TYPE(v)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001376 Py_DECREF(v);
1377 goto onError;
1378 }
1379 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001380
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001381 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001382 return NULL;
1383}
1384
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001385PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001386 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001387{
1388 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1389
1390 if (v)
1391 return v;
1392 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1393 if (v && errors == NULL)
1394 ((PyUnicodeObject *)unicode)->defenc = v;
1395 return v;
1396}
1397
Guido van Rossumd57fd912000-03-10 22:53:23 +00001398Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1399{
1400 if (!PyUnicode_Check(unicode)) {
1401 PyErr_BadArgument();
1402 goto onError;
1403 }
1404 return PyUnicode_AS_UNICODE(unicode);
1405
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001406 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001407 return NULL;
1408}
1409
Martin v. Löwis18e16552006-02-15 17:27:45 +00001410Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001411{
1412 if (!PyUnicode_Check(unicode)) {
1413 PyErr_BadArgument();
1414 goto onError;
1415 }
1416 return PyUnicode_GET_SIZE(unicode);
1417
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001418 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001419 return -1;
1420}
1421
Thomas Wouters78890102000-07-22 19:25:51 +00001422const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001423{
1424 return unicode_default_encoding;
1425}
1426
1427int PyUnicode_SetDefaultEncoding(const char *encoding)
1428{
1429 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001430
Fred Drakee4315f52000-05-09 19:53:39 +00001431 /* Make sure the encoding is valid. As side effect, this also
1432 loads the encoding into the codec registry cache. */
1433 v = _PyCodec_Lookup(encoding);
1434 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001435 goto onError;
Fred Drakee4315f52000-05-09 19:53:39 +00001436 Py_DECREF(v);
1437 strncpy(unicode_default_encoding,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001438 encoding,
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02001439 sizeof(unicode_default_encoding) - 1);
Fred Drakee4315f52000-05-09 19:53:39 +00001440 return 0;
1441
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001442 onError:
Fred Drakee4315f52000-05-09 19:53:39 +00001443 return -1;
1444}
1445
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001446/* error handling callback helper:
1447 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001448 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001449 and adjust various state variables.
1450 return 0 on success, -1 on error
1451*/
1452
1453static
1454int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001455 const char *encoding, const char *reason,
1456 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1457 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1458 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001459{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001460 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001461
1462 PyObject *restuple = NULL;
1463 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001464 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1465 Py_ssize_t requiredsize;
1466 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001467 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001468 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001469 int res = -1;
1470
1471 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001472 *errorHandler = PyCodec_LookupError(errors);
1473 if (*errorHandler == NULL)
1474 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001475 }
1476
1477 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001478 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001479 encoding, input, insize, *startinpos, *endinpos, reason);
1480 if (*exceptionObject == NULL)
1481 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001482 }
1483 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001484 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1485 goto onError;
1486 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1487 goto onError;
1488 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1489 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001490 }
1491
1492 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1493 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001494 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001495 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00001496 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001497 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001498 }
1499 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001500 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001501 if (newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001502 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001503 if (newpos<0 || newpos>insize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001504 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1505 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001506 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001507
1508 /* need more space? (at least enough for what we
1509 have+the replacement+the rest of the string (starting
1510 at the new input position), so we won't have to check space
1511 when there are no errors in the rest of the string) */
1512 repptr = PyUnicode_AS_UNICODE(repunicode);
1513 repsize = PyUnicode_GET_SIZE(repunicode);
Benjamin Petersoned4c1302014-09-29 18:18:57 -04001514 requiredsize = *outpos;
1515 if (requiredsize > PY_SSIZE_T_MAX - repsize)
1516 goto overflow;
1517 requiredsize += repsize;
1518 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
1519 goto overflow;
1520 requiredsize += insize - newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001521 if (requiredsize > outsize) {
Benjamin Petersoned4c1302014-09-29 18:18:57 -04001522 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001523 requiredsize = 2*outsize;
1524 if (_PyUnicode_Resize(output, requiredsize) < 0)
1525 goto onError;
1526 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001527 }
1528 *endinpos = newpos;
1529 *inptr = input + newpos;
1530 Py_UNICODE_COPY(*outptr, repptr, repsize);
1531 *outptr += repsize;
1532 *outpos += repsize;
1533 /* we made it! */
1534 res = 0;
1535
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001536 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001537 Py_XDECREF(restuple);
1538 return res;
Benjamin Petersoned4c1302014-09-29 18:18:57 -04001539
1540 overflow:
1541 PyErr_SetString(PyExc_OverflowError,
1542 "decoded result is too long for a Python string");
1543 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001544}
1545
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001546/* --- UTF-7 Codec -------------------------------------------------------- */
1547
Antoine Pitrou653dece2009-05-04 18:32:32 +00001548/* See RFC2152 for details. We encode conservatively and decode liberally. */
1549
1550/* Three simple macros defining base-64. */
1551
/* Is c a base-64 character (one of A-Z, a-z, 0-9, '+' or '/')?
 *
 * Explicit ASCII range tests are used rather than isalnum(): isalnum()
 * depends on the current locale, so under a non-"C" locale it may report
 * bytes >= 0x80 as alphanumeric, which would let such bytes through as
 * base-64 digits (FROM_BASE64 maps anything it does not recognise to 63).
 * The argument is evaluated several times, so it must not have side
 * effects. */

#define IS_BASE64(c)                    \
    (((c) >= 'A' && (c) <= 'Z') ||      \
     ((c) >= 'a' && (c) <= 'z') ||      \
     ((c) >= '0' && (c) <= '9') ||      \
     (c) == '+' || (c) == '/')
1556
/* Map the base-64 character c to its 6-bit value (0..63).  The caller is
 * expected to have checked c with IS_BASE64 first: anything that is not a
 * letter, a digit or '+' is reported as 63, the value of '/'. */

#define FROM_BASE64(c)                                  \
    ((c) == '+' ? 62 :                                  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     63)
1564
/* Base-64 character that encodes the low 6 bits of n; any higher bits of
 * n are ignored. */

#define TO_BASE64(n)                    \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"       \
     "abcdefghijklmnopqrstuvwxyz"       \
     "0123456789+/"[(n) & 0x3f])
1569
/* DECODE_DIRECT: true when byte c, met in a UTF-7 string, decodes as
 * itself.  Decoding is deliberately permissive: every ASCII byte
 * qualifies except '+', which opens a base64 string. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) < 128)
1577
/* RFC2152 sorts the ASCII characters into classes that the UTF-7 encoder
 * treats differently.  utf7_category[c] is the class of ASCII code c:
 *   0 : "Set D"
 *       alphanumerics and ' ( ) , - . / : ?
 *   1 : "Set O"
 *       ! " # $ % & * ; < = > @ [ ] ^ _ ` { | }
 *   2 : "whitespace"
 *       ht nl cr sp
 *   3 : special (must be base64 encoded)
 *       everything else, i.e. '+', backslash, '~' and the non-printing
 *       codes 0-8, 11-12, 14-31 and 127
 */

static
char utf7_category[128] = {
    3, 3, 3, 3, 3, 3, 3, 3,     /* 0x00-0x07: nul soh stx etx eot enq ack bel */
    3, 2, 2, 3, 3, 2, 3, 3,     /* 0x08-0x0F: bs ht nl vt np cr so si */
    3, 3, 3, 3, 3, 3, 3, 3,     /* 0x10-0x17: dle dc1 dc2 dc3 dc4 nak syn etb */
    3, 3, 3, 3, 3, 3, 3, 3,     /* 0x18-0x1F: can em sub esc fs gs rs us */
    2, 1, 1, 1, 1, 1, 1, 0,     /* 0x20-0x27: sp ! " # $ % & ' */
    0, 0, 1, 3, 0, 0, 0, 0,     /* 0x28-0x2F: ( ) * + , - . / */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x30-0x37: 0 1 2 3 4 5 6 7 */
    0, 0, 0, 1, 1, 1, 1, 0,     /* 0x38-0x3F: 8 9 : ; < = > ? */
    1, 0, 0, 0, 0, 0, 0, 0,     /* 0x40-0x47: @ A B C D E F G */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x48-0x4F: H I J K L M N O */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x50-0x57: P Q R S T U V W */
    0, 0, 0, 1, 3, 1, 1, 1,     /* 0x58-0x5F: X Y Z [ backslash ] ^ _ */
    1, 0, 0, 0, 0, 0, 0, 0,     /* 0x60-0x67: ` a b c d e f g */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x68-0x6F: h i j k l m n o */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x70-0x77: p q r s t u v w */
    0, 0, 0, 1, 1, 1, 3, 3,     /* 0x78-0x7F: x y z { | } ~ del */
};
1611
/* ENCODE_DIRECT: should character c be written to the output as itself
 * instead of being base64 encoded?  Only codes 1..127 can qualify.  Set D
 * characters (category 0) always do; whitespace (category 2) does when
 * directWS is true and Set O (category 1) does when directO is true.
 * RFC2152 makes it clear that these two choices vary between
 * applications, so they are left to the caller.
 *
 * The flag arguments are parenthesised in the expansion so that a
 * compound expression such as `a || b` is taken as a whole.  c is
 * evaluated several times and must not have side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001623
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001624PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001625 Py_ssize_t size,
1626 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001627{
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001628 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1629}
1630
Antoine Pitrou653dece2009-05-04 18:32:32 +00001631/* The decoder. The only state we preserve is our read position,
1632 * i.e. how many characters we have consumed. So if we end in the
1633 * middle of a shift sequence we have to back off the read position
1634 * and the output to the beginning of the sequence, otherwise we lose
1635 * all the shift state (seen bits, number of bits seen, high
1636 * surrogate). */
1637
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001638PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001639 Py_ssize_t size,
1640 const char *errors,
1641 Py_ssize_t *consumed)
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001642{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001643 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001644 Py_ssize_t startinpos;
1645 Py_ssize_t endinpos;
1646 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001647 const char *e;
1648 PyUnicodeObject *unicode;
1649 Py_UNICODE *p;
1650 const char *errmsg = "";
1651 int inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001652 Py_UNICODE *shiftOutStart;
1653 unsigned int base64bits = 0;
1654 unsigned long base64buffer = 0;
1655 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001656 PyObject *errorHandler = NULL;
1657 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001658
1659 unicode = _PyUnicode_New(size);
1660 if (!unicode)
1661 return NULL;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001662 if (size == 0) {
1663 if (consumed)
1664 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001665 return (PyObject *)unicode;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001666 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001667
1668 p = unicode->str;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001669 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001670 e = s + size;
1671
1672 while (s < e) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001673 Py_UNICODE ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001674
Antoine Pitrou653dece2009-05-04 18:32:32 +00001675 if (inShift) { /* in a base-64 section */
1676 if (IS_BASE64(ch)) { /* consume a base-64 character */
1677 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1678 base64bits += 6;
1679 s++;
1680 if (base64bits >= 16) {
1681 /* we have enough bits for a UTF-16 value */
1682 Py_UNICODE outCh = (Py_UNICODE)
1683 (base64buffer >> (base64bits-16));
1684 base64bits -= 16;
1685 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchakaf1056722013-10-19 20:37:49 +03001686 assert(outCh <= 0xffff);
Antoine Pitrou653dece2009-05-04 18:32:32 +00001687 if (surrogate) {
1688 /* expecting a second surrogate */
1689 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1690#ifdef Py_UNICODE_WIDE
1691 *p++ = (((surrogate & 0x3FF)<<10)
1692 | (outCh & 0x3FF)) + 0x10000;
1693#else
1694 *p++ = surrogate;
1695 *p++ = outCh;
1696#endif
1697 surrogate = 0;
Antoine Pitrou30402542011-11-15 01:49:40 +01001698 continue;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001699 }
1700 else {
Antoine Pitrou30402542011-11-15 01:49:40 +01001701 *p++ = surrogate;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001702 surrogate = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001703 }
1704 }
Antoine Pitrou30402542011-11-15 01:49:40 +01001705 if (outCh >= 0xD800 && outCh <= 0xDBFF) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001706 /* first surrogate */
1707 surrogate = outCh;
1708 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001709 else {
1710 *p++ = outCh;
1711 }
1712 }
1713 }
1714 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001715 inShift = 0;
1716 s++;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001717 if (surrogate) {
Antoine Pitrou30402542011-11-15 01:49:40 +01001718 *p++ = surrogate;
1719 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001720 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001721 if (base64bits > 0) { /* left-over bits */
1722 if (base64bits >= 6) {
1723 /* We've seen at least one base-64 character */
1724 errmsg = "partial character in shift sequence";
1725 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001726 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001727 else {
1728 /* Some bits remain; they should be zero */
1729 if (base64buffer != 0) {
1730 errmsg = "non-zero padding bits in shift sequence";
1731 goto utf7Error;
1732 }
1733 }
1734 }
1735 if (ch != '-') {
1736 /* '-' is absorbed; other terminating
1737 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001738 *p++ = ch;
1739 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001740 }
1741 }
1742 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001743 startinpos = s-starts;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001744 s++; /* consume '+' */
1745 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001746 s++;
1747 *p++ = '+';
Antoine Pitrou653dece2009-05-04 18:32:32 +00001748 }
1749 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001750 inShift = 1;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001751 shiftOutStart = p;
1752 base64bits = 0;
Serhiy Storchakaf1056722013-10-19 20:37:49 +03001753 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001754 }
1755 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001756 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001757 *p++ = ch;
1758 s++;
1759 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001760 else {
1761 startinpos = s-starts;
1762 s++;
1763 errmsg = "unexpected special character";
1764 goto utf7Error;
1765 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001766 continue;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001767utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001768 outpos = p-PyUnicode_AS_UNICODE(unicode);
1769 endinpos = s-starts;
1770 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001771 errors, &errorHandler,
1772 "utf7", errmsg,
1773 starts, size, &startinpos, &endinpos, &exc, &s,
1774 &unicode, &outpos, &p))
1775 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001776 }
1777
Antoine Pitrou653dece2009-05-04 18:32:32 +00001778 /* end of string */
1779
1780 if (inShift && !consumed) { /* in shift sequence, no more to follow */
1781 /* if we're in an inconsistent state, that's an error */
1782 if (surrogate ||
1783 (base64bits >= 6) ||
1784 (base64bits > 0 && base64buffer != 0)) {
1785 outpos = p-PyUnicode_AS_UNICODE(unicode);
1786 endinpos = size;
1787 if (unicode_decode_call_errorhandler(
1788 errors, &errorHandler,
1789 "utf7", "unterminated shift sequence",
1790 starts, size, &startinpos, &endinpos, &exc, &s,
1791 &unicode, &outpos, &p))
1792 goto onError;
1793 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001794 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001795
1796 /* return state */
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001797 if (consumed) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001798 if (inShift) {
1799 p = shiftOutStart; /* back off output */
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001800 *consumed = startinpos;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001801 }
1802 else {
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001803 *consumed = s-starts;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001804 }
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001805 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001806
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001807 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001808 goto onError;
1809
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001810 Py_XDECREF(errorHandler);
1811 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001812 return (PyObject *)unicode;
1813
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001814 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001815 Py_XDECREF(errorHandler);
1816 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001817 Py_DECREF(unicode);
1818 return NULL;
1819}
1820
1821
1822PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001823 Py_ssize_t size,
Antoine Pitrou653dece2009-05-04 18:32:32 +00001824 int base64SetO,
1825 int base64WhiteSpace,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001826 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001827{
1828 PyObject *v;
1829 /* It might be possible to tighten this worst case */
Alexandre Vassalottifd009162009-07-07 02:17:30 +00001830 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001831 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001832 Py_ssize_t i = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001833 unsigned int base64bits = 0;
1834 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001835 char * out;
1836 char * start;
1837
Alexandre Vassalottifd009162009-07-07 02:17:30 +00001838 if (allocated / 8 != size)
Neal Norwitze7d8be82008-07-31 17:17:14 +00001839 return PyErr_NoMemory();
1840
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001841 if (size == 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00001842 return PyString_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001843
Antoine Pitrou653dece2009-05-04 18:32:32 +00001844 v = PyString_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001845 if (v == NULL)
1846 return NULL;
1847
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001848 start = out = PyString_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001849 for (;i < size; ++i) {
1850 Py_UNICODE ch = s[i];
1851
Antoine Pitrou653dece2009-05-04 18:32:32 +00001852 if (inShift) {
1853 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1854 /* shifting out */
1855 if (base64bits) { /* output remaining bits */
1856 *out++ = TO_BASE64(base64buffer << (6-base64bits));
1857 base64buffer = 0;
1858 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001859 }
1860 inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001861 /* Characters not in the BASE64 set implicitly unshift the sequence
1862 so no '-' is required, except if the character is itself a '-' */
1863 if (IS_BASE64(ch) || ch == '-') {
1864 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001865 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001866 *out++ = (char) ch;
1867 }
1868 else {
1869 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00001870 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001871 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001872 else { /* not in a shift sequence */
1873 if (ch == '+') {
1874 *out++ = '+';
1875 *out++ = '-';
1876 }
1877 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1878 *out++ = (char) ch;
1879 }
1880 else {
1881 *out++ = '+';
1882 inShift = 1;
1883 goto encode_char;
1884 }
1885 }
1886 continue;
1887encode_char:
1888#ifdef Py_UNICODE_WIDE
1889 if (ch >= 0x10000) {
1890 /* code first surrogate */
1891 base64bits += 16;
1892 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
1893 while (base64bits >= 6) {
1894 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1895 base64bits -= 6;
1896 }
1897 /* prepare second surrogate */
1898 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
1899 }
1900#endif
1901 base64bits += 16;
1902 base64buffer = (base64buffer << 16) | ch;
1903 while (base64bits >= 6) {
1904 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1905 base64bits -= 6;
1906 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001907 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001908 if (base64bits)
1909 *out++= TO_BASE64(base64buffer << (6-base64bits) );
1910 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001911 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001912
Benjamin Petersonbea424a2010-04-03 00:57:33 +00001913 if (_PyString_Resize(&v, out - start))
1914 return NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001915 return v;
1916}
1917
/* The UTF-7 helper macros are not needed past this point. */
#undef ENCODE_DIRECT
#undef DECODE_DIRECT
#undef TO_BASE64
#undef FROM_BASE64
#undef IS_BASE64
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001923
Guido van Rossumd57fd912000-03-10 22:53:23 +00001924/* --- UTF-8 Codec -------------------------------------------------------- */
1925
/* utf8_code_length[b] is the length, in bytes, of the UTF-8 sequence that
   starts with prefix byte b.  Zero means b is an illegal prefix.
   See RFC 3629 for details. */
static
char utf8_code_length[256] = {
    /* 00-7F: single-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    /* 80-BF: illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    /* C0-C1 are illegal; C2-DF start two-byte sequences */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    /* E0-EF start three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    /* F0-F4 start four-byte sequences; F5-FF are illegal */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-FF */
};
1947
Guido van Rossumd57fd912000-03-10 22:53:23 +00001948PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001949 Py_ssize_t size,
1950 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001951{
Walter Dörwald69652032004-09-07 20:24:22 +00001952 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1953}
1954
1955PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001956 Py_ssize_t size,
1957 const char *errors,
1958 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001959{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001960 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001961 int n;
Ezio Melottie57e50c2010-06-05 17:51:07 +00001962 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001963 Py_ssize_t startinpos;
1964 Py_ssize_t endinpos;
1965 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001966 const char *e;
1967 PyUnicodeObject *unicode;
1968 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001969 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001970 PyObject *errorHandler = NULL;
1971 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001972
1973 /* Note: size will always be longer than the resulting Unicode
1974 character count */
1975 unicode = _PyUnicode_New(size);
1976 if (!unicode)
1977 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001978 if (size == 0) {
1979 if (consumed)
1980 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001981 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001982 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001983
1984 /* Unpack UTF-8 encoded data */
1985 p = unicode->str;
1986 e = s + size;
1987
1988 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001989 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001990
1991 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001992 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001993 s++;
1994 continue;
1995 }
1996
1997 n = utf8_code_length[ch];
1998
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001999 if (s + n > e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002000 if (consumed)
2001 break;
2002 else {
2003 errmsg = "unexpected end of data";
2004 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00002005 endinpos = startinpos+1;
2006 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2007 endinpos++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002008 goto utf8Error;
2009 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00002010 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002011
2012 switch (n) {
2013
2014 case 0:
Ezio Melottie57e50c2010-06-05 17:51:07 +00002015 errmsg = "invalid start byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002016 startinpos = s-starts;
2017 endinpos = startinpos+1;
2018 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002019
2020 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002021 errmsg = "internal error";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002022 startinpos = s-starts;
2023 endinpos = startinpos+1;
2024 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002025
2026 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002027 if ((s[1] & 0xc0) != 0x80) {
Ezio Melottie57e50c2010-06-05 17:51:07 +00002028 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002029 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00002030 endinpos = startinpos + 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002031 goto utf8Error;
2032 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002033 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melottie57e50c2010-06-05 17:51:07 +00002034 assert ((ch > 0x007F) && (ch <= 0x07FF));
2035 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002036 break;
2037
2038 case 3:
Ezio Melottie57e50c2010-06-05 17:51:07 +00002039 /* XXX: surrogates shouldn't be valid UTF-8!
2040 see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2041 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
2042 Uncomment the 2 lines below to make them invalid,
Serhiy Storchakae8c9e142015-01-18 11:42:50 +02002043 code points: d800-dfff; UTF-8: \xed\xa0\x80-\xed\xbf\xbf. */
Tim Petersced69f82003-09-16 20:30:58 +00002044 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melottie57e50c2010-06-05 17:51:07 +00002045 (s[2] & 0xc0) != 0x80 ||
2046 ((unsigned char)s[0] == 0xE0 &&
2047 (unsigned char)s[1] < 0xA0)/* ||
2048 ((unsigned char)s[0] == 0xED &&
2049 (unsigned char)s[1] > 0x9F)*/) {
2050 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002051 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00002052 endinpos = startinpos + 1;
2053
2054 /* if s[1] first two bits are 1 and 0, then the invalid
2055 continuation byte is s[2], so increment endinpos by 1,
2056 if not, s[1] is invalid and endinpos doesn't need to
2057 be incremented. */
2058 if ((s[1] & 0xC0) == 0x80)
2059 endinpos++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002060 goto utf8Error;
2061 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002062 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melottie57e50c2010-06-05 17:51:07 +00002063 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2064 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002065 break;
2066
2067 case 4:
2068 if ((s[1] & 0xc0) != 0x80 ||
2069 (s[2] & 0xc0) != 0x80 ||
Ezio Melottie57e50c2010-06-05 17:51:07 +00002070 (s[3] & 0xc0) != 0x80 ||
2071 ((unsigned char)s[0] == 0xF0 &&
2072 (unsigned char)s[1] < 0x90) ||
2073 ((unsigned char)s[0] == 0xF4 &&
2074 (unsigned char)s[1] > 0x8F)) {
2075 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002076 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00002077 endinpos = startinpos + 1;
2078 if ((s[1] & 0xC0) == 0x80) {
2079 endinpos++;
2080 if ((s[2] & 0xC0) == 0x80)
2081 endinpos++;
2082 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002083 goto utf8Error;
2084 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002085 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melottie57e50c2010-06-05 17:51:07 +00002086 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2087 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2088
Fredrik Lundh8f455852001-06-27 18:59:43 +00002089#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002090 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002091#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002092 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002093
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002094 /* translate from 10000..10FFFF to 0..FFFF */
2095 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002096
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002097 /* high surrogate = top 10 bits added to D800 */
2098 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002099
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002100 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002101 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002102#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002103 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002104 }
2105 s += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002106 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002107
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002108 utf8Error:
2109 outpos = p-PyUnicode_AS_UNICODE(unicode);
2110 if (unicode_decode_call_errorhandler(
2111 errors, &errorHandler,
2112 "utf8", errmsg,
2113 starts, size, &startinpos, &endinpos, &exc, &s,
2114 &unicode, &outpos, &p))
2115 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002116 }
Walter Dörwald69652032004-09-07 20:24:22 +00002117 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002118 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002119
2120 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002121 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002122 goto onError;
2123
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002124 Py_XDECREF(errorHandler);
2125 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002126 return (PyObject *)unicode;
2127
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002128 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002129 Py_XDECREF(errorHandler);
2130 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002131 Py_DECREF(unicode);
2132 return NULL;
2133}
2134
Tim Peters602f7402002-04-27 18:03:26 +00002135/* Allocation strategy: if the string is short, convert into a stack buffer
2136 and allocate exactly as much space needed at the end. Else allocate the
2137 maximum possible needed (4 result bytes per Unicode character), and return
2138 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002139*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002140PyObject *
2141PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002142 Py_ssize_t size,
2143 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002144{
Tim Peters602f7402002-04-27 18:03:26 +00002145#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002146
Martin v. Löwis18e16552006-02-15 17:27:45 +00002147 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00002148 PyObject *v; /* result string object */
2149 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002150 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002151 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002152 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002153
Tim Peters602f7402002-04-27 18:03:26 +00002154 assert(s != NULL);
2155 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002156
Tim Peters602f7402002-04-27 18:03:26 +00002157 if (size <= MAX_SHORT_UNICHARS) {
2158 /* Write into the stack buffer; nallocated can't overflow.
2159 * At the end, we'll allocate exactly as much heap space as it
2160 * turns out we need.
2161 */
2162 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2163 v = NULL; /* will allocate after we're done */
2164 p = stackbuf;
2165 }
2166 else {
2167 /* Overallocate on the heap, and give the excess back at the end. */
2168 nallocated = size * 4;
2169 if (nallocated / 4 != size) /* overflow! */
2170 return PyErr_NoMemory();
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002171 v = PyString_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00002172 if (v == NULL)
2173 return NULL;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002174 p = PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002175 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002176
Tim Peters602f7402002-04-27 18:03:26 +00002177 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002178 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002179
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002180 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002181 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002182 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002183
Guido van Rossumd57fd912000-03-10 22:53:23 +00002184 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002185 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002186 *p++ = (char)(0xc0 | (ch >> 6));
2187 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002188 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002189 else {
Tim Peters602f7402002-04-27 18:03:26 +00002190 /* Encode UCS2 Unicode ordinals */
2191 if (ch < 0x10000) {
2192 /* Special case: check for high surrogate */
2193 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2194 Py_UCS4 ch2 = s[i];
2195 /* Check for low surrogate and combine the two to
2196 form a UCS4 value */
2197 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002198 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002199 i++;
2200 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002201 }
Tim Peters602f7402002-04-27 18:03:26 +00002202 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002203 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002204 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002205 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2206 *p++ = (char)(0x80 | (ch & 0x3f));
2207 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00002208 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002209 encodeUCS4:
Tim Peters602f7402002-04-27 18:03:26 +00002210 /* Encode UCS4 Unicode ordinals */
2211 *p++ = (char)(0xf0 | (ch >> 18));
2212 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2213 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2214 *p++ = (char)(0x80 | (ch & 0x3f));
2215 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002216 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002217
Tim Peters602f7402002-04-27 18:03:26 +00002218 if (v == NULL) {
2219 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002220 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002221 assert(nneeded <= nallocated);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002222 v = PyString_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002223 }
2224 else {
Benjamin Peterson857ce152009-01-31 16:29:18 +00002225 /* Cut back to size actually needed. */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002226 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002227 assert(nneeded <= nallocated);
Benjamin Petersonbea424a2010-04-03 00:57:33 +00002228 if (_PyString_Resize(&v, nneeded))
2229 return NULL;
Tim Peters602f7402002-04-27 18:03:26 +00002230 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002231 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002232
Tim Peters602f7402002-04-27 18:03:26 +00002233#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002234}
2235
Guido van Rossumd57fd912000-03-10 22:53:23 +00002236PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2237{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002238 if (!PyUnicode_Check(unicode)) {
2239 PyErr_BadArgument();
2240 return NULL;
2241 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002242 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002243 PyUnicode_GET_SIZE(unicode),
2244 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002245}
2246
Walter Dörwald6e390802007-08-17 16:41:28 +00002247/* --- UTF-32 Codec ------------------------------------------------------- */
2248
2249PyObject *
2250PyUnicode_DecodeUTF32(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002251 Py_ssize_t size,
2252 const char *errors,
2253 int *byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002254{
2255 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2256}
2257
2258PyObject *
2259PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002260 Py_ssize_t size,
2261 const char *errors,
2262 int *byteorder,
2263 Py_ssize_t *consumed)
Walter Dörwald6e390802007-08-17 16:41:28 +00002264{
2265 const char *starts = s;
2266 Py_ssize_t startinpos;
2267 Py_ssize_t endinpos;
2268 Py_ssize_t outpos;
2269 PyUnicodeObject *unicode;
2270 Py_UNICODE *p;
2271#ifndef Py_UNICODE_WIDE
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002272 int pairs = 0;
Benjamin Peterson8e5effa2010-06-12 17:47:06 +00002273 const unsigned char *qq;
Walter Dörwald6e390802007-08-17 16:41:28 +00002274#else
2275 const int pairs = 0;
2276#endif
Benjamin Peterson8e5effa2010-06-12 17:47:06 +00002277 const unsigned char *q, *e;
Walter Dörwald6e390802007-08-17 16:41:28 +00002278 int bo = 0; /* assume native ordering by default */
2279 const char *errmsg = "";
Walter Dörwald20b40d32007-08-17 16:52:50 +00002280 /* Offsets from q for retrieving bytes in the right order. */
2281#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2282 int iorder[] = {0, 1, 2, 3};
2283#else
2284 int iorder[] = {3, 2, 1, 0};
2285#endif
Walter Dörwald9ab80a92007-08-17 16:58:43 +00002286 PyObject *errorHandler = NULL;
2287 PyObject *exc = NULL;
Chris Jerdonekad4b0002012-10-07 20:37:54 -07002288
Walter Dörwald6e390802007-08-17 16:41:28 +00002289 q = (unsigned char *)s;
2290 e = q + size;
2291
2292 if (byteorder)
2293 bo = *byteorder;
2294
2295 /* Check for BOM marks (U+FEFF) in the input and adjust current
2296 byte order setting accordingly. In native mode, the leading BOM
2297 mark is skipped, in all other modes, it is copied to the output
2298 stream as-is (giving a ZWNBSP character). */
2299 if (bo == 0) {
2300 if (size >= 4) {
2301 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002302 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002303#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002304 if (bom == 0x0000FEFF) {
2305 q += 4;
2306 bo = -1;
2307 }
2308 else if (bom == 0xFFFE0000) {
2309 q += 4;
2310 bo = 1;
2311 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002312#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002313 if (bom == 0x0000FEFF) {
2314 q += 4;
2315 bo = 1;
2316 }
2317 else if (bom == 0xFFFE0000) {
2318 q += 4;
2319 bo = -1;
2320 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002321#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002322 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002323 }
2324
2325 if (bo == -1) {
2326 /* force LE */
2327 iorder[0] = 0;
2328 iorder[1] = 1;
2329 iorder[2] = 2;
2330 iorder[3] = 3;
2331 }
2332 else if (bo == 1) {
2333 /* force BE */
2334 iorder[0] = 3;
2335 iorder[1] = 2;
2336 iorder[2] = 1;
2337 iorder[3] = 0;
2338 }
2339
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002340 /* On narrow builds we split characters outside the BMP into two
Serhiy Storchakae8c9e142015-01-18 11:42:50 +02002341 code points => count how much extra space we need. */
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002342#ifndef Py_UNICODE_WIDE
Serhiy Storchakac9631a12013-01-08 22:43:18 +02002343 for (qq = q; e - qq >= 4; qq += 4)
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002344 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2345 pairs++;
2346#endif
2347
2348 /* This might be one to much, because of a BOM */
2349 unicode = _PyUnicode_New((size+3)/4+pairs);
2350 if (!unicode)
2351 return NULL;
2352 if (size == 0)
2353 return (PyObject *)unicode;
2354
2355 /* Unpack UTF-32 encoded data */
2356 p = unicode->str;
2357
Walter Dörwald6e390802007-08-17 16:41:28 +00002358 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002359 Py_UCS4 ch;
2360 /* remaining bytes at the end? (size should be divisible by 4) */
2361 if (e-q<4) {
2362 if (consumed)
2363 break;
2364 errmsg = "truncated data";
2365 startinpos = ((const char *)q)-starts;
2366 endinpos = ((const char *)e)-starts;
2367 goto utf32Error;
2368 /* The remaining input chars are ignored if the callback
2369 chooses to skip the input */
2370 }
2371 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2372 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002373
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002374 if (ch >= 0x110000)
2375 {
Serhiy Storchakae8c9e142015-01-18 11:42:50 +02002376 errmsg = "code point not in range(0x110000)";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002377 startinpos = ((const char *)q)-starts;
2378 endinpos = startinpos+4;
2379 goto utf32Error;
2380 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002381#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002382 if (ch >= 0x10000)
2383 {
2384 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2385 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2386 }
2387 else
Walter Dörwald6e390802007-08-17 16:41:28 +00002388#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002389 *p++ = ch;
2390 q += 4;
2391 continue;
2392 utf32Error:
2393 outpos = p-PyUnicode_AS_UNICODE(unicode);
2394 if (unicode_decode_call_errorhandler(
2395 errors, &errorHandler,
2396 "utf32", errmsg,
Georg Brandle9741f32009-09-17 11:28:09 +00002397 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002398 &unicode, &outpos, &p))
2399 goto onError;
Walter Dörwald6e390802007-08-17 16:41:28 +00002400 }
2401
2402 if (byteorder)
2403 *byteorder = bo;
2404
2405 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002406 *consumed = (const char *)q-starts;
Walter Dörwald6e390802007-08-17 16:41:28 +00002407
2408 /* Adjust length */
2409 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2410 goto onError;
2411
2412 Py_XDECREF(errorHandler);
2413 Py_XDECREF(exc);
2414 return (PyObject *)unicode;
2415
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002416 onError:
Walter Dörwald6e390802007-08-17 16:41:28 +00002417 Py_DECREF(unicode);
2418 Py_XDECREF(errorHandler);
2419 Py_XDECREF(exc);
2420 return NULL;
2421}
2422
2423PyObject *
2424PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002425 Py_ssize_t size,
2426 const char *errors,
2427 int byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002428{
2429 PyObject *v;
2430 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002431 Py_ssize_t nsize, bytesize;
Walter Dörwald6e390802007-08-17 16:41:28 +00002432#ifndef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002433 Py_ssize_t i, pairs;
Walter Dörwald6e390802007-08-17 16:41:28 +00002434#else
2435 const int pairs = 0;
2436#endif
2437 /* Offsets from p for storing byte pairs in the right order. */
2438#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2439 int iorder[] = {0, 1, 2, 3};
2440#else
2441 int iorder[] = {3, 2, 1, 0};
2442#endif
2443
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002444#define STORECHAR(CH) \
2445 do { \
2446 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2447 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2448 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2449 p[iorder[0]] = (CH) & 0xff; \
2450 p += 4; \
Walter Dörwald6e390802007-08-17 16:41:28 +00002451 } while(0)
2452
Serhiy Storchakae8c9e142015-01-18 11:42:50 +02002453 /* In narrow builds we can output surrogate pairs as one code point,
Walter Dörwald6e390802007-08-17 16:41:28 +00002454 so we need less space. */
2455#ifndef Py_UNICODE_WIDE
2456 for (i = pairs = 0; i < size-1; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002457 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2458 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2459 pairs++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002460#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002461 nsize = (size - pairs + (byteorder == 0));
2462 bytesize = nsize * 4;
2463 if (bytesize / 4 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002464 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002465 v = PyString_FromStringAndSize(NULL, bytesize);
Walter Dörwald6e390802007-08-17 16:41:28 +00002466 if (v == NULL)
2467 return NULL;
2468
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002469 p = (unsigned char *)PyString_AS_STRING(v);
Walter Dörwald6e390802007-08-17 16:41:28 +00002470 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002471 STORECHAR(0xFEFF);
Walter Dörwald6e390802007-08-17 16:41:28 +00002472 if (size == 0)
2473 return v;
2474
2475 if (byteorder == -1) {
2476 /* force LE */
2477 iorder[0] = 0;
2478 iorder[1] = 1;
2479 iorder[2] = 2;
2480 iorder[3] = 3;
2481 }
2482 else if (byteorder == 1) {
2483 /* force BE */
2484 iorder[0] = 3;
2485 iorder[1] = 2;
2486 iorder[2] = 1;
2487 iorder[3] = 0;
2488 }
2489
2490 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002491 Py_UCS4 ch = *s++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002492#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002493 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2494 Py_UCS4 ch2 = *s;
2495 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2496 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2497 s++;
2498 size--;
2499 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00002500 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002501#endif
2502 STORECHAR(ch);
2503 }
2504 return v;
2505#undef STORECHAR
2506}
2507
2508PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2509{
2510 if (!PyUnicode_Check(unicode)) {
2511 PyErr_BadArgument();
2512 return NULL;
2513 }
2514 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002515 PyUnicode_GET_SIZE(unicode),
2516 NULL,
2517 0);
Walter Dörwald6e390802007-08-17 16:41:28 +00002518}
2519
Guido van Rossumd57fd912000-03-10 22:53:23 +00002520/* --- UTF-16 Codec ------------------------------------------------------- */
2521
Tim Peters772747b2001-08-09 22:21:55 +00002522PyObject *
2523PyUnicode_DecodeUTF16(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002524 Py_ssize_t size,
2525 const char *errors,
2526 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002527{
Walter Dörwald69652032004-09-07 20:24:22 +00002528 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2529}
2530
2531PyObject *
2532PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002533 Py_ssize_t size,
2534 const char *errors,
2535 int *byteorder,
2536 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002537{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002538 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002539 Py_ssize_t startinpos;
2540 Py_ssize_t endinpos;
2541 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002542 PyUnicodeObject *unicode;
2543 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002544 const unsigned char *q, *e;
2545 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002546 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002547 /* Offsets from q for retrieving byte pairs in the right order. */
2548#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2549 int ihi = 1, ilo = 0;
2550#else
2551 int ihi = 0, ilo = 1;
2552#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002553 PyObject *errorHandler = NULL;
2554 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002555
2556 /* Note: size will always be longer than the resulting Unicode
2557 character count */
2558 unicode = _PyUnicode_New(size);
2559 if (!unicode)
2560 return NULL;
2561 if (size == 0)
2562 return (PyObject *)unicode;
2563
2564 /* Unpack UTF-16 encoded data */
2565 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002566 q = (unsigned char *)s;
2567 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002568
2569 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002570 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002571
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002572 /* Check for BOM marks (U+FEFF) in the input and adjust current
2573 byte order setting accordingly. In native mode, the leading BOM
2574 mark is skipped, in all other modes, it is copied to the output
2575 stream as-is (giving a ZWNBSP character). */
2576 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002577 if (size >= 2) {
2578 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002579#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002580 if (bom == 0xFEFF) {
2581 q += 2;
2582 bo = -1;
2583 }
2584 else if (bom == 0xFFFE) {
2585 q += 2;
2586 bo = 1;
2587 }
Tim Petersced69f82003-09-16 20:30:58 +00002588#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002589 if (bom == 0xFEFF) {
2590 q += 2;
2591 bo = 1;
2592 }
2593 else if (bom == 0xFFFE) {
2594 q += 2;
2595 bo = -1;
2596 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002597#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002598 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002599 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002600
Tim Peters772747b2001-08-09 22:21:55 +00002601 if (bo == -1) {
2602 /* force LE */
2603 ihi = 1;
2604 ilo = 0;
2605 }
2606 else if (bo == 1) {
2607 /* force BE */
2608 ihi = 0;
2609 ilo = 1;
2610 }
2611
2612 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002613 Py_UNICODE ch;
2614 /* remaining bytes at the end? (size should be even) */
2615 if (e-q<2) {
2616 if (consumed)
2617 break;
2618 errmsg = "truncated data";
2619 startinpos = ((const char *)q)-starts;
2620 endinpos = ((const char *)e)-starts;
2621 goto utf16Error;
2622 /* The remaining input chars are ignored if the callback
2623 chooses to skip the input */
2624 }
2625 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002626
Benjamin Peterson857ce152009-01-31 16:29:18 +00002627 q += 2;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002628
2629 if (ch < 0xD800 || ch > 0xDFFF) {
2630 *p++ = ch;
2631 continue;
2632 }
2633
2634 /* UTF-16 code pair: */
Antoine Pitrou715a63b2012-07-21 00:52:06 +02002635 if (e - q < 2) {
Serhiy Storchakac4b82c02013-01-08 23:12:00 +02002636 q -= 2;
2637 if (consumed)
2638 break;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002639 errmsg = "unexpected end of data";
Serhiy Storchakac4b82c02013-01-08 23:12:00 +02002640 startinpos = ((const char *)q)-starts;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002641 endinpos = ((const char *)e)-starts;
2642 goto utf16Error;
2643 }
2644 if (0xD800 <= ch && ch <= 0xDBFF) {
2645 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2646 q += 2;
2647 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002648#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002649 *p++ = ch;
2650 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002651#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002652 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002653#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002654 continue;
2655 }
2656 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002657 errmsg = "illegal UTF-16 surrogate";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002658 startinpos = (((const char *)q)-4)-starts;
2659 endinpos = startinpos+2;
2660 goto utf16Error;
2661 }
2662
Benjamin Peterson857ce152009-01-31 16:29:18 +00002663 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002664 errmsg = "illegal encoding";
2665 startinpos = (((const char *)q)-2)-starts;
2666 endinpos = startinpos+2;
2667 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002668
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002669 utf16Error:
2670 outpos = p-PyUnicode_AS_UNICODE(unicode);
2671 if (unicode_decode_call_errorhandler(
2672 errors, &errorHandler,
2673 "utf16", errmsg,
2674 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2675 &unicode, &outpos, &p))
2676 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002677 }
2678
2679 if (byteorder)
2680 *byteorder = bo;
2681
Walter Dörwald69652032004-09-07 20:24:22 +00002682 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002683 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00002684
Guido van Rossumd57fd912000-03-10 22:53:23 +00002685 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002686 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002687 goto onError;
2688
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002689 Py_XDECREF(errorHandler);
2690 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002691 return (PyObject *)unicode;
2692
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002693 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002694 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002695 Py_XDECREF(errorHandler);
2696 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002697 return NULL;
2698}
2699
Tim Peters772747b2001-08-09 22:21:55 +00002700PyObject *
2701PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002702 Py_ssize_t size,
2703 const char *errors,
2704 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002705{
2706 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002707 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002708 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002709#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002710 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002711#else
2712 const int pairs = 0;
2713#endif
Tim Peters772747b2001-08-09 22:21:55 +00002714 /* Offsets from p for storing byte pairs in the right order. */
2715#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2716 int ihi = 1, ilo = 0;
2717#else
2718 int ihi = 0, ilo = 1;
2719#endif
2720
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002721#define STORECHAR(CH) \
2722 do { \
2723 p[ihi] = ((CH) >> 8) & 0xff; \
2724 p[ilo] = (CH) & 0xff; \
2725 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00002726 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002727
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002728#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002729 for (i = pairs = 0; i < size; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002730 if (s[i] >= 0x10000)
2731 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002732#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002733 /* 2 * (size + pairs + (byteorder == 0)) */
2734 if (size > PY_SSIZE_T_MAX ||
2735 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002736 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002737 nsize = size + pairs + (byteorder == 0);
2738 bytesize = nsize * 2;
2739 if (bytesize / 2 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002740 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002741 v = PyString_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002742 if (v == NULL)
2743 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002744
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002745 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002746 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002747 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002748 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002749 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002750
2751 if (byteorder == -1) {
2752 /* force LE */
2753 ihi = 1;
2754 ilo = 0;
2755 }
2756 else if (byteorder == 1) {
2757 /* force BE */
2758 ihi = 0;
2759 ilo = 1;
2760 }
2761
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002762 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002763 Py_UNICODE ch = *s++;
2764 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002765#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002766 if (ch >= 0x10000) {
2767 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2768 ch = 0xD800 | ((ch-0x10000) >> 10);
2769 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002770#endif
Tim Peters772747b2001-08-09 22:21:55 +00002771 STORECHAR(ch);
2772 if (ch2)
2773 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002774 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002775 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002776#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002777}
2778
2779PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2780{
2781 if (!PyUnicode_Check(unicode)) {
2782 PyErr_BadArgument();
2783 return NULL;
2784 }
2785 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002786 PyUnicode_GET_SIZE(unicode),
2787 NULL,
2788 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002789}
2790
2791/* --- Unicode Escape Codec ----------------------------------------------- */
2792
/* Cached pointer to the character-name lookup C API.  It starts out NULL
   and is filled in by the \N{name} escape handling in
   PyUnicode_DecodeUnicodeEscape(), which imports it on first use with
   PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1) and then calls its
   getcode() entry to resolve names. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00002793static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002794
Guido van Rossumd57fd912000-03-10 22:53:23 +00002795PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002796 Py_ssize_t size,
2797 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002798{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002799 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002800 Py_ssize_t startinpos;
2801 Py_ssize_t endinpos;
2802 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002803 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002804 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002805 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002806 char* message;
2807 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002808 PyObject *errorHandler = NULL;
2809 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002810
Guido van Rossumd57fd912000-03-10 22:53:23 +00002811 /* Escaped strings will always be longer than the resulting
2812 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002813 length after conversion to the true value.
2814 (but if the error callback returns a long replacement string
2815 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002816 v = _PyUnicode_New(size);
2817 if (v == NULL)
2818 goto onError;
2819 if (size == 0)
2820 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002821
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002822 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002823 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002824
Guido van Rossumd57fd912000-03-10 22:53:23 +00002825 while (s < end) {
2826 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002827 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002828 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002829
2830 /* Non-escape characters are interpreted as Unicode ordinals */
2831 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002832 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002833 continue;
2834 }
2835
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002836 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002837 /* \ - Escapes */
2838 s++;
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002839 c = *s++;
2840 if (s > end)
2841 c = '\0'; /* Invalid after \ */
2842 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002843
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002844 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002845 case '\n': break;
2846 case '\\': *p++ = '\\'; break;
2847 case '\'': *p++ = '\''; break;
2848 case '\"': *p++ = '\"'; break;
2849 case 'b': *p++ = '\b'; break;
2850 case 'f': *p++ = '\014'; break; /* FF */
2851 case 't': *p++ = '\t'; break;
2852 case 'n': *p++ = '\n'; break;
2853 case 'r': *p++ = '\r'; break;
2854 case 'v': *p++ = '\013'; break; /* VT */
2855 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2856
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002857 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002858 case '0': case '1': case '2': case '3':
2859 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002860 x = s[-1] - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002861 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002862 x = (x<<3) + *s++ - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002863 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002864 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002865 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002866 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002867 break;
2868
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002869 /* hex escapes */
2870 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002871 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002872 digits = 2;
2873 message = "truncated \\xXX escape";
2874 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002875
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002876 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002877 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002878 digits = 4;
2879 message = "truncated \\uXXXX escape";
2880 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002881
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002882 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002883 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002884 digits = 8;
2885 message = "truncated \\UXXXXXXXX escape";
2886 hexescape:
2887 chr = 0;
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002888 if (end - s < digits) {
2889 /* count only hex digits */
2890 for (; s < end; ++s) {
2891 c = (unsigned char)*s;
2892 if (!Py_ISXDIGIT(c))
2893 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002894 }
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002895 goto error;
2896 }
2897 for (; digits--; ++s) {
2898 c = (unsigned char)*s;
2899 if (!Py_ISXDIGIT(c))
2900 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002901 chr = (chr<<4) & ~0xF;
2902 if (c >= '0' && c <= '9')
2903 chr += c - '0';
2904 else if (c >= 'a' && c <= 'f')
2905 chr += 10 + c - 'a';
2906 else
2907 chr += 10 + c - 'A';
2908 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002909 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002910 /* _decoding_error will have already written into the
2911 target buffer. */
2912 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002913 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002914 /* when we get here, chr is a 32-bit unicode character */
2915 if (chr <= 0xffff)
2916 /* UCS-2 character */
2917 *p++ = (Py_UNICODE) chr;
2918 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002919 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002920 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002921#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002922 *p++ = chr;
2923#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002924 chr -= 0x10000L;
2925 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002926 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002927#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002928 } else {
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002929 message = "illegal Unicode character";
2930 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002931 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002932 break;
2933
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002934 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00002935 case 'N':
2936 message = "malformed \\N character escape";
2937 if (ucnhash_CAPI == NULL) {
2938 /* load the unicode data module */
Larry Hastings402b73f2010-03-25 00:54:54 +00002939 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002940 if (ucnhash_CAPI == NULL)
2941 goto ucnhashError;
2942 }
2943 if (*s == '{') {
2944 const char *start = s+1;
2945 /* look for the closing brace */
2946 while (*s != '}' && s < end)
2947 s++;
2948 if (s > start && s < end && *s == '}') {
2949 /* found a name. look it up in the unicode database */
2950 message = "unknown Unicode character name";
2951 s++;
Serhiy Storchaka1d3acd42013-01-21 11:48:24 +02002952 if (s - start - 1 <= INT_MAX &&
2953 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002954 goto store;
2955 }
2956 }
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002957 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002958
2959 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002960 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002961 message = "\\ at end of string";
2962 s--;
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002963 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00002964 }
2965 else {
2966 *p++ = '\\';
2967 *p++ = (unsigned char)s[-1];
2968 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002969 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002970 }
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002971 continue;
2972
2973 error:
2974 endinpos = s-starts;
2975 outpos = p-PyUnicode_AS_UNICODE(v);
2976 if (unicode_decode_call_errorhandler(
2977 errors, &errorHandler,
2978 "unicodeescape", message,
2979 starts, size, &startinpos, &endinpos, &exc, &s,
2980 &v, &outpos, &p))
2981 goto onError;
2982 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002983 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002984 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002985 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002986 Py_XDECREF(errorHandler);
2987 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002988 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002989
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002990 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002991 PyErr_SetString(
2992 PyExc_UnicodeError,
2993 "\\N escapes not supported (can't load unicodedata module)"
2994 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002995 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002996 Py_XDECREF(errorHandler);
2997 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002998 return NULL;
2999
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003000 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003001 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003002 Py_XDECREF(errorHandler);
3003 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003004 return NULL;
3005}
3006
3007/* Return a Unicode-Escape string version of the Unicode object.
3008
3009 If quotes is true, the string is enclosed in u"" or u'' quotes as
3010 appropriate.
3011
3012*/
3013
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00003014Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003015 Py_ssize_t size,
3016 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00003017{
3018 /* like wcschr, but doesn't stop at NULL characters */
3019
3020 while (size-- > 0) {
3021 if (*s == ch)
3022 return s;
3023 s++;
3024 }
3025
3026 return NULL;
3027}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003028
Guido van Rossumd57fd912000-03-10 22:53:23 +00003029static
3030PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003031 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003032 int quotes)
3033{
3034 PyObject *repr;
3035 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003036
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003037 static const char *hexdigit = "0123456789abcdef";
Neal Norwitze7d8be82008-07-31 17:17:14 +00003038#ifdef Py_UNICODE_WIDE
3039 const Py_ssize_t expandsize = 10;
3040#else
3041 const Py_ssize_t expandsize = 6;
3042#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003043
Neal Norwitz17753ec2006-08-21 22:21:19 +00003044 /* XXX(nnorwitz): rather than over-allocating, it would be
3045 better to choose a different scheme. Perhaps scan the
3046 first N-chars of the string and allocate based on that size.
3047 */
3048 /* Initial allocation is based on the longest-possible unichr
3049 escape.
3050
3051 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3052 unichr, so in this case it's the longest unichr escape. In
3053 narrow (UTF-16) builds this is five chars per source unichr
3054 since there are two unichrs in the surrogate pair, so in narrow
3055 (UTF-16) builds it's not the longest unichr escape.
3056
3057 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3058 so in the narrow (UTF-16) build case it's the longest unichr
3059 escape.
3060 */
3061
Neal Norwitze7d8be82008-07-31 17:17:14 +00003062 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003063 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00003064
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003065 repr = PyString_FromStringAndSize(NULL,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003066 2
3067 + expandsize*size
3068 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003069 if (repr == NULL)
3070 return NULL;
3071
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003072 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003073
3074 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003075 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00003076 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00003077 !findchar(s, size, '"')) ? '"' : '\'';
3078 }
3079 while (size-- > 0) {
3080 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003081
Hye-Shik Chang835b2432005-12-17 04:38:31 +00003082 /* Escape quotes and backslashes */
3083 if ((quotes &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003084 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003085 *p++ = '\\';
3086 *p++ = (char) ch;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003087 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003088 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003089
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003090#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003091 /* Map 21-bit characters to '\U00xxxxxx' */
3092 else if (ch >= 0x10000) {
3093 *p++ = '\\';
3094 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003095 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
3096 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
3097 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
3098 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
3099 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
3100 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
3101 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003102 *p++ = hexdigit[ch & 0x0000000F];
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003103 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003104 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003105#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003106 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3107 else if (ch >= 0xD800 && ch < 0xDC00) {
3108 Py_UNICODE ch2;
3109 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003110
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003111 ch2 = *s++;
3112 size--;
Georg Brandle27d0442010-08-01 20:54:30 +00003113 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003114 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3115 *p++ = '\\';
3116 *p++ = 'U';
3117 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
3118 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
3119 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
3120 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
3121 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
3122 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
3123 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
3124 *p++ = hexdigit[ucs & 0x0000000F];
3125 continue;
3126 }
3127 /* Fall through: isolated surrogates are copied as-is */
3128 s--;
3129 size++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003130 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003131#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003132
Guido van Rossumd57fd912000-03-10 22:53:23 +00003133 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003134 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003135 *p++ = '\\';
3136 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003137 *p++ = hexdigit[(ch >> 12) & 0x000F];
3138 *p++ = hexdigit[(ch >> 8) & 0x000F];
3139 *p++ = hexdigit[(ch >> 4) & 0x000F];
3140 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003141 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003142
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003143 /* Map special whitespace to '\t', \n', '\r' */
3144 else if (ch == '\t') {
3145 *p++ = '\\';
3146 *p++ = 't';
3147 }
3148 else if (ch == '\n') {
3149 *p++ = '\\';
3150 *p++ = 'n';
3151 }
3152 else if (ch == '\r') {
3153 *p++ = '\\';
3154 *p++ = 'r';
3155 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003156
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003157 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003158 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003159 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003160 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003161 *p++ = hexdigit[(ch >> 4) & 0x000F];
3162 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003163 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003164
Guido van Rossumd57fd912000-03-10 22:53:23 +00003165 /* Copy everything else as-is */
3166 else
3167 *p++ = (char) ch;
3168 }
3169 if (quotes)
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003170 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003171
3172 *p = '\0';
Benjamin Petersonbea424a2010-04-03 00:57:33 +00003173 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
3174 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003175 return repr;
3176}
3177
3178PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003179 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003180{
3181 return unicodeescape_string(s, size, 0);
3182}
3183
3184PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3185{
3186 if (!PyUnicode_Check(unicode)) {
3187 PyErr_BadArgument();
3188 return NULL;
3189 }
3190 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003191 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003192}
3193
3194/* --- Raw Unicode Escape Codec ------------------------------------------- */
3195
3196PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003197 Py_ssize_t size,
3198 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003199{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003200 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003201 Py_ssize_t startinpos;
3202 Py_ssize_t endinpos;
3203 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003204 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003205 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003206 const char *end;
3207 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003208 PyObject *errorHandler = NULL;
3209 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003210
Guido van Rossumd57fd912000-03-10 22:53:23 +00003211 /* Escaped strings will always be longer than the resulting
3212 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003213 length after conversion to the true value. (But decoding error
3214 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003215 v = _PyUnicode_New(size);
3216 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003217 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003218 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003219 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003220 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003221 end = s + size;
3222 while (s < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003223 unsigned char c;
3224 Py_UCS4 x;
3225 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003226 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003227
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003228 /* Non-escape characters are interpreted as Unicode ordinals */
3229 if (*s != '\\') {
3230 *p++ = (unsigned char)*s++;
3231 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003232 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003233 startinpos = s-starts;
3234
3235 /* \u-escapes are only interpreted iff the number of leading
3236 backslashes if odd */
3237 bs = s;
3238 for (;s < end;) {
3239 if (*s != '\\')
3240 break;
3241 *p++ = (unsigned char)*s++;
3242 }
3243 if (((s - bs) & 1) == 0 ||
3244 s >= end ||
3245 (*s != 'u' && *s != 'U')) {
3246 continue;
3247 }
3248 p--;
3249 count = *s=='u' ? 4 : 8;
3250 s++;
3251
3252 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3253 outpos = p-PyUnicode_AS_UNICODE(v);
3254 for (x = 0, i = 0; i < count; ++i, ++s) {
3255 c = (unsigned char)*s;
3256 if (!isxdigit(c)) {
3257 endinpos = s-starts;
3258 if (unicode_decode_call_errorhandler(
3259 errors, &errorHandler,
3260 "rawunicodeescape", "truncated \\uXXXX",
3261 starts, size, &startinpos, &endinpos, &exc, &s,
3262 &v, &outpos, &p))
3263 goto onError;
3264 goto nextByte;
3265 }
3266 x = (x<<4) & ~0xF;
3267 if (c >= '0' && c <= '9')
3268 x += c - '0';
3269 else if (c >= 'a' && c <= 'f')
3270 x += 10 + c - 'a';
3271 else
3272 x += 10 + c - 'A';
3273 }
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003274 if (x <= 0xffff)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003275 /* UCS-2 character */
3276 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003277 else if (x <= 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003278 /* UCS-4 character. Either store directly, or as
3279 surrogate pair. */
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003280#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003281 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003282#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003283 x -= 0x10000L;
3284 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3285 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003286#endif
3287 } else {
3288 endinpos = s-starts;
3289 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003290 if (unicode_decode_call_errorhandler(
3291 errors, &errorHandler,
3292 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003293 starts, size, &startinpos, &endinpos, &exc, &s,
3294 &v, &outpos, &p))
3295 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003296 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003297 nextByte:
3298 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003299 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00003300 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003301 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003302 Py_XDECREF(errorHandler);
3303 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003304 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003305
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003306 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003307 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003308 Py_XDECREF(errorHandler);
3309 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003310 return NULL;
3311}
3312
3313PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003314 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003315{
3316 PyObject *repr;
3317 char *p;
3318 char *q;
3319
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003320 static const char *hexdigit = "0123456789abcdef";
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003321#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00003322 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003323#else
Neal Norwitze7d8be82008-07-31 17:17:14 +00003324 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003325#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00003326
Neal Norwitze7d8be82008-07-31 17:17:14 +00003327 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003328 return PyErr_NoMemory();
Benjamin Peterson857ce152009-01-31 16:29:18 +00003329
Neal Norwitze7d8be82008-07-31 17:17:14 +00003330 repr = PyString_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003331 if (repr == NULL)
3332 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003333 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003334 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003335
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003336 p = q = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003337 while (size-- > 0) {
3338 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003339#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003340 /* Map 32-bit characters to '\Uxxxxxxxx' */
3341 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003342 *p++ = '\\';
3343 *p++ = 'U';
3344 *p++ = hexdigit[(ch >> 28) & 0xf];
3345 *p++ = hexdigit[(ch >> 24) & 0xf];
3346 *p++ = hexdigit[(ch >> 20) & 0xf];
3347 *p++ = hexdigit[(ch >> 16) & 0xf];
3348 *p++ = hexdigit[(ch >> 12) & 0xf];
3349 *p++ = hexdigit[(ch >> 8) & 0xf];
3350 *p++ = hexdigit[(ch >> 4) & 0xf];
3351 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003352 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003353 else
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003354#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003355 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3356 if (ch >= 0xD800 && ch < 0xDC00) {
3357 Py_UNICODE ch2;
3358 Py_UCS4 ucs;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003359
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003360 ch2 = *s++;
3361 size--;
Georg Brandle27d0442010-08-01 20:54:30 +00003362 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003363 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3364 *p++ = '\\';
3365 *p++ = 'U';
3366 *p++ = hexdigit[(ucs >> 28) & 0xf];
3367 *p++ = hexdigit[(ucs >> 24) & 0xf];
3368 *p++ = hexdigit[(ucs >> 20) & 0xf];
3369 *p++ = hexdigit[(ucs >> 16) & 0xf];
3370 *p++ = hexdigit[(ucs >> 12) & 0xf];
3371 *p++ = hexdigit[(ucs >> 8) & 0xf];
3372 *p++ = hexdigit[(ucs >> 4) & 0xf];
3373 *p++ = hexdigit[ucs & 0xf];
3374 continue;
3375 }
3376 /* Fall through: isolated surrogates are copied as-is */
3377 s--;
3378 size++;
3379 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003380#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003381 /* Map 16-bit characters to '\uxxxx' */
3382 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003383 *p++ = '\\';
3384 *p++ = 'u';
3385 *p++ = hexdigit[(ch >> 12) & 0xf];
3386 *p++ = hexdigit[(ch >> 8) & 0xf];
3387 *p++ = hexdigit[(ch >> 4) & 0xf];
3388 *p++ = hexdigit[ch & 15];
3389 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003390 /* Copy everything else as-is */
3391 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003392 *p++ = (char) ch;
3393 }
3394 *p = '\0';
Benjamin Petersonbea424a2010-04-03 00:57:33 +00003395 if (_PyString_Resize(&repr, p - q))
3396 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003397 return repr;
3398}
3399
3400PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3401{
3402 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003403 PyErr_BadArgument();
3404 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003405 }
3406 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003407 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003408}
3409
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003410/* --- Unicode Internal Codec ------------------------------------------- */
3411
3412PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003413 Py_ssize_t size,
3414 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003415{
3416 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003417 Py_ssize_t startinpos;
3418 Py_ssize_t endinpos;
3419 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003420 PyUnicodeObject *v;
3421 Py_UNICODE *p;
3422 const char *end;
3423 const char *reason;
3424 PyObject *errorHandler = NULL;
3425 PyObject *exc = NULL;
3426
Neal Norwitzd43069c2006-01-08 01:12:10 +00003427#ifdef Py_UNICODE_WIDE
3428 Py_UNICODE unimax = PyUnicode_GetMax();
3429#endif
3430
Armin Rigo7ccbca92006-10-04 12:17:45 +00003431 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003432 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3433 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003434 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003435 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003436 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003437 p = PyUnicode_AS_UNICODE(v);
3438 end = s + size;
3439
3440 while (s < end) {
Serhiy Storchakad5327d92013-02-07 16:23:11 +02003441 if (end-s < Py_UNICODE_SIZE) {
3442 endinpos = end-starts;
3443 reason = "truncated input";
3444 goto error;
3445 }
Neal Norwitz1004a532006-05-15 07:17:23 +00003446 memcpy(p, s, sizeof(Py_UNICODE));
Serhiy Storchakad5327d92013-02-07 16:23:11 +02003447#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003448 /* We have to sanity check the raw data, otherwise doom looms for
3449 some malformed UCS-4 data. */
Serhiy Storchakad5327d92013-02-07 16:23:11 +02003450 if (*p > unimax || *p < 0) {
3451 endinpos = s - starts + Py_UNICODE_SIZE;
3452 reason = "illegal code point (> 0x10FFFF)";
3453 goto error;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003454 }
Serhiy Storchakad5327d92013-02-07 16:23:11 +02003455#endif
3456 p++;
3457 s += Py_UNICODE_SIZE;
3458 continue;
3459
3460 error:
3461 startinpos = s - starts;
3462 outpos = p - PyUnicode_AS_UNICODE(v);
3463 if (unicode_decode_call_errorhandler(
3464 errors, &errorHandler,
3465 "unicode_internal", reason,
3466 starts, size, &startinpos, &endinpos, &exc, &s,
3467 &v, &outpos, &p)) {
3468 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003469 }
3470 }
3471
Martin v. Löwis412fb672006-04-13 06:34:32 +00003472 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003473 goto onError;
3474 Py_XDECREF(errorHandler);
3475 Py_XDECREF(exc);
3476 return (PyObject *)v;
3477
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003478 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003479 Py_XDECREF(v);
3480 Py_XDECREF(errorHandler);
3481 Py_XDECREF(exc);
3482 return NULL;
3483}
3484
Guido van Rossumd57fd912000-03-10 22:53:23 +00003485/* --- Latin-1 Codec ------------------------------------------------------ */
3486
3487PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003488 Py_ssize_t size,
3489 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003490{
3491 PyUnicodeObject *v;
3492 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003493
Guido van Rossumd57fd912000-03-10 22:53:23 +00003494 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003495 if (size == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003496 Py_UNICODE r = *(unsigned char*)s;
3497 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003498 }
3499
Guido van Rossumd57fd912000-03-10 22:53:23 +00003500 v = _PyUnicode_New(size);
3501 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003502 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003503 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003504 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003505 p = PyUnicode_AS_UNICODE(v);
3506 while (size-- > 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003507 *p++ = (unsigned char)*s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003508 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003509
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003510 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003511 Py_XDECREF(v);
3512 return NULL;
3513}
3514
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003515/* create or adjust a UnicodeEncodeError */
3516static void make_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003517 const char *encoding,
3518 const Py_UNICODE *unicode, Py_ssize_t size,
3519 Py_ssize_t startpos, Py_ssize_t endpos,
3520 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003521{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003522 if (*exceptionObject == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003523 *exceptionObject = PyUnicodeEncodeError_Create(
3524 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003525 }
3526 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003527 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3528 goto onError;
3529 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3530 goto onError;
3531 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3532 goto onError;
3533 return;
3534 onError:
Serhiy Storchaka98a97222014-02-09 13:14:04 +02003535 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003536 }
3537}
3538
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003539/* raises a UnicodeEncodeError */
3540static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003541 const char *encoding,
3542 const Py_UNICODE *unicode, Py_ssize_t size,
3543 Py_ssize_t startpos, Py_ssize_t endpos,
3544 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003545{
3546 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003547 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003548 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003549 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003550}
3551
3552/* error handling callback helper:
3553 build arguments, call the callback and check the arguments,
3554 put the result into newpos and return the replacement string, which
3555 has to be freed by the caller */
3556static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003557 PyObject **errorHandler,
3558 const char *encoding, const char *reason,
3559 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3560 Py_ssize_t startpos, Py_ssize_t endpos,
3561 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003562{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003563 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003564
3565 PyObject *restuple;
3566 PyObject *resunicode;
3567
3568 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003569 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003570 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003571 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003572 }
3573
3574 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003575 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003576 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003577 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003578
3579 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003580 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003581 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003582 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003583 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00003584 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003585 Py_DECREF(restuple);
3586 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003587 }
3588 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003589 &resunicode, newpos)) {
3590 Py_DECREF(restuple);
3591 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003592 }
3593 if (*newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003594 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003595 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003596 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3597 Py_DECREF(restuple);
3598 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003599 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003600 Py_INCREF(resunicode);
3601 Py_DECREF(restuple);
3602 return resunicode;
3603}
3604
3605static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003606 Py_ssize_t size,
3607 const char *errors,
3608 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003609{
3610 /* output object */
3611 PyObject *res;
3612 /* pointers to the beginning and end+1 of input */
3613 const Py_UNICODE *startp = p;
3614 const Py_UNICODE *endp = p + size;
3615 /* pointer to the beginning of the unencodable characters */
3616 /* const Py_UNICODE *badp = NULL; */
3617 /* pointer into the output */
3618 char *str;
3619 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003620 Py_ssize_t respos = 0;
3621 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00003622 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3623 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003624 PyObject *errorHandler = NULL;
3625 PyObject *exc = NULL;
3626 /* the following variable is used for caching string comparisons
3627 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3628 int known_errorHandler = -1;
3629
3630 /* allocate enough for a simple encoding without
3631 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003632 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003633 if (res == NULL)
3634 goto onError;
3635 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003636 return res;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003637 str = PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003638 ressize = size;
3639
3640 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003641 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003642
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003643 /* can we encode this? */
3644 if (c<limit) {
3645 /* no overflow check, because we know that the space is enough */
3646 *str++ = (char)c;
3647 ++p;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003648 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003649 else {
3650 Py_ssize_t unicodepos = p-startp;
3651 Py_ssize_t requiredsize;
3652 PyObject *repunicode;
3653 Py_ssize_t repsize;
3654 Py_ssize_t newpos;
3655 Py_ssize_t respos;
3656 Py_UNICODE *uni2;
3657 /* startpos for collecting unencodable chars */
3658 const Py_UNICODE *collstart = p;
3659 const Py_UNICODE *collend = p;
3660 /* find all unecodable characters */
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003661 while ((collend < endp) && ((*collend) >= limit))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003662 ++collend;
3663 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3664 if (known_errorHandler==-1) {
3665 if ((errors==NULL) || (!strcmp(errors, "strict")))
3666 known_errorHandler = 1;
3667 else if (!strcmp(errors, "replace"))
3668 known_errorHandler = 2;
3669 else if (!strcmp(errors, "ignore"))
3670 known_errorHandler = 3;
3671 else if (!strcmp(errors, "xmlcharrefreplace"))
3672 known_errorHandler = 4;
3673 else
3674 known_errorHandler = 0;
3675 }
3676 switch (known_errorHandler) {
3677 case 1: /* strict */
3678 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3679 goto onError;
3680 case 2: /* replace */
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003681 while (collstart++ < collend)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003682 *str++ = '?'; /* fall through */
3683 case 3: /* ignore */
3684 p = collend;
3685 break;
3686 case 4: /* xmlcharrefreplace */
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003687 respos = str - PyString_AS_STRING(res);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003688 /* determine replacement size (temporarily (mis)uses p) */
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003689 requiredsize = respos;
3690 for (p = collstart; p < collend;) {
Serhiy Storchakae822b032013-08-06 16:56:26 +03003691 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003692 Py_ssize_t incr;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003693 if (ch < 10)
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003694 incr = 2+1+1;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003695 else if (ch < 100)
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003696 incr = 2+2+1;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003697 else if (ch < 1000)
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003698 incr = 2+3+1;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003699 else if (ch < 10000)
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003700 incr = 2+4+1;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003701 else if (ch < 100000)
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003702 incr = 2+5+1;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003703 else if (ch < 1000000)
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003704 incr = 2+6+1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003705 else
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003706 incr = 2+7+1;
3707 if (requiredsize > PY_SSIZE_T_MAX - incr)
3708 goto overflow;
3709 requiredsize += incr;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003710 }
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003711 if (requiredsize > PY_SSIZE_T_MAX - (endp - collend))
3712 goto overflow;
3713 requiredsize += endp - collend;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003714 if (requiredsize > ressize) {
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003715 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003716 requiredsize = 2*ressize;
3717 if (_PyString_Resize(&res, requiredsize))
3718 goto onError;
3719 str = PyString_AS_STRING(res) + respos;
3720 ressize = requiredsize;
3721 }
3722 /* generate replacement (temporarily (mis)uses p) */
Serhiy Storchakae822b032013-08-06 16:56:26 +03003723 for (p = collstart; p < collend;) {
3724 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
3725 str += sprintf(str, "&#%d;", (int)ch);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003726 }
3727 p = collend;
3728 break;
3729 default:
3730 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3731 encoding, reason, startp, size, &exc,
3732 collstart-startp, collend-startp, &newpos);
3733 if (repunicode == NULL)
3734 goto onError;
3735 /* need more space? (at least enough for what we have+the
3736 replacement+the rest of the string, so we won't have to
3737 check space for encodable characters) */
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003738 respos = str - PyString_AS_STRING(res);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003739 repsize = PyUnicode_GET_SIZE(repunicode);
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003740 if (respos > PY_SSIZE_T_MAX - repsize)
3741 goto overflow;
3742 requiredsize = respos + repsize;
3743 if (requiredsize > PY_SSIZE_T_MAX - (endp - collend))
3744 goto overflow;
3745 requiredsize += endp - collend;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003746 if (requiredsize > ressize) {
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003747 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003748 requiredsize = 2*ressize;
3749 if (_PyString_Resize(&res, requiredsize)) {
3750 Py_DECREF(repunicode);
3751 goto onError;
3752 }
3753 str = PyString_AS_STRING(res) + respos;
3754 ressize = requiredsize;
3755 }
3756 /* check if there is anything unencodable in the replacement
3757 and copy it to the output */
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003758 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2, ++str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003759 c = *uni2;
3760 if (c >= limit) {
3761 raise_encode_exception(&exc, encoding, startp, size,
3762 unicodepos, unicodepos+1, reason);
3763 Py_DECREF(repunicode);
3764 goto onError;
3765 }
3766 *str = (char)c;
3767 }
3768 p = startp + newpos;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003769 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00003770 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00003771 }
3772 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003773 /* Resize if we allocated to much */
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003774 respos = str - PyString_AS_STRING(res);
3775 if (respos < ressize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003776 /* If this falls res will be NULL */
3777 _PyString_Resize(&res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003778 Py_XDECREF(errorHandler);
3779 Py_XDECREF(exc);
3780 return res;
3781
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003782 overflow:
3783 PyErr_SetString(PyExc_OverflowError,
3784 "encoded result is too long for a Python string");
3785
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003786 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003787 Py_XDECREF(res);
3788 Py_XDECREF(errorHandler);
3789 Py_XDECREF(exc);
3790 return NULL;
3791}
3792
Guido van Rossumd57fd912000-03-10 22:53:23 +00003793PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003794 Py_ssize_t size,
3795 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003796{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003797 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003798}
3799
3800PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3801{
3802 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003803 PyErr_BadArgument();
3804 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003805 }
3806 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003807 PyUnicode_GET_SIZE(unicode),
3808 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003809}
3810
3811/* --- 7-bit ASCII Codec -------------------------------------------------- */
3812
Guido van Rossumd57fd912000-03-10 22:53:23 +00003813PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003814 Py_ssize_t size,
3815 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003816{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003817 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003818 PyUnicodeObject *v;
3819 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003820 Py_ssize_t startinpos;
3821 Py_ssize_t endinpos;
3822 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003823 const char *e;
3824 PyObject *errorHandler = NULL;
3825 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003826
Guido van Rossumd57fd912000-03-10 22:53:23 +00003827 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003828 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003829 Py_UNICODE r = *(unsigned char*)s;
3830 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003831 }
Tim Petersced69f82003-09-16 20:30:58 +00003832
Guido van Rossumd57fd912000-03-10 22:53:23 +00003833 v = _PyUnicode_New(size);
3834 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003835 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003836 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003837 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003838 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003839 e = s + size;
3840 while (s < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003841 register unsigned char c = (unsigned char)*s;
3842 if (c < 128) {
3843 *p++ = c;
3844 ++s;
3845 }
3846 else {
3847 startinpos = s-starts;
3848 endinpos = startinpos + 1;
3849 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3850 if (unicode_decode_call_errorhandler(
3851 errors, &errorHandler,
3852 "ascii", "ordinal not in range(128)",
3853 starts, size, &startinpos, &endinpos, &exc, &s,
3854 &v, &outpos, &p))
3855 goto onError;
3856 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003857 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003858 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003859 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3860 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003861 Py_XDECREF(errorHandler);
3862 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003863 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003864
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003865 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003866 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003867 Py_XDECREF(errorHandler);
3868 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003869 return NULL;
3870}
3871
Guido van Rossumd57fd912000-03-10 22:53:23 +00003872PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003873 Py_ssize_t size,
3874 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003875{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003876 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003877}
3878
3879PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3880{
3881 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003882 PyErr_BadArgument();
3883 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003884 }
3885 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003886 PyUnicode_GET_SIZE(unicode),
3887 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003888}
3889
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003890#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003891
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003892/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003893
Hirokazu Yamamoto52a34922009-03-21 10:32:52 +00003894#if SIZEOF_INT < SIZEOF_SIZE_T
Martin v. Löwisd8251432006-06-14 05:21:04 +00003895#define NEED_RETRY
3896#endif
3897
3898/* XXX This code is limited to "true" double-byte encodings, as
3899 a) it assumes an incomplete character consists of a single byte, and
3900 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003901 encodings, see IsDBCSLeadByteEx documentation. */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003902
/* Report whether the byte at s[offset] should be treated as a DBCS lead
   byte.  The byte must be flagged by IsDBCSLeadByte(); in addition, one of
   the following must hold for the character CharPrev() finds before it:
   there is none (CharPrev() returned the same position), it does not start
   with a lead byte, or it is two bytes wide. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *here = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*here))
        return 0;

    before = CharPrev(s, here);
    return (before == here) || !IsDBCSLeadByte(*before) || (here - before == 2);
}
3913
3914/*
3915 * Decode MBCS string into unicode object. If 'final' is set, converts
3916 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3917 */
3918static int decode_mbcs(PyUnicodeObject **v,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003919 const char *s, /* MBCS string */
3920 int size, /* sizeof MBCS string */
3921 int final)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003922{
3923 Py_UNICODE *p;
3924 Py_ssize_t n = 0;
3925 int usize = 0;
3926
3927 assert(size >= 0);
3928
3929 /* Skip trailing lead-byte unless 'final' is set */
3930 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003931 --size;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003932
3933 /* First get the size of the result */
3934 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003935 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3936 if (usize == 0) {
3937 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3938 return -1;
3939 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003940 }
3941
3942 if (*v == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003943 /* Create unicode object */
3944 *v = _PyUnicode_New(usize);
3945 if (*v == NULL)
3946 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003947 }
3948 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003949 /* Extend unicode object */
3950 n = PyUnicode_GET_SIZE(*v);
3951 if (_PyUnicode_Resize(v, n + usize) < 0)
3952 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003953 }
3954
3955 /* Do the conversion */
3956 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003957 p = PyUnicode_AS_UNICODE(*v) + n;
3958 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3959 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3960 return -1;
3961 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003962 }
3963
3964 return size;
3965}
3966
3967PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003968 Py_ssize_t size,
3969 const char *errors,
3970 Py_ssize_t *consumed)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003971{
3972 PyUnicodeObject *v = NULL;
3973 int done;
3974
3975 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003976 *consumed = 0;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003977
3978#ifdef NEED_RETRY
3979 retry:
3980 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003981 done = decode_mbcs(&v, s, INT_MAX, 0);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003982 else
3983#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003984 done = decode_mbcs(&v, s, (int)size, !consumed);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003985
3986 if (done < 0) {
3987 Py_XDECREF(v);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003988 return NULL;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003989 }
3990
3991 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003992 *consumed += done;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003993
3994#ifdef NEED_RETRY
3995 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003996 s += done;
3997 size -= done;
3998 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003999 }
4000#endif
4001
4002 return (PyObject *)v;
4003}
4004
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004005PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004006 Py_ssize_t size,
4007 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004008{
Martin v. Löwisd8251432006-06-14 05:21:04 +00004009 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4010}
4011
4012/*
4013 * Convert unicode into string object (MBCS).
4014 * Returns 0 if succeed, -1 otherwise.
4015 */
4016static int encode_mbcs(PyObject **repr,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004017 const Py_UNICODE *p, /* unicode */
4018 int size) /* size of unicode */
Martin v. Löwisd8251432006-06-14 05:21:04 +00004019{
4020 int mbcssize = 0;
4021 Py_ssize_t n = 0;
4022
4023 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004024
4025 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00004026 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004027 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4028 if (mbcssize == 0) {
4029 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4030 return -1;
4031 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004032 }
4033
Martin v. Löwisd8251432006-06-14 05:21:04 +00004034 if (*repr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004035 /* Create string object */
4036 *repr = PyString_FromStringAndSize(NULL, mbcssize);
4037 if (*repr == NULL)
4038 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004039 }
4040 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004041 /* Extend string object */
4042 n = PyString_Size(*repr);
4043 if (_PyString_Resize(repr, n + mbcssize) < 0)
4044 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004045 }
4046
4047 /* Do the conversion */
4048 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004049 char *s = PyString_AS_STRING(*repr) + n;
4050 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4051 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4052 return -1;
4053 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00004054 }
4055
4056 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004057}
4058
4059PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004060 Py_ssize_t size,
4061 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004062{
Martin v. Löwisd8251432006-06-14 05:21:04 +00004063 PyObject *repr = NULL;
4064 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004065
Martin v. Löwisd8251432006-06-14 05:21:04 +00004066#ifdef NEED_RETRY
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004067 retry:
Martin v. Löwisd8251432006-06-14 05:21:04 +00004068 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004069 ret = encode_mbcs(&repr, p, INT_MAX);
Martin v. Löwisd8251432006-06-14 05:21:04 +00004070 else
4071#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004072 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004073
Martin v. Löwisd8251432006-06-14 05:21:04 +00004074 if (ret < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004075 Py_XDECREF(repr);
4076 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004077 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00004078
4079#ifdef NEED_RETRY
4080 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004081 p += INT_MAX;
4082 size -= INT_MAX;
4083 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004084 }
4085#endif
4086
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004087 return repr;
4088}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004089
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004090PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4091{
4092 if (!PyUnicode_Check(unicode)) {
4093 PyErr_BadArgument();
4094 return NULL;
4095 }
4096 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004097 PyUnicode_GET_SIZE(unicode),
4098 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004099}
4100
Martin v. Löwisd8251432006-06-14 05:21:04 +00004101#undef NEED_RETRY
4102
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004103#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004104
Guido van Rossumd57fd912000-03-10 22:53:23 +00004105/* --- Character Mapping Codec -------------------------------------------- */
4106
Guido van Rossumd57fd912000-03-10 22:53:23 +00004107PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004108 Py_ssize_t size,
4109 PyObject *mapping,
4110 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004111{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004112 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004113 Py_ssize_t startinpos;
4114 Py_ssize_t endinpos;
4115 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004116 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004117 PyUnicodeObject *v;
4118 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004119 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004120 PyObject *errorHandler = NULL;
4121 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004122 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004123 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004124
Guido van Rossumd57fd912000-03-10 22:53:23 +00004125 /* Default to Latin-1 */
4126 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004127 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004128
4129 v = _PyUnicode_New(size);
4130 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004131 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004132 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004133 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004134 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004135 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004136 if (PyUnicode_CheckExact(mapping)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004137 mapstring = PyUnicode_AS_UNICODE(mapping);
4138 maplen = PyUnicode_GET_SIZE(mapping);
4139 while (s < e) {
4140 unsigned char ch = *s;
4141 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004142
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004143 if (ch < maplen)
4144 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004145
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004146 if (x == 0xfffe) {
4147 /* undefined mapping */
4148 outpos = p-PyUnicode_AS_UNICODE(v);
4149 startinpos = s-starts;
4150 endinpos = startinpos+1;
4151 if (unicode_decode_call_errorhandler(
4152 errors, &errorHandler,
4153 "charmap", "character maps to <undefined>",
4154 starts, size, &startinpos, &endinpos, &exc, &s,
4155 &v, &outpos, &p)) {
4156 goto onError;
4157 }
4158 continue;
4159 }
4160 *p++ = x;
4161 ++s;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004162 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004163 }
4164 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004165 while (s < e) {
4166 unsigned char ch = *s;
4167 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004168
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004169 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4170 w = PyInt_FromLong((long)ch);
4171 if (w == NULL)
4172 goto onError;
4173 x = PyObject_GetItem(mapping, w);
4174 Py_DECREF(w);
4175 if (x == NULL) {
4176 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4177 /* No mapping found means: mapping is undefined. */
4178 PyErr_Clear();
Serhiy Storchaka95997452013-01-15 14:42:59 +02004179 goto Undefined;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004180 } else
4181 goto onError;
4182 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004183
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004184 /* Apply mapping */
Serhiy Storchaka95997452013-01-15 14:42:59 +02004185 if (x == Py_None)
4186 goto Undefined;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004187 if (PyInt_Check(x)) {
4188 long value = PyInt_AS_LONG(x);
Serhiy Storchaka95997452013-01-15 14:42:59 +02004189 if (value == 0xFFFE)
4190 goto Undefined;
Antoine Pitroue3ae3212012-11-17 21:14:58 +01004191 if (value < 0 || value > 0x10FFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004192 PyErr_SetString(PyExc_TypeError,
Antoine Pitroue3ae3212012-11-17 21:14:58 +01004193 "character mapping must be in range(0x110000)");
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004194 Py_DECREF(x);
4195 goto onError;
4196 }
Antoine Pitroue3ae3212012-11-17 21:14:58 +01004197
4198#ifndef Py_UNICODE_WIDE
4199 if (value > 0xFFFF) {
4200 /* see the code for 1-n mapping below */
4201 if (extrachars < 2) {
4202 /* resize first */
4203 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4204 Py_ssize_t needed = 10 - extrachars;
4205 extrachars += needed;
4206 /* XXX overflow detection missing */
4207 if (_PyUnicode_Resize(&v,
4208 PyUnicode_GET_SIZE(v) + needed) < 0) {
4209 Py_DECREF(x);
4210 goto onError;
4211 }
4212 p = PyUnicode_AS_UNICODE(v) + oldpos;
4213 }
4214 value -= 0x10000;
4215 *p++ = 0xD800 | (value >> 10);
4216 *p++ = 0xDC00 | (value & 0x3FF);
4217 extrachars -= 2;
4218 }
4219 else
4220#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004221 *p++ = (Py_UNICODE)value;
4222 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004223 else if (PyUnicode_Check(x)) {
4224 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004225
Serhiy Storchaka95997452013-01-15 14:42:59 +02004226 if (targetsize == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004227 /* 1-1 mapping */
Serhiy Storchaka95997452013-01-15 14:42:59 +02004228 Py_UNICODE value = *PyUnicode_AS_UNICODE(x);
4229 if (value == 0xFFFE)
4230 goto Undefined;
4231 *p++ = value;
4232 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004233 else if (targetsize > 1) {
4234 /* 1-n mapping */
4235 if (targetsize > extrachars) {
4236 /* resize first */
4237 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4238 Py_ssize_t needed = (targetsize - extrachars) + \
4239 (targetsize << 2);
4240 extrachars += needed;
4241 /* XXX overflow detection missing */
4242 if (_PyUnicode_Resize(&v,
4243 PyUnicode_GET_SIZE(v) + needed) < 0) {
4244 Py_DECREF(x);
4245 goto onError;
4246 }
4247 p = PyUnicode_AS_UNICODE(v) + oldpos;
4248 }
4249 Py_UNICODE_COPY(p,
4250 PyUnicode_AS_UNICODE(x),
4251 targetsize);
4252 p += targetsize;
4253 extrachars -= targetsize;
4254 }
4255 /* 1-0 mapping: skip the character */
4256 }
4257 else {
4258 /* wrong return value */
4259 PyErr_SetString(PyExc_TypeError,
4260 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004261 Py_DECREF(x);
4262 goto onError;
4263 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004264 Py_DECREF(x);
4265 ++s;
Serhiy Storchaka95997452013-01-15 14:42:59 +02004266 continue;
4267Undefined:
4268 /* undefined mapping */
4269 Py_XDECREF(x);
4270 outpos = p-PyUnicode_AS_UNICODE(v);
4271 startinpos = s-starts;
4272 endinpos = startinpos+1;
4273 if (unicode_decode_call_errorhandler(
4274 errors, &errorHandler,
4275 "charmap", "character maps to <undefined>",
4276 starts, size, &startinpos, &endinpos, &exc, &s,
4277 &v, &outpos, &p)) {
4278 goto onError;
4279 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004280 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004281 }
4282 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004283 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4284 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004285 Py_XDECREF(errorHandler);
4286 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004287 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004288
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004289 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004290 Py_XDECREF(errorHandler);
4291 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004292 Py_XDECREF(v);
4293 return NULL;
4294}
4295
Martin v. Löwis3f767792006-06-04 19:36:28 +00004296/* Charmap encoding: the lookup table */
4297
4298struct encoding_map{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004299 PyObject_HEAD
4300 unsigned char level1[32];
4301 int count2, count3;
4302 unsigned char level23[1];
Martin v. Löwis3f767792006-06-04 19:36:28 +00004303};
4304
4305static PyObject*
4306encoding_map_size(PyObject *obj, PyObject* args)
4307{
4308 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004309 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Martin v. Löwis3f767792006-06-04 19:36:28 +00004310 128*map->count3);
4311}
4312
4313static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004314 {"size", encoding_map_size, METH_NOARGS,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004315 PyDoc_STR("Return the size (in bytes) of this object") },
4316 { 0 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004317};
4318
4319static void
4320encoding_map_dealloc(PyObject* o)
4321{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004322 PyObject_FREE(o);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004323}
4324
4325static PyTypeObject EncodingMapType = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004326 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004327 "EncodingMap", /*tp_name*/
4328 sizeof(struct encoding_map), /*tp_basicsize*/
4329 0, /*tp_itemsize*/
4330 /* methods */
4331 encoding_map_dealloc, /*tp_dealloc*/
4332 0, /*tp_print*/
4333 0, /*tp_getattr*/
4334 0, /*tp_setattr*/
4335 0, /*tp_compare*/
4336 0, /*tp_repr*/
4337 0, /*tp_as_number*/
4338 0, /*tp_as_sequence*/
4339 0, /*tp_as_mapping*/
4340 0, /*tp_hash*/
4341 0, /*tp_call*/
4342 0, /*tp_str*/
4343 0, /*tp_getattro*/
4344 0, /*tp_setattro*/
4345 0, /*tp_as_buffer*/
4346 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4347 0, /*tp_doc*/
4348 0, /*tp_traverse*/
4349 0, /*tp_clear*/
4350 0, /*tp_richcompare*/
4351 0, /*tp_weaklistoffset*/
4352 0, /*tp_iter*/
4353 0, /*tp_iternext*/
4354 encoding_map_methods, /*tp_methods*/
4355 0, /*tp_members*/
4356 0, /*tp_getset*/
4357 0, /*tp_base*/
4358 0, /*tp_dict*/
4359 0, /*tp_descr_get*/
4360 0, /*tp_descr_set*/
4361 0, /*tp_dictoffset*/
4362 0, /*tp_init*/
4363 0, /*tp_alloc*/
4364 0, /*tp_new*/
4365 0, /*tp_free*/
4366 0, /*tp_is_gc*/
Martin v. Löwis3f767792006-06-04 19:36:28 +00004367};
4368
4369PyObject*
4370PyUnicode_BuildEncodingMap(PyObject* string)
4371{
4372 Py_UNICODE *decode;
4373 PyObject *result;
4374 struct encoding_map *mresult;
4375 int i;
4376 int need_dict = 0;
4377 unsigned char level1[32];
4378 unsigned char level2[512];
4379 unsigned char *mlevel1, *mlevel2, *mlevel3;
4380 int count2 = 0, count3 = 0;
4381
4382 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4383 PyErr_BadArgument();
4384 return NULL;
4385 }
4386 decode = PyUnicode_AS_UNICODE(string);
4387 memset(level1, 0xFF, sizeof level1);
4388 memset(level2, 0xFF, sizeof level2);
4389
4390 /* If there isn't a one-to-one mapping of NULL to \0,
4391 or if there are non-BMP characters, we need to use
4392 a mapping dictionary. */
4393 if (decode[0] != 0)
4394 need_dict = 1;
4395 for (i = 1; i < 256; i++) {
4396 int l1, l2;
4397 if (decode[i] == 0
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004398#ifdef Py_UNICODE_WIDE
Martin v. Löwis3f767792006-06-04 19:36:28 +00004399 || decode[i] > 0xFFFF
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004400#endif
4401 ) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004402 need_dict = 1;
4403 break;
4404 }
4405 if (decode[i] == 0xFFFE)
4406 /* unmapped character */
4407 continue;
4408 l1 = decode[i] >> 11;
4409 l2 = decode[i] >> 7;
4410 if (level1[l1] == 0xFF)
4411 level1[l1] = count2++;
4412 if (level2[l2] == 0xFF)
Benjamin Peterson857ce152009-01-31 16:29:18 +00004413 level2[l2] = count3++;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004414 }
4415
4416 if (count2 >= 0xFF || count3 >= 0xFF)
4417 need_dict = 1;
4418
4419 if (need_dict) {
4420 PyObject *result = PyDict_New();
4421 PyObject *key, *value;
4422 if (!result)
4423 return NULL;
4424 for (i = 0; i < 256; i++) {
Brett Cannona7f13ee2010-05-04 01:16:51 +00004425 value = NULL;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004426 key = PyInt_FromLong(decode[i]);
4427 value = PyInt_FromLong(i);
4428 if (!key || !value)
4429 goto failed1;
4430 if (PyDict_SetItem(result, key, value) == -1)
4431 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00004432 Py_DECREF(key);
4433 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004434 }
4435 return result;
4436 failed1:
4437 Py_XDECREF(key);
4438 Py_XDECREF(value);
4439 Py_DECREF(result);
4440 return NULL;
4441 }
4442
4443 /* Create a three-level trie */
4444 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4445 16*count2 + 128*count3 - 1);
4446 if (!result)
4447 return PyErr_NoMemory();
4448 PyObject_Init(result, &EncodingMapType);
4449 mresult = (struct encoding_map*)result;
4450 mresult->count2 = count2;
4451 mresult->count3 = count3;
4452 mlevel1 = mresult->level1;
4453 mlevel2 = mresult->level23;
4454 mlevel3 = mresult->level23 + 16*count2;
4455 memcpy(mlevel1, level1, 32);
4456 memset(mlevel2, 0xFF, 16*count2);
4457 memset(mlevel3, 0, 128*count3);
4458 count3 = 0;
4459 for (i = 1; i < 256; i++) {
4460 int o1, o2, o3, i2, i3;
4461 if (decode[i] == 0xFFFE)
4462 /* unmapped character */
4463 continue;
4464 o1 = decode[i]>>11;
4465 o2 = (decode[i]>>7) & 0xF;
4466 i2 = 16*mlevel1[o1] + o2;
4467 if (mlevel2[i2] == 0xFF)
4468 mlevel2[i2] = count3++;
4469 o3 = decode[i] & 0x7F;
4470 i3 = 128*mlevel2[i2] + o3;
4471 mlevel3[i3] = i;
4472 }
4473 return result;
4474}
4475
4476static int
4477encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4478{
4479 struct encoding_map *map = (struct encoding_map*)mapping;
4480 int l1 = c>>11;
4481 int l2 = (c>>7) & 0xF;
4482 int l3 = c & 0x7F;
4483 int i;
4484
4485#ifdef Py_UNICODE_WIDE
4486 if (c > 0xFFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004487 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004488 }
4489#endif
4490 if (c == 0)
4491 return 0;
4492 /* level 1*/
4493 i = map->level1[l1];
4494 if (i == 0xFF) {
4495 return -1;
4496 }
4497 /* level 2*/
4498 i = map->level23[16*i+l2];
4499 if (i == 0xFF) {
4500 return -1;
4501 }
4502 /* level 3 */
4503 i = map->level23[16*map->count2 + 128*i + l3];
4504 if (i == 0) {
4505 return -1;
4506 }
4507 return i;
4508}
4509
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004510/* Lookup the character ch in the mapping. If the character
4511 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004512 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004513static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004514{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004515 PyObject *w = PyInt_FromLong((long)c);
4516 PyObject *x;
4517
4518 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004519 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004520 x = PyObject_GetItem(mapping, w);
4521 Py_DECREF(w);
4522 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004523 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4524 /* No mapping found means: mapping is undefined. */
4525 PyErr_Clear();
4526 x = Py_None;
4527 Py_INCREF(x);
4528 return x;
4529 } else
4530 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004531 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004532 else if (x == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004533 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004534 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004535 long value = PyInt_AS_LONG(x);
4536 if (value < 0 || value > 255) {
4537 PyErr_SetString(PyExc_TypeError,
4538 "character mapping must be in range(256)");
4539 Py_DECREF(x);
4540 return NULL;
4541 }
4542 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004543 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004544 else if (PyString_Check(x))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004545 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004546 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004547 /* wrong return value */
4548 PyErr_SetString(PyExc_TypeError,
4549 "character mapping must return integer, None or str");
4550 Py_DECREF(x);
4551 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004552 }
4553}
4554
Martin v. Löwis3f767792006-06-04 19:36:28 +00004555static int
4556charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4557{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004558 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4559 /* exponentially overallocate to minimize reallocations */
4560 if (requiredsize < 2*outsize)
4561 requiredsize = 2*outsize;
4562 if (_PyString_Resize(outobj, requiredsize)) {
4563 return 0;
4564 }
4565 return 1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004566}
4567
/* Outcome of trying to encode one character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was written to the output */
    enc_FAILED,         /* the character is unencodable; nothing written */
    enc_EXCEPTION       /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004571/* lookup the character, put the result in the output string and adjust
4572 various state variables. Reallocate the output string if not enough
4573 space is available. Return a new reference to the object that
4574 was put in the output buffer, or Py_None, if the mapping was undefined
4575 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004576 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004577static
Martin v. Löwis3f767792006-06-04 19:36:28 +00004578charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004579 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004580{
Martin v. Löwis3f767792006-06-04 19:36:28 +00004581 PyObject *rep;
4582 char *outstart;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004583 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004584
Christian Heimese93237d2007-12-19 02:37:44 +00004585 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004586 int res = encoding_map_lookup(c, mapping);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004587 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004588 if (res == -1)
4589 return enc_FAILED;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004590 if (outsize<requiredsize)
4591 if (!charmapencode_resize(outobj, outpos, requiredsize))
4592 return enc_EXCEPTION;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004593 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004594 outstart[(*outpos)++] = (char)res;
4595 return enc_SUCCESS;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004596 }
4597
4598 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004599 if (rep==NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004600 return enc_EXCEPTION;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004601 else if (rep==Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004602 Py_DECREF(rep);
4603 return enc_FAILED;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004604 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004605 if (PyInt_Check(rep)) {
4606 Py_ssize_t requiredsize = *outpos+1;
4607 if (outsize<requiredsize)
4608 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4609 Py_DECREF(rep);
4610 return enc_EXCEPTION;
4611 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004612 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004613 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004614 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004615 else {
4616 const char *repchars = PyString_AS_STRING(rep);
4617 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4618 Py_ssize_t requiredsize = *outpos+repsize;
4619 if (outsize<requiredsize)
4620 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4621 Py_DECREF(rep);
4622 return enc_EXCEPTION;
4623 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004624 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004625 memcpy(outstart + *outpos, repchars, repsize);
4626 *outpos += repsize;
4627 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004628 }
Georg Brandl9f167602006-06-04 21:46:16 +00004629 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004630 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004631}
4632
4633/* handle an error in PyUnicode_EncodeCharmap
4634 Return 0 on success, -1 on error */
4635static
4636int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004637 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004638 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004639 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004640 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004641{
4642 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004643 Py_ssize_t repsize;
4644 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004645 Py_UNICODE *uni2;
4646 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004647 Py_ssize_t collstartpos = *inpos;
4648 Py_ssize_t collendpos = *inpos+1;
4649 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004650 char *encoding = "charmap";
4651 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00004652 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004653
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004654 /* find all unencodable characters */
4655 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004656 PyObject *rep;
Christian Heimese93237d2007-12-19 02:37:44 +00004657 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004658 int res = encoding_map_lookup(p[collendpos], mapping);
4659 if (res != -1)
4660 break;
4661 ++collendpos;
4662 continue;
4663 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004664
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004665 rep = charmapencode_lookup(p[collendpos], mapping);
4666 if (rep==NULL)
4667 return -1;
4668 else if (rep!=Py_None) {
4669 Py_DECREF(rep);
4670 break;
4671 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004672 Py_DECREF(rep);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004673 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004674 }
4675 /* cache callback name lookup
4676 * (if not done yet, i.e. it's the first error) */
4677 if (*known_errorHandler==-1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004678 if ((errors==NULL) || (!strcmp(errors, "strict")))
4679 *known_errorHandler = 1;
4680 else if (!strcmp(errors, "replace"))
4681 *known_errorHandler = 2;
4682 else if (!strcmp(errors, "ignore"))
4683 *known_errorHandler = 3;
4684 else if (!strcmp(errors, "xmlcharrefreplace"))
4685 *known_errorHandler = 4;
4686 else
4687 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004688 }
4689 switch (*known_errorHandler) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004690 case 1: /* strict */
4691 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4692 return -1;
4693 case 2: /* replace */
4694 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004695 x = charmapencode_output('?', mapping, res, respos);
4696 if (x==enc_EXCEPTION) {
4697 return -1;
4698 }
4699 else if (x==enc_FAILED) {
4700 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4701 return -1;
4702 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004703 }
4704 /* fall through */
4705 case 3: /* ignore */
4706 *inpos = collendpos;
4707 break;
4708 case 4: /* xmlcharrefreplace */
Serhiy Storchakae822b032013-08-06 16:56:26 +03004709 /* generate replacement */
4710 for (collpos = collstartpos; collpos < collendpos;) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004711 char buffer[2+29+1+1];
4712 char *cp;
Serhiy Storchakae822b032013-08-06 16:56:26 +03004713 Py_UCS4 ch = p[collpos++];
4714#ifndef Py_UNICODE_WIDE
4715 if ((0xD800 <= ch && ch <= 0xDBFF) &&
4716 (collpos < collendpos) &&
4717 (0xDC00 <= p[collpos] && p[collpos] <= 0xDFFF)) {
4718 ch = ((((ch & 0x03FF) << 10) |
4719 ((Py_UCS4)p[collpos++] & 0x03FF)) + 0x10000);
4720 }
4721#endif
4722 sprintf(buffer, "&#%d;", (int)ch);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004723 for (cp = buffer; *cp; ++cp) {
4724 x = charmapencode_output(*cp, mapping, res, respos);
4725 if (x==enc_EXCEPTION)
4726 return -1;
4727 else if (x==enc_FAILED) {
4728 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4729 return -1;
4730 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004731 }
4732 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004733 *inpos = collendpos;
4734 break;
4735 default:
4736 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004737 encoding, reason, p, size, exceptionObject,
4738 collstartpos, collendpos, &newpos);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004739 if (repunicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004740 return -1;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004741 /* generate replacement */
4742 repsize = PyUnicode_GET_SIZE(repunicode);
4743 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004744 x = charmapencode_output(*uni2, mapping, res, respos);
4745 if (x==enc_EXCEPTION) {
4746 return -1;
4747 }
4748 else if (x==enc_FAILED) {
4749 Py_DECREF(repunicode);
4750 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4751 return -1;
4752 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004753 }
4754 *inpos = newpos;
4755 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004756 }
4757 return 0;
4758}
4759
Guido van Rossumd57fd912000-03-10 22:53:23 +00004760PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004761 Py_ssize_t size,
4762 PyObject *mapping,
4763 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004764{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004765 /* output object */
4766 PyObject *res = NULL;
4767 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004768 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004769 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004770 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004771 PyObject *errorHandler = NULL;
4772 PyObject *exc = NULL;
4773 /* the following variable is used for caching string comparisons
4774 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4775 * 3=ignore, 4=xmlcharrefreplace */
4776 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004777
4778 /* Default to Latin-1 */
4779 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004780 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004781
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004782 /* allocate enough for a simple encoding without
4783 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004784 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004785 if (res == NULL)
4786 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004787 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004788 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004789
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004790 while (inpos<size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004791 /* try to encode it */
4792 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4793 if (x==enc_EXCEPTION) /* error */
4794 goto onError;
4795 if (x==enc_FAILED) { /* unencodable character */
4796 if (charmap_encoding_error(p, size, &inpos, mapping,
4797 &exc,
4798 &known_errorHandler, &errorHandler, errors,
4799 &res, &respos)) {
4800 goto onError;
4801 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004802 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004803 else
4804 /* done with this character => adjust input position */
4805 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004806 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004807
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004808 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004809 if (respos<PyString_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004810 if (_PyString_Resize(&res, respos))
4811 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004812 }
4813 Py_XDECREF(exc);
4814 Py_XDECREF(errorHandler);
4815 return res;
4816
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004817 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004818 Py_XDECREF(res);
4819 Py_XDECREF(exc);
4820 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004821 return NULL;
4822}
4823
4824PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004825 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004826{
4827 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004828 PyErr_BadArgument();
4829 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004830 }
4831 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004832 PyUnicode_GET_SIZE(unicode),
4833 mapping,
4834 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004835}
4836
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004837/* create or adjust a UnicodeTranslateError */
4838static void make_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004839 const Py_UNICODE *unicode, Py_ssize_t size,
4840 Py_ssize_t startpos, Py_ssize_t endpos,
4841 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004842{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004843 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004844 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004845 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004846 }
4847 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004848 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4849 goto onError;
4850 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4851 goto onError;
4852 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4853 goto onError;
4854 return;
4855 onError:
Serhiy Storchaka98a97222014-02-09 13:14:04 +02004856 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004857 }
4858}
4859
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004860/* raises a UnicodeTranslateError */
4861static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004862 const Py_UNICODE *unicode, Py_ssize_t size,
4863 Py_ssize_t startpos, Py_ssize_t endpos,
4864 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004865{
4866 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004867 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004868 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004869 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004870}
4871
4872/* error handling callback helper:
4873 build arguments, call the callback and check the arguments,
4874 put the result into newpos and return the replacement string, which
4875 has to be freed by the caller */
4876static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004877 PyObject **errorHandler,
4878 const char *reason,
4879 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4880 Py_ssize_t startpos, Py_ssize_t endpos,
4881 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004882{
Martin v. Löwis412fb672006-04-13 06:34:32 +00004883 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004884
Martin v. Löwis412fb672006-04-13 06:34:32 +00004885 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004886 PyObject *restuple;
4887 PyObject *resunicode;
4888
4889 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004890 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004891 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004892 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004893 }
4894
4895 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004896 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004897 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004898 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004899
4900 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004901 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004902 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004903 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004904 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00004905 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004906 Py_DECREF(restuple);
4907 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004908 }
4909 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004910 &resunicode, &i_newpos)) {
4911 Py_DECREF(restuple);
4912 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004913 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004914 if (i_newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004915 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004916 else
4917 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004918 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004919 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4920 Py_DECREF(restuple);
4921 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004922 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004923 Py_INCREF(resunicode);
4924 Py_DECREF(restuple);
4925 return resunicode;
4926}
4927
4928/* Lookup the character ch in the mapping and put the result in result,
4929 which must be decrefed by the caller.
4930 Return 0 on success, -1 on error */
4931static
4932int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4933{
4934 PyObject *w = PyInt_FromLong((long)c);
4935 PyObject *x;
4936
4937 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004938 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004939 x = PyObject_GetItem(mapping, w);
4940 Py_DECREF(w);
4941 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004942 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4943 /* No mapping found means: use 1:1 mapping. */
4944 PyErr_Clear();
4945 *result = NULL;
4946 return 0;
4947 } else
4948 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004949 }
4950 else if (x == Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004951 *result = x;
4952 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004953 }
4954 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004955 long value = PyInt_AS_LONG(x);
4956 long max = PyUnicode_GetMax();
4957 if (value < 0 || value > max) {
4958 PyErr_Format(PyExc_TypeError,
4959 "character mapping must be in range(0x%lx)", max+1);
4960 Py_DECREF(x);
4961 return -1;
4962 }
4963 *result = x;
4964 return 0;
4965 }
4966 else if (PyUnicode_Check(x)) {
4967 *result = x;
4968 return 0;
4969 }
4970 else {
4971 /* wrong return value */
4972 PyErr_SetString(PyExc_TypeError,
4973 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004974 Py_DECREF(x);
4975 return -1;
4976 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004977}
4978/* ensure that *outobj is at least requiredsize characters long,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004979 if not reallocate and adjust various state variables.
4980 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004981static
Walter Dörwald4894c302003-10-24 14:25:28 +00004982int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004983 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004984{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004985 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004986 if (requiredsize > oldsize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004987 /* remember old output position */
4988 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4989 /* exponentially overallocate to minimize reallocations */
4990 if (requiredsize < 2 * oldsize)
4991 requiredsize = 2 * oldsize;
4992 if (PyUnicode_Resize(outobj, requiredsize) < 0)
4993 return -1;
4994 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004995 }
4996 return 0;
4997}
4998/* lookup the character, put the result in the output string and adjust
4999 various state variables. Return a new reference to the object that
5000 was put in the output buffer in *result, or Py_None, if the mapping was
5001 undefined (in which case no character was written).
5002 The called must decref result.
5003 Return 0 on success, -1 on error. */
5004static
Walter Dörwald4894c302003-10-24 14:25:28 +00005005int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005006 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5007 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005008{
Walter Dörwald4894c302003-10-24 14:25:28 +00005009 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005010 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005011 if (*res==NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005012 /* not found => default to 1:1 mapping */
5013 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005014 }
5015 else if (*res==Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005016 ;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005017 else if (PyInt_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005018 /* no overflow check, because we know that the space is enough */
5019 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005020 }
5021 else if (PyUnicode_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005022 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5023 if (repsize==1) {
5024 /* no overflow check, because we know that the space is enough */
5025 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5026 }
5027 else if (repsize!=0) {
5028 /* more than one character */
5029 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5030 (insize - (curinp-startinp)) +
5031 repsize - 1;
5032 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5033 return -1;
5034 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5035 *outp += repsize;
5036 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005037 }
5038 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005039 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005040 return 0;
5041}
5042
5043PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005044 Py_ssize_t size,
5045 PyObject *mapping,
5046 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005047{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005048 /* output object */
5049 PyObject *res = NULL;
5050 /* pointers to the beginning and end+1 of input */
5051 const Py_UNICODE *startp = p;
5052 const Py_UNICODE *endp = p + size;
5053 /* pointer into the output */
5054 Py_UNICODE *str;
5055 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005056 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005057 char *reason = "character maps to <undefined>";
5058 PyObject *errorHandler = NULL;
5059 PyObject *exc = NULL;
5060 /* the following variable is used for caching string comparisons
5061 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5062 * 3=ignore, 4=xmlcharrefreplace */
5063 int known_errorHandler = -1;
5064
Guido van Rossumd57fd912000-03-10 22:53:23 +00005065 if (mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005066 PyErr_BadArgument();
5067 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005068 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005069
5070 /* allocate enough for a simple 1:1 translation without
5071 replacements, if we need more, we'll resize */
5072 res = PyUnicode_FromUnicode(NULL, size);
5073 if (res == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005074 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005075 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005076 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005077 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005078
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005079 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005080 /* try to encode it */
5081 PyObject *x = NULL;
5082 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5083 Py_XDECREF(x);
5084 goto onError;
5085 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005086 Py_XDECREF(x);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005087 if (x!=Py_None) /* it worked => adjust input pointer */
5088 ++p;
5089 else { /* untranslatable character */
5090 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5091 Py_ssize_t repsize;
5092 Py_ssize_t newpos;
5093 Py_UNICODE *uni2;
5094 /* startpos for collecting untranslatable chars */
5095 const Py_UNICODE *collstart = p;
5096 const Py_UNICODE *collend = p+1;
5097 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005098
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005099 /* find all untranslatable characters */
5100 while (collend < endp) {
5101 if (charmaptranslate_lookup(*collend, mapping, &x))
5102 goto onError;
5103 Py_XDECREF(x);
5104 if (x!=Py_None)
5105 break;
5106 ++collend;
5107 }
5108 /* cache callback name lookup
5109 * (if not done yet, i.e. it's the first error) */
5110 if (known_errorHandler==-1) {
5111 if ((errors==NULL) || (!strcmp(errors, "strict")))
5112 known_errorHandler = 1;
5113 else if (!strcmp(errors, "replace"))
5114 known_errorHandler = 2;
5115 else if (!strcmp(errors, "ignore"))
5116 known_errorHandler = 3;
5117 else if (!strcmp(errors, "xmlcharrefreplace"))
5118 known_errorHandler = 4;
5119 else
5120 known_errorHandler = 0;
5121 }
5122 switch (known_errorHandler) {
5123 case 1: /* strict */
5124 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson857ce152009-01-31 16:29:18 +00005125 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005126 case 2: /* replace */
5127 /* No need to check for space, this is a 1:1 replacement */
5128 for (coll = collstart; coll<collend; ++coll)
5129 *str++ = '?';
5130 /* fall through */
5131 case 3: /* ignore */
5132 p = collend;
5133 break;
5134 case 4: /* xmlcharrefreplace */
5135 /* generate replacement (temporarily (mis)uses p) */
Serhiy Storchakae822b032013-08-06 16:56:26 +03005136 for (p = collstart; p < collend;) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005137 char buffer[2+29+1+1];
5138 char *cp;
Serhiy Storchakae822b032013-08-06 16:56:26 +03005139 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
5140 sprintf(buffer, "&#%d;", (int)ch);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005141 if (charmaptranslate_makespace(&res, &str,
5142 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5143 goto onError;
5144 for (cp = buffer; *cp; ++cp)
5145 *str++ = *cp;
5146 }
5147 p = collend;
5148 break;
5149 default:
5150 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5151 reason, startp, size, &exc,
5152 collstart-startp, collend-startp, &newpos);
5153 if (repunicode == NULL)
5154 goto onError;
5155 /* generate replacement */
5156 repsize = PyUnicode_GET_SIZE(repunicode);
5157 if (charmaptranslate_makespace(&res, &str,
5158 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5159 Py_DECREF(repunicode);
5160 goto onError;
5161 }
5162 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5163 *str++ = *uni2;
5164 p = startp + newpos;
5165 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00005166 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005167 }
5168 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005169 /* Resize if we allocated to much */
5170 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005171 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005172 if (PyUnicode_Resize(&res, respos) < 0)
5173 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005174 }
5175 Py_XDECREF(exc);
5176 Py_XDECREF(errorHandler);
5177 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005178
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005179 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005180 Py_XDECREF(res);
5181 Py_XDECREF(exc);
5182 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005183 return NULL;
5184}
5185
5186PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005187 PyObject *mapping,
5188 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005189{
5190 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005191
Guido van Rossumd57fd912000-03-10 22:53:23 +00005192 str = PyUnicode_FromObject(str);
5193 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005194 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005195 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005196 PyUnicode_GET_SIZE(str),
5197 mapping,
5198 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005199 Py_DECREF(str);
5200 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005201
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005202 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005203 Py_XDECREF(str);
5204 return NULL;
5205}
Tim Petersced69f82003-09-16 20:30:58 +00005206
Guido van Rossum9e896b32000-04-05 20:11:21 +00005207/* --- Decimal Encoder ---------------------------------------------------- */
5208
5209int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005210 Py_ssize_t length,
5211 char *output,
5212 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005213{
5214 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005215 PyObject *errorHandler = NULL;
5216 PyObject *exc = NULL;
5217 const char *encoding = "decimal";
5218 const char *reason = "invalid decimal Unicode string";
5219 /* the following variable is used for caching string comparisons
5220 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5221 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005222
5223 if (output == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005224 PyErr_BadArgument();
5225 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005226 }
5227
5228 p = s;
5229 end = s + length;
5230 while (p < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005231 register Py_UNICODE ch = *p;
5232 int decimal;
5233 PyObject *repunicode;
5234 Py_ssize_t repsize;
5235 Py_ssize_t newpos;
5236 Py_UNICODE *uni2;
5237 Py_UNICODE *collstart;
5238 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005239
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005240 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005241 *output++ = ' ';
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005242 ++p;
5243 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005244 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005245 decimal = Py_UNICODE_TODECIMAL(ch);
5246 if (decimal >= 0) {
5247 *output++ = '0' + decimal;
5248 ++p;
5249 continue;
5250 }
5251 if (0 < ch && ch < 256) {
5252 *output++ = (char)ch;
5253 ++p;
5254 continue;
5255 }
5256 /* All other characters are considered unencodable */
5257 collstart = p;
Victor Stinner975134e2011-11-22 01:54:19 +01005258 for (collend = p+1; collend < end; collend++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005259 if ((0 < *collend && *collend < 256) ||
Victor Stinner975134e2011-11-22 01:54:19 +01005260 Py_UNICODE_ISSPACE(*collend) ||
5261 0 <= Py_UNICODE_TODECIMAL(*collend))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005262 break;
5263 }
5264 /* cache callback name lookup
5265 * (if not done yet, i.e. it's the first error) */
5266 if (known_errorHandler==-1) {
5267 if ((errors==NULL) || (!strcmp(errors, "strict")))
5268 known_errorHandler = 1;
5269 else if (!strcmp(errors, "replace"))
5270 known_errorHandler = 2;
5271 else if (!strcmp(errors, "ignore"))
5272 known_errorHandler = 3;
5273 else if (!strcmp(errors, "xmlcharrefreplace"))
5274 known_errorHandler = 4;
5275 else
5276 known_errorHandler = 0;
5277 }
5278 switch (known_errorHandler) {
5279 case 1: /* strict */
5280 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5281 goto onError;
5282 case 2: /* replace */
5283 for (p = collstart; p < collend; ++p)
5284 *output++ = '?';
5285 /* fall through */
5286 case 3: /* ignore */
5287 p = collend;
5288 break;
5289 case 4: /* xmlcharrefreplace */
5290 /* generate replacement (temporarily (mis)uses p) */
Serhiy Storchakae822b032013-08-06 16:56:26 +03005291 for (p = collstart; p < collend;) {
5292 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
5293 output += sprintf(output, "&#%d;", ch);
5294 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005295 p = collend;
5296 break;
5297 default:
5298 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5299 encoding, reason, s, length, &exc,
5300 collstart-s, collend-s, &newpos);
5301 if (repunicode == NULL)
5302 goto onError;
5303 /* generate replacement */
5304 repsize = PyUnicode_GET_SIZE(repunicode);
5305 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5306 Py_UNICODE ch = *uni2;
5307 if (Py_UNICODE_ISSPACE(ch))
5308 *output++ = ' ';
5309 else {
5310 decimal = Py_UNICODE_TODECIMAL(ch);
5311 if (decimal >= 0)
5312 *output++ = '0' + decimal;
5313 else if (0 < ch && ch < 256)
5314 *output++ = (char)ch;
5315 else {
5316 Py_DECREF(repunicode);
5317 raise_encode_exception(&exc, encoding,
5318 s, length, collstart-s, collend-s, reason);
5319 goto onError;
5320 }
5321 }
5322 }
5323 p = s + newpos;
5324 Py_DECREF(repunicode);
5325 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005326 }
5327 /* 0-terminate the output string */
5328 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005329 Py_XDECREF(exc);
5330 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005331 return 0;
5332
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005333 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005334 Py_XDECREF(exc);
5335 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005336 return -1;
5337}
5338
Guido van Rossumd57fd912000-03-10 22:53:23 +00005339/* --- Helpers ------------------------------------------------------------ */
5340
Eric Smitha9f7d622008-02-17 19:46:49 +00005341#include "stringlib/unicodedefs.h"
Fredrik Lundha50d2012006-05-26 17:04:58 +00005342#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005343
5344#include "stringlib/count.h"
5345#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005346#include "stringlib/partition.h"
Antoine Pitrou64672132010-01-13 07:55:48 +00005347#include "stringlib/split.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005348
/* Helper macro to fix up start/end slice values against a length.
 *
 * `end` is capped at `len`.  A negative `start` or `end` has `len` added
 * once and is set to 0 if it is still negative.  A positive `start` is
 * left as is (it is not capped at `len`).
 *
 * `start` and `end` must be modifiable lvalues.  Every argument may be
 * evaluated more than once, so pass side-effect-free expressions.
 *
 * The body is wrapped in do { } while (0) so that the macro behaves as a
 * single statement (for instance as the unbraced body of an if/else);
 * invoke it with a trailing semicolon.
 */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Fredrik Lundhc8162812006-05-26 19:33:03 +00005363
Martin v. Löwis18e16552006-02-15 17:27:45 +00005364Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005365 PyObject *substr,
5366 Py_ssize_t start,
5367 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005369 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005370 PyUnicodeObject* str_obj;
5371 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005372
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005373 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5374 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005375 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005376 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5377 if (!sub_obj) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005378 Py_DECREF(str_obj);
5379 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005380 }
Tim Petersced69f82003-09-16 20:30:58 +00005381
Antoine Pitrou64672132010-01-13 07:55:48 +00005382 ADJUST_INDICES(start, end, str_obj->length);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005383 result = stringlib_count(
Antoine Pitrou64672132010-01-13 07:55:48 +00005384 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
5385 PY_SSIZE_T_MAX
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005386 );
5387
5388 Py_DECREF(sub_obj);
5389 Py_DECREF(str_obj);
5390
Guido van Rossumd57fd912000-03-10 22:53:23 +00005391 return result;
5392}
5393
Martin v. Löwis18e16552006-02-15 17:27:45 +00005394Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005395 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005396 Py_ssize_t start,
5397 Py_ssize_t end,
5398 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005399{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005400 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005401
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005402 str = PyUnicode_FromObject(str);
5403 if (!str)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005404 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005405 sub = PyUnicode_FromObject(sub);
5406 if (!sub) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005407 Py_DECREF(str);
5408 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005409 }
Tim Petersced69f82003-09-16 20:30:58 +00005410
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005411 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005412 result = stringlib_find_slice(
5413 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5414 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5415 start, end
5416 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005417 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005418 result = stringlib_rfind_slice(
5419 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5420 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5421 start, end
5422 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005423
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005424 Py_DECREF(str);
5425 Py_DECREF(sub);
5426
Guido van Rossumd57fd912000-03-10 22:53:23 +00005427 return result;
5428}
5429
Tim Petersced69f82003-09-16 20:30:58 +00005430static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005431int tailmatch(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005432 PyUnicodeObject *substring,
5433 Py_ssize_t start,
5434 Py_ssize_t end,
5435 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005436{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005437 if (substring->length == 0)
5438 return 1;
5439
Antoine Pitrou64672132010-01-13 07:55:48 +00005440 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005441 end -= substring->length;
5442 if (end < start)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005443 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005444
5445 if (direction > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005446 if (Py_UNICODE_MATCH(self, end, substring))
5447 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005448 } else {
5449 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005450 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005451 }
5452
5453 return 0;
5454}
5455
Martin v. Löwis18e16552006-02-15 17:27:45 +00005456Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005457 PyObject *substr,
5458 Py_ssize_t start,
5459 Py_ssize_t end,
5460 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005461{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005462 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005463
Guido van Rossumd57fd912000-03-10 22:53:23 +00005464 str = PyUnicode_FromObject(str);
5465 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005466 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005467 substr = PyUnicode_FromObject(substr);
5468 if (substr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005469 Py_DECREF(str);
5470 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005471 }
Tim Petersced69f82003-09-16 20:30:58 +00005472
Guido van Rossumd57fd912000-03-10 22:53:23 +00005473 result = tailmatch((PyUnicodeObject *)str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005474 (PyUnicodeObject *)substr,
5475 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005476 Py_DECREF(str);
5477 Py_DECREF(substr);
5478 return result;
5479}
5480
Guido van Rossumd57fd912000-03-10 22:53:23 +00005481/* Apply fixfct filter to the Unicode object self and return a
5482 reference to the modified object */
5483
Tim Petersced69f82003-09-16 20:30:58 +00005484static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005485PyObject *fixup(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005486 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005487{
5488
5489 PyUnicodeObject *u;
5490
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005491 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005492 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005493 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005494
5495 Py_UNICODE_COPY(u->str, self->str, self->length);
5496
Tim Peters7a29bd52001-09-12 03:03:31 +00005497 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005498 /* fixfct should return TRUE if it modified the buffer. If
5499 FALSE, return a reference to the original buffer instead
5500 (to save space, not time) */
5501 Py_INCREF(self);
5502 Py_DECREF(u);
5503 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005504 }
5505 return (PyObject*) u;
5506}
5507
Tim Petersced69f82003-09-16 20:30:58 +00005508static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005509int fixupper(PyUnicodeObject *self)
5510{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005511 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005512 Py_UNICODE *s = self->str;
5513 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005514
Guido van Rossumd57fd912000-03-10 22:53:23 +00005515 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005516 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005517
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005518 ch = Py_UNICODE_TOUPPER(*s);
5519 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005520 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005521 *s = ch;
5522 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005523 s++;
5524 }
5525
5526 return status;
5527}
5528
Tim Petersced69f82003-09-16 20:30:58 +00005529static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005530int fixlower(PyUnicodeObject *self)
5531{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005532 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005533 Py_UNICODE *s = self->str;
5534 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005535
Guido van Rossumd57fd912000-03-10 22:53:23 +00005536 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005537 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005538
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005539 ch = Py_UNICODE_TOLOWER(*s);
5540 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005541 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005542 *s = ch;
5543 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005544 s++;
5545 }
5546
5547 return status;
5548}
5549
Tim Petersced69f82003-09-16 20:30:58 +00005550static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005551int fixswapcase(PyUnicodeObject *self)
5552{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005553 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005554 Py_UNICODE *s = self->str;
5555 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005556
Guido van Rossumd57fd912000-03-10 22:53:23 +00005557 while (len-- > 0) {
5558 if (Py_UNICODE_ISUPPER(*s)) {
5559 *s = Py_UNICODE_TOLOWER(*s);
5560 status = 1;
5561 } else if (Py_UNICODE_ISLOWER(*s)) {
5562 *s = Py_UNICODE_TOUPPER(*s);
5563 status = 1;
5564 }
5565 s++;
5566 }
5567
5568 return status;
5569}
5570
Tim Petersced69f82003-09-16 20:30:58 +00005571static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005572int fixcapitalize(PyUnicodeObject *self)
5573{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005574 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005575 Py_UNICODE *s = self->str;
5576 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005577
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005578 if (len == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005579 return 0;
Ezio Melotti15d6b652011-08-15 09:22:24 +03005580 if (!Py_UNICODE_ISUPPER(*s)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005581 *s = Py_UNICODE_TOUPPER(*s);
5582 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005583 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005584 s++;
5585 while (--len > 0) {
Ezio Melotti15d6b652011-08-15 09:22:24 +03005586 if (!Py_UNICODE_ISLOWER(*s)) {
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005587 *s = Py_UNICODE_TOLOWER(*s);
5588 status = 1;
5589 }
5590 s++;
5591 }
5592 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005593}
5594
5595static
5596int fixtitle(PyUnicodeObject *self)
5597{
5598 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5599 register Py_UNICODE *e;
5600 int previous_is_cased;
5601
5602 /* Shortcut for single character strings */
5603 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005604 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5605 if (*p != ch) {
5606 *p = ch;
5607 return 1;
5608 }
5609 else
5610 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005611 }
Tim Petersced69f82003-09-16 20:30:58 +00005612
Guido van Rossumd57fd912000-03-10 22:53:23 +00005613 e = p + PyUnicode_GET_SIZE(self);
5614 previous_is_cased = 0;
5615 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005616 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005617
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005618 if (previous_is_cased)
5619 *p = Py_UNICODE_TOLOWER(ch);
5620 else
5621 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005622
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005623 if (Py_UNICODE_ISLOWER(ch) ||
5624 Py_UNICODE_ISUPPER(ch) ||
5625 Py_UNICODE_ISTITLE(ch))
5626 previous_is_cased = 1;
5627 else
5628 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005629 }
5630 return 1;
5631}
5632
Tim Peters8ce9f162004-08-27 01:49:32 +00005633PyObject *
5634PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005635{
Tim Peters8ce9f162004-08-27 01:49:32 +00005636 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005637 const Py_UNICODE blank = ' ';
5638 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00005639 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005640 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00005641 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5642 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005643 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5644 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005645 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005646 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005647 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005648
Benjamin Peterson7a91bf82014-02-15 13:02:52 -05005649 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00005650 if (fseq == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005651 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005652 }
5653
Tim Peters91879ab2004-08-27 22:35:44 +00005654 /* Grrrr. A codec may be invoked to convert str objects to
5655 * Unicode, and so it's possible to call back into Python code
5656 * during PyUnicode_FromObject(), and so it's possible for a sick
5657 * codec to change the size of fseq (if seq is a list). Therefore
5658 * we have to keep refetching the size -- can't assume seqlen
5659 * is invariant.
5660 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005661 seqlen = PySequence_Fast_GET_SIZE(fseq);
5662 /* If empty sequence, return u"". */
5663 if (seqlen == 0) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005664 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5665 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00005666 }
5667 /* If singleton sequence with an exact Unicode, return that. */
5668 if (seqlen == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005669 item = PySequence_Fast_GET_ITEM(fseq, 0);
5670 if (PyUnicode_CheckExact(item)) {
5671 Py_INCREF(item);
5672 res = (PyUnicodeObject *)item;
5673 goto Done;
5674 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005675 }
5676
Tim Peters05eba1f2004-08-27 21:32:02 +00005677 /* At least two items to join, or one that isn't exact Unicode. */
5678 if (seqlen > 1) {
5679 /* Set up sep and seplen -- they're needed. */
Benjamin Peterson857ce152009-01-31 16:29:18 +00005680 if (separator == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005681 sep = &blank;
5682 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005683 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005684 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005685 internal_separator = PyUnicode_FromObject(separator);
5686 if (internal_separator == NULL)
5687 goto onError;
5688 sep = PyUnicode_AS_UNICODE(internal_separator);
5689 seplen = PyUnicode_GET_SIZE(internal_separator);
5690 /* In case PyUnicode_FromObject() mutated seq. */
5691 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005692 }
5693 }
5694
5695 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005696 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005697 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005698 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005699 res_p = PyUnicode_AS_UNICODE(res);
5700 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005701
Tim Peters05eba1f2004-08-27 21:32:02 +00005702 for (i = 0; i < seqlen; ++i) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005703 Py_ssize_t itemlen;
5704 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005705
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005706 item = PySequence_Fast_GET_ITEM(fseq, i);
5707 /* Convert item to Unicode. */
5708 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5709 PyErr_Format(PyExc_TypeError,
5710 "sequence item %zd: expected string or Unicode,"
5711 " %.80s found",
5712 i, Py_TYPE(item)->tp_name);
5713 goto onError;
5714 }
5715 item = PyUnicode_FromObject(item);
5716 if (item == NULL)
5717 goto onError;
5718 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005719
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005720 /* In case PyUnicode_FromObject() mutated seq. */
5721 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters91879ab2004-08-27 22:35:44 +00005722
Tim Peters8ce9f162004-08-27 01:49:32 +00005723 /* Make sure we have enough space for the separator and the item. */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005724 itemlen = PyUnicode_GET_SIZE(item);
5725 new_res_used = res_used + itemlen;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005726 if (new_res_used < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005727 goto Overflow;
5728 if (i < seqlen - 1) {
5729 new_res_used += seplen;
5730 if (new_res_used < 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00005731 goto Overflow;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005732 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005733 if (new_res_used > res_alloc) {
5734 /* double allocated size until it's big enough */
5735 do {
5736 res_alloc += res_alloc;
5737 if (res_alloc <= 0)
5738 goto Overflow;
5739 } while (new_res_used > res_alloc);
5740 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5741 Py_DECREF(item);
5742 goto onError;
5743 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005744 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005745 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005746
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005747 /* Copy item, and maybe the separator. */
5748 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5749 res_p += itemlen;
5750 if (i < seqlen - 1) {
5751 Py_UNICODE_COPY(res_p, sep, seplen);
5752 res_p += seplen;
5753 }
5754 Py_DECREF(item);
5755 res_used = new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005756 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005757
Tim Peters05eba1f2004-08-27 21:32:02 +00005758 /* Shrink res to match the used area; this probably can't fail,
5759 * but it's cheap to check.
5760 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005761 if (_PyUnicode_Resize(&res, res_used) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005762 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005763
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005764 Done:
Tim Peters8ce9f162004-08-27 01:49:32 +00005765 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005766 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005767 return (PyObject *)res;
5768
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005769 Overflow:
Tim Peters8ce9f162004-08-27 01:49:32 +00005770 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00005771 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005772 Py_DECREF(item);
5773 /* fall through */
5774
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005775 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005776 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005777 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005778 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005779 return NULL;
5780}
5781
Tim Petersced69f82003-09-16 20:30:58 +00005782static
5783PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005784 Py_ssize_t left,
5785 Py_ssize_t right,
5786 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005787{
5788 PyUnicodeObject *u;
5789
5790 if (left < 0)
5791 left = 0;
5792 if (right < 0)
5793 right = 0;
5794
Tim Peters7a29bd52001-09-12 03:03:31 +00005795 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005796 Py_INCREF(self);
5797 return self;
5798 }
5799
Neal Norwitze7d8be82008-07-31 17:17:14 +00005800 if (left > PY_SSIZE_T_MAX - self->length ||
5801 right > PY_SSIZE_T_MAX - (left + self->length)) {
5802 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5803 return NULL;
5804 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005805 u = _PyUnicode_New(left + self->length + right);
5806 if (u) {
5807 if (left)
5808 Py_UNICODE_FILL(u->str, fill, left);
5809 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5810 if (right)
5811 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5812 }
5813
5814 return u;
5815}
5816
Antoine Pitrou64672132010-01-13 07:55:48 +00005817PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005818{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005819 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005820
5821 string = PyUnicode_FromObject(string);
5822 if (string == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005823 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005824
Antoine Pitrou64672132010-01-13 07:55:48 +00005825 list = stringlib_splitlines(
5826 (PyObject*) string, PyUnicode_AS_UNICODE(string),
5827 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005828
5829 Py_DECREF(string);
5830 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005831}
5832
Tim Petersced69f82003-09-16 20:30:58 +00005833static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005834PyObject *split(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005835 PyUnicodeObject *substring,
5836 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005837{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005838 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005839 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005840
Guido van Rossumd57fd912000-03-10 22:53:23 +00005841 if (substring == NULL)
Antoine Pitrou64672132010-01-13 07:55:48 +00005842 return stringlib_split_whitespace(
5843 (PyObject*) self, self->str, self->length, maxcount
5844 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005845
Antoine Pitrou64672132010-01-13 07:55:48 +00005846 return stringlib_split(
5847 (PyObject*) self, self->str, self->length,
5848 substring->str, substring->length,
5849 maxcount
5850 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005851}
5852
Tim Petersced69f82003-09-16 20:30:58 +00005853static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005854PyObject *rsplit(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005855 PyUnicodeObject *substring,
5856 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005857{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005858 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005859 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005860
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005861 if (substring == NULL)
Antoine Pitrou64672132010-01-13 07:55:48 +00005862 return stringlib_rsplit_whitespace(
5863 (PyObject*) self, self->str, self->length, maxcount
5864 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005865
Antoine Pitrou64672132010-01-13 07:55:48 +00005866 return stringlib_rsplit(
5867 (PyObject*) self, self->str, self->length,
5868 substring->str, substring->length,
5869 maxcount
5870 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005871}
5872
5873static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005874PyObject *replace(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005875 PyUnicodeObject *str1,
5876 PyUnicodeObject *str2,
5877 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005878{
5879 PyUnicodeObject *u;
5880
5881 if (maxcount < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005882 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrou64672132010-01-13 07:55:48 +00005883 else if (maxcount == 0 || self->length == 0)
5884 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005885
Fredrik Lundh347ee272006-05-24 16:35:18 +00005886 if (str1->length == str2->length) {
Antoine Pitrou5c767c22010-01-13 08:55:20 +00005887 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005888 /* same length */
Antoine Pitrou64672132010-01-13 07:55:48 +00005889 if (str1->length == 0)
5890 goto nothing;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005891 if (str1->length == 1) {
5892 /* replace characters */
5893 Py_UNICODE u1, u2;
5894 if (!findchar(self->str, self->length, str1->str[0]))
5895 goto nothing;
5896 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5897 if (!u)
5898 return NULL;
5899 Py_UNICODE_COPY(u->str, self->str, self->length);
5900 u1 = str1->str[0];
5901 u2 = str2->str[0];
5902 for (i = 0; i < u->length; i++)
5903 if (u->str[i] == u1) {
5904 if (--maxcount < 0)
5905 break;
5906 u->str[i] = u2;
5907 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005908 } else {
Antoine Pitrou64672132010-01-13 07:55:48 +00005909 i = stringlib_find(
5910 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00005911 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005912 if (i < 0)
5913 goto nothing;
5914 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5915 if (!u)
5916 return NULL;
5917 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrou64672132010-01-13 07:55:48 +00005918
5919 /* change everything in-place, starting with this one */
5920 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5921 i += str1->length;
5922
5923 while ( --maxcount > 0) {
5924 i = stringlib_find(self->str+i, self->length-i,
5925 str1->str, str1->length,
5926 i);
5927 if (i == -1)
5928 break;
5929 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5930 i += str1->length;
5931 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005932 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005933 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005934
Brett Cannona7f13ee2010-05-04 01:16:51 +00005935 Py_ssize_t n, i, j;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005936 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005937 Py_UNICODE *p;
5938
5939 /* replace strings */
Antoine Pitrou64672132010-01-13 07:55:48 +00005940 n = stringlib_count(self->str, self->length, str1->str, str1->length,
5941 maxcount);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005942 if (n == 0)
5943 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005944 /* new_size = self->length + n * (str2->length - str1->length)); */
5945 delta = (str2->length - str1->length);
5946 if (delta == 0) {
5947 new_size = self->length;
5948 } else {
5949 product = n * (str2->length - str1->length);
5950 if ((product / (str2->length - str1->length)) != n) {
5951 PyErr_SetString(PyExc_OverflowError,
5952 "replace string is too long");
5953 return NULL;
5954 }
5955 new_size = self->length + product;
5956 if (new_size < 0) {
5957 PyErr_SetString(PyExc_OverflowError,
5958 "replace string is too long");
5959 return NULL;
5960 }
5961 }
5962 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005963 if (!u)
5964 return NULL;
5965 i = 0;
5966 p = u->str;
5967 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005968 while (n-- > 0) {
5969 /* look for next match */
Antoine Pitrou64672132010-01-13 07:55:48 +00005970 j = stringlib_find(self->str+i, self->length-i,
5971 str1->str, str1->length,
5972 i);
5973 if (j == -1)
5974 break;
5975 else if (j > i) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005976 /* copy unchanged part [i:j] */
5977 Py_UNICODE_COPY(p, self->str+i, j-i);
5978 p += j - i;
5979 }
5980 /* copy substitution string */
5981 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005982 Py_UNICODE_COPY(p, str2->str, str2->length);
5983 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005984 }
5985 i = j + str1->length;
5986 }
5987 if (i < self->length)
5988 /* copy tail [i:] */
5989 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005990 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005991 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00005992 while (n > 0) {
5993 Py_UNICODE_COPY(p, str2->str, str2->length);
5994 p += str2->length;
5995 if (--n <= 0)
5996 break;
5997 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005998 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00005999 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006000 }
6001 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006002 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00006003
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006004 nothing:
Fredrik Lundh347ee272006-05-24 16:35:18 +00006005 /* nothing to replace; return original string (when possible) */
6006 if (PyUnicode_CheckExact(self)) {
6007 Py_INCREF(self);
6008 return (PyObject *) self;
6009 }
6010 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006011}
6012
6013/* --- Unicode Object Methods --------------------------------------------- */
6014
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006015PyDoc_STRVAR(title__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006016 "S.title() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006017\n\
6018Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006019characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020
6021static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006022unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006024 return fixup(self, fixtitle);
6025}
6026
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006027PyDoc_STRVAR(capitalize__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006028 "S.capitalize() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006029\n\
6030Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumaran5261b102010-07-05 12:04:07 +00006031have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006032
6033static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006034unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006035{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006036 return fixup(self, fixcapitalize);
6037}
6038
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> unicode\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled code: split self on whitespace, run fixup(..., fixcapitalize)
   on every word, and join the words again with PyUnicode_Join(NULL, ...).
   Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entries in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capped = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                                 fixcapitalize);
        if (capped == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capped);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

  done:
    Py_DECREF(words);
    return result;
}
#endif
6076
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006077/* Argument converter. Coerces to a single unicode character */
6078
6079static int
6080convert_uc(PyObject *obj, void *addr)
6081{
Benjamin Peterson857ce152009-01-31 16:29:18 +00006082 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6083 PyObject *uniobj;
6084 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006085
Benjamin Peterson857ce152009-01-31 16:29:18 +00006086 uniobj = PyUnicode_FromObject(obj);
6087 if (uniobj == NULL) {
6088 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006089 "The fill character cannot be converted to Unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00006090 return 0;
6091 }
6092 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6093 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006094 "The fill character must be exactly one character long");
Benjamin Peterson857ce152009-01-31 16:29:18 +00006095 Py_DECREF(uniobj);
6096 return 0;
6097 }
6098 unistr = PyUnicode_AS_UNICODE(uniobj);
6099 *fillcharloc = unistr[0];
6100 Py_DECREF(uniobj);
6101 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006102}
6103
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006104PyDoc_STRVAR(center__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006105 "S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006106\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006107Return S centered in a Unicode string of length width. Padding is\n\
6108done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006109
6110static PyObject *
6111unicode_center(PyUnicodeObject *self, PyObject *args)
6112{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006113 Py_ssize_t marg, left;
6114 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006115 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006116
Thomas Woutersde017742006-02-16 19:34:37 +00006117 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006118 return NULL;
6119
Tim Peters7a29bd52001-09-12 03:03:31 +00006120 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006121 Py_INCREF(self);
6122 return (PyObject*) self;
6123 }
6124
6125 marg = width - self->length;
6126 left = marg / 2 + (marg & width & 1);
6127
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006128 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006129}
6130
Marc-André Lemburge5034372000-08-08 08:04:29 +00006131#if 0
6132
6133/* This code should go into some future Unicode collation support
6134 module. The basic comparison should compare ordinals on a naive
Georg Brandl18187e22009-06-06 18:21:58 +00006135 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006136
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006137/* speedy UTF-16 code point order comparison */
6138/* gleaned from: */
6139/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6140
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006141static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006142{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006143 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006144 0, 0, 0, 0, 0, 0, 0, 0,
6145 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006146 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006147};
6148
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149static int
6150unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6151{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006152 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006153
Guido van Rossumd57fd912000-03-10 22:53:23 +00006154 Py_UNICODE *s1 = str1->str;
6155 Py_UNICODE *s2 = str2->str;
6156
6157 len1 = str1->length;
6158 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006159
Guido van Rossumd57fd912000-03-10 22:53:23 +00006160 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006161 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006162
6163 c1 = *s1++;
6164 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006165
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006166 if (c1 > (1<<11) * 26)
6167 c1 += utf16Fixup[c1>>11];
6168 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006169 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006170 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006171
6172 if (c1 != c2)
6173 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006174
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006175 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006176 }
6177
6178 return (len1 < len2) ? -1 : (len1 != len2);
6179}
6180
Marc-André Lemburge5034372000-08-08 08:04:29 +00006181#else
6182
6183static int
6184unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6185{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006186 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006187
6188 Py_UNICODE *s1 = str1->str;
6189 Py_UNICODE *s2 = str2->str;
6190
6191 len1 = str1->length;
6192 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006193
Marc-André Lemburge5034372000-08-08 08:04:29 +00006194 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006195 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006196
Fredrik Lundh45714e92001-06-26 16:39:36 +00006197 c1 = *s1++;
6198 c2 = *s2++;
6199
6200 if (c1 != c2)
6201 return (c1 < c2) ? -1 : 1;
6202
Marc-André Lemburge5034372000-08-08 08:04:29 +00006203 len1--; len2--;
6204 }
6205
6206 return (len1 < len2) ? -1 : (len1 != len2);
6207}
6208
6209#endif
6210
Guido van Rossumd57fd912000-03-10 22:53:23 +00006211int PyUnicode_Compare(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006212 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213{
6214 PyUnicodeObject *u = NULL, *v = NULL;
6215 int result;
6216
6217 /* Coerce the two arguments */
6218 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6219 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006220 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006221 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6222 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006223 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006224
Thomas Wouters7e474022000-07-16 12:04:32 +00006225 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006226 if (v == u) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006227 Py_DECREF(u);
6228 Py_DECREF(v);
6229 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006230 }
6231
6232 result = unicode_compare(u, v);
6233
6234 Py_DECREF(u);
6235 Py_DECREF(v);
6236 return result;
6237
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006238 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006239 Py_XDECREF(u);
6240 Py_XDECREF(v);
6241 return -1;
6242}
6243
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006244PyObject *PyUnicode_RichCompare(PyObject *left,
6245 PyObject *right,
6246 int op)
6247{
6248 int result;
6249
6250 result = PyUnicode_Compare(left, right);
6251 if (result == -1 && PyErr_Occurred())
6252 goto onError;
6253
6254 /* Convert the return value to a Boolean */
6255 switch (op) {
6256 case Py_EQ:
6257 result = (result == 0);
6258 break;
6259 case Py_NE:
6260 result = (result != 0);
6261 break;
6262 case Py_LE:
6263 result = (result <= 0);
6264 break;
6265 case Py_GE:
6266 result = (result >= 0);
6267 break;
6268 case Py_LT:
6269 result = (result == -1);
6270 break;
6271 case Py_GT:
6272 result = (result == 1);
6273 break;
6274 }
6275 return PyBool_FromLong(result);
6276
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006277 onError:
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006278
6279 /* Standard case
6280
6281 Type errors mean that PyUnicode_FromObject() could not convert
6282 one of the arguments (usually the right hand side) to Unicode,
6283 ie. we can't handle the comparison request. However, it is
6284 possible that the other object knows a comparison method, which
6285 is why we return Py_NotImplemented to give the other object a
6286 chance.
6287
6288 */
6289 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6290 PyErr_Clear();
6291 Py_INCREF(Py_NotImplemented);
6292 return Py_NotImplemented;
6293 }
6294 if (op != Py_EQ && op != Py_NE)
6295 return NULL;
6296
6297 /* Equality comparison.
6298
6299 This is a special case: we silence any PyExc_UnicodeDecodeError
6300 and instead turn it into a PyErr_UnicodeWarning.
6301
6302 */
6303 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6304 return NULL;
6305 PyErr_Clear();
Benjamin Peterson857ce152009-01-31 16:29:18 +00006306 if (PyErr_Warn(PyExc_UnicodeWarning,
6307 (op == Py_EQ) ?
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006308 "Unicode equal comparison "
6309 "failed to convert both arguments to Unicode - "
6310 "interpreting them as being unequal" :
6311 "Unicode unequal comparison "
6312 "failed to convert both arguments to Unicode - "
6313 "interpreting them as being unequal"
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006314 ) < 0)
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006315 return NULL;
6316 result = (op == Py_NE);
6317 return PyBool_FromLong(result);
6318}
6319
Guido van Rossum403d68b2000-03-13 15:55:09 +00006320int PyUnicode_Contains(PyObject *container,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006321 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006322{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006323 PyObject *str, *sub;
6324 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006325
6326 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006327 sub = PyUnicode_FromObject(element);
6328 if (!sub) {
Fredrik Lundh833bf942006-05-23 10:12:21 +00006329 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006330 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00006331
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006332 str = PyUnicode_FromObject(container);
6333 if (!str) {
6334 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00006335 return -1;
6336 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00006337
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006338 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00006339
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006340 Py_DECREF(str);
6341 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00006342
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006343 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006344}
6345
Guido van Rossumd57fd912000-03-10 22:53:23 +00006346/* Concat to string or Unicode object giving a new Unicode object. */
6347
6348PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006349 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006350{
6351 PyUnicodeObject *u = NULL, *v = NULL, *w;
6352
6353 /* Coerce the two arguments */
6354 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6355 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006356 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006357 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6358 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006359 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006360
6361 /* Shortcuts */
6362 if (v == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006363 Py_DECREF(v);
6364 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006365 }
6366 if (u == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006367 Py_DECREF(u);
6368 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006369 }
6370
6371 /* Concat the two Unicode strings */
6372 w = _PyUnicode_New(u->length + v->length);
6373 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006374 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006375 Py_UNICODE_COPY(w->str, u->str, u->length);
6376 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6377
6378 Py_DECREF(u);
6379 Py_DECREF(v);
6380 return (PyObject *)w;
6381
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006382 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006383 Py_XDECREF(u);
6384 Py_XDECREF(v);
6385 return NULL;
6386}
6387
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006388PyDoc_STRVAR(count__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006389 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006390\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00006391Return the number of non-overlapping occurrences of substring sub in\n\
6392Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006393interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006394
6395static PyObject *
6396unicode_count(PyUnicodeObject *self, PyObject *args)
6397{
6398 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006399 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006400 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006401 PyObject *result;
6402
Jesus Cea44e81682011-04-20 16:39:15 +02006403 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
6404 &start, &end))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006405 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006406
Antoine Pitrou64672132010-01-13 07:55:48 +00006407 ADJUST_INDICES(start, end, self->length);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006408 result = PyInt_FromSsize_t(
6409 stringlib_count(self->str + start, end - start,
Antoine Pitrou64672132010-01-13 07:55:48 +00006410 substring->str, substring->length,
6411 PY_SSIZE_T_MAX)
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006412 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006413
6414 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006415
Guido van Rossumd57fd912000-03-10 22:53:23 +00006416 return result;
6417}
6418
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006419PyDoc_STRVAR(encode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006420 "S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006421\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006422Encodes S using the codec registered for encoding. encoding defaults\n\
6423to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006424handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006425a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6426'xmlcharrefreplace' as well as any other name registered with\n\
6427codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006428
6429static PyObject *
Benjamin Peterson332d7212009-09-18 21:14:55 +00006430unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006431{
Benjamin Peterson332d7212009-09-18 21:14:55 +00006432 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00006433 char *encoding = NULL;
6434 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006435 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006436
Benjamin Peterson332d7212009-09-18 21:14:55 +00006437 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
6438 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006439 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006440 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006441 if (v == NULL)
6442 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006443 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006444 PyErr_Format(PyExc_TypeError,
6445 "encoder did not return a string/unicode object "
6446 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006447 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006448 Py_DECREF(v);
6449 return NULL;
6450 }
6451 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006452
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006453 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006454 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006455}
6456
6457PyDoc_STRVAR(decode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006458 "S.decode([encoding[,errors]]) -> string or unicode\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006459\n\
6460Decodes S using the codec registered for encoding. encoding defaults\n\
6461to the default encoding. errors may be given to set a different error\n\
6462handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6463a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
Éric Araujoa4c81b02012-02-20 02:07:31 +01006464as well as any other name registered with codecs.register_error that is\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006465able to handle UnicodeDecodeErrors.");
6466
6467static PyObject *
Benjamin Peterson332d7212009-09-18 21:14:55 +00006468unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006469{
Benjamin Peterson332d7212009-09-18 21:14:55 +00006470 static char *kwlist[] = {"encoding", "errors", 0};
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006471 char *encoding = NULL;
6472 char *errors = NULL;
6473 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006474
Benjamin Peterson332d7212009-09-18 21:14:55 +00006475 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",
6476 kwlist, &encoding, &errors))
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006477 return NULL;
6478 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006479 if (v == NULL)
6480 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006481 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006482 PyErr_Format(PyExc_TypeError,
6483 "decoder did not return a string/unicode object "
6484 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006485 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006486 Py_DECREF(v);
6487 return NULL;
6488 }
6489 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006490
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006491 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006492 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006493}
6494
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006495PyDoc_STRVAR(expandtabs__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006496 "S.expandtabs([tabsize]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006497\n\
6498Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006499If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006500
6501static PyObject*
6502unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6503{
6504 Py_UNICODE *e;
6505 Py_UNICODE *p;
6506 Py_UNICODE *q;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006507 Py_UNICODE *qe;
6508 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006509 PyUnicodeObject *u;
6510 int tabsize = 8;
6511
6512 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006513 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006514
Thomas Wouters7e474022000-07-16 12:04:32 +00006515 /* First pass: determine size of output string */
Guido van Rossum5bdff602008-03-11 21:18:06 +00006516 i = 0; /* chars up to and including most recent \n or \r */
6517 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6518 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006519 for (p = self->str; p < e; p++)
6520 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006521 if (tabsize > 0) {
6522 incr = tabsize - (j % tabsize); /* cannot overflow */
6523 if (j > PY_SSIZE_T_MAX - incr)
6524 goto overflow1;
6525 j += incr;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006526 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006527 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006528 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006529 if (j > PY_SSIZE_T_MAX - 1)
6530 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006531 j++;
6532 if (*p == '\n' || *p == '\r') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006533 if (i > PY_SSIZE_T_MAX - j)
6534 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006535 i += j;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006536 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006537 }
6538 }
6539
Guido van Rossum5bdff602008-03-11 21:18:06 +00006540 if (i > PY_SSIZE_T_MAX - j)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006541 goto overflow1;
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006542
Guido van Rossumd57fd912000-03-10 22:53:23 +00006543 /* Second pass: create output string and fill it */
6544 u = _PyUnicode_New(i + j);
6545 if (!u)
6546 return NULL;
6547
Guido van Rossum5bdff602008-03-11 21:18:06 +00006548 j = 0; /* same as in first pass */
6549 q = u->str; /* next output char */
6550 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006551
6552 for (p = self->str; p < e; p++)
6553 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006554 if (tabsize > 0) {
6555 i = tabsize - (j % tabsize);
6556 j += i;
6557 while (i--) {
6558 if (q >= qe)
6559 goto overflow2;
6560 *q++ = ' ';
Guido van Rossum5bdff602008-03-11 21:18:06 +00006561 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006562 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00006563 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006564 else {
6565 if (q >= qe)
6566 goto overflow2;
6567 *q++ = *p;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006568 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006569 if (*p == '\n' || *p == '\r')
6570 j = 0;
6571 }
6572
6573 return (PyObject*) u;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006574
6575 overflow2:
6576 Py_DECREF(u);
6577 overflow1:
6578 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6579 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006580}
6581
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006582PyDoc_STRVAR(find__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006583 "S.find(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006584\n\
6585Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran5e3a19d2011-07-27 23:36:51 +08006586such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006587arguments start and end are interpreted as in slice notation.\n\
6588\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006589Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006590
6591static PyObject *
6592unicode_find(PyUnicodeObject *self, PyObject *args)
6593{
Jesus Cea44e81682011-04-20 16:39:15 +02006594 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006595 Py_ssize_t start;
6596 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006597 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006598
Jesus Cea44e81682011-04-20 16:39:15 +02006599 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
6600 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006601 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006602
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006603 result = stringlib_find_slice(
6604 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6605 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6606 start, end
6607 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608
6609 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006610
6611 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612}
6613
6614static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006615unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006616{
6617 if (index < 0 || index >= self->length) {
6618 PyErr_SetString(PyExc_IndexError, "string index out of range");
6619 return NULL;
6620 }
6621
6622 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6623}
6624
6625static long
6626unicode_hash(PyUnicodeObject *self)
6627{
Fredrik Lundhdde61642000-07-10 18:27:47 +00006628 /* Since Unicode objects compare equal to their ASCII string
6629 counterparts, they should use the individual character values
6630 as basis for their hash value. This is needed to assure that
6631 strings and Unicode objects behave in the same way as
6632 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006633
Martin v. Löwis18e16552006-02-15 17:27:45 +00006634 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006635 register Py_UNICODE *p;
6636 register long x;
6637
Benjamin Petersonf51c3842012-04-09 14:53:07 -04006638#ifdef Py_DEBUG
Benjamin Peterson26da9202012-02-21 11:08:50 -05006639 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf51c3842012-04-09 14:53:07 -04006640#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006641 if (self->hash != -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006642 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006643 len = PyUnicode_GET_SIZE(self);
Barry Warsaw1e13eb02012-02-20 20:42:21 -05006644 /*
6645 We make the hash of the empty string be 0, rather than using
6646 (prefix ^ suffix), since this slightly obfuscates the hash secret
6647 */
6648 if (len == 0) {
6649 self->hash = 0;
6650 return 0;
6651 }
Fredrik Lundhdde61642000-07-10 18:27:47 +00006652 p = PyUnicode_AS_UNICODE(self);
Barry Warsaw1e13eb02012-02-20 20:42:21 -05006653 x = _Py_HashSecret.prefix;
6654 x ^= *p << 7;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006655 while (--len >= 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006656 x = (1000003*x) ^ *p++;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006657 x ^= PyUnicode_GET_SIZE(self);
Barry Warsaw1e13eb02012-02-20 20:42:21 -05006658 x ^= _Py_HashSecret.suffix;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006659 if (x == -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006660 x = -2;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006661 self->hash = x;
6662 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006663}
6664
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006665PyDoc_STRVAR(index__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006666 "S.index(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006667\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006668Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006669
6670static PyObject *
6671unicode_index(PyUnicodeObject *self, PyObject *args)
6672{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006673 Py_ssize_t result;
Jesus Cea44e81682011-04-20 16:39:15 +02006674 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006675 Py_ssize_t start;
6676 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006677
Jesus Cea44e81682011-04-20 16:39:15 +02006678 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
6679 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006680 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006681
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006682 result = stringlib_find_slice(
6683 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6684 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6685 start, end
6686 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006687
6688 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006689
Guido van Rossumd57fd912000-03-10 22:53:23 +00006690 if (result < 0) {
6691 PyErr_SetString(PyExc_ValueError, "substring not found");
6692 return NULL;
6693 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006694
Martin v. Löwis18e16552006-02-15 17:27:45 +00006695 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006696}
6697
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006698PyDoc_STRVAR(islower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006699 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006700\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006701Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006702at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006703
6704static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006705unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006706{
6707 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6708 register const Py_UNICODE *e;
6709 int cased;
6710
Guido van Rossumd57fd912000-03-10 22:53:23 +00006711 /* Shortcut for single character strings */
6712 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006713 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006714
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006715 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006716 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006717 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006718
Guido van Rossumd57fd912000-03-10 22:53:23 +00006719 e = p + PyUnicode_GET_SIZE(self);
6720 cased = 0;
6721 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006722 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006723
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006724 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6725 return PyBool_FromLong(0);
6726 else if (!cased && Py_UNICODE_ISLOWER(ch))
6727 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006728 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006729 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006730}
6731
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006732PyDoc_STRVAR(isupper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006733 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006734\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006735Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006736at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006737
6738static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006739unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006740{
6741 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6742 register const Py_UNICODE *e;
6743 int cased;
6744
Guido van Rossumd57fd912000-03-10 22:53:23 +00006745 /* Shortcut for single character strings */
6746 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006747 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006748
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006749 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006750 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006751 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006752
Guido van Rossumd57fd912000-03-10 22:53:23 +00006753 e = p + PyUnicode_GET_SIZE(self);
6754 cased = 0;
6755 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006756 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006757
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006758 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6759 return PyBool_FromLong(0);
6760 else if (!cased && Py_UNICODE_ISUPPER(ch))
6761 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006762 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006763 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006764}
6765
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006766PyDoc_STRVAR(istitle__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006767 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006768\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006769Return True if S is a titlecased string and there is at least one\n\
6770character in S, i.e. upper- and titlecase characters may only\n\
6771follow uncased characters and lowercase characters only cased ones.\n\
6772Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006773
6774static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006775unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006776{
6777 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6778 register const Py_UNICODE *e;
6779 int cased, previous_is_cased;
6780
Guido van Rossumd57fd912000-03-10 22:53:23 +00006781 /* Shortcut for single character strings */
6782 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006783 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6784 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006785
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006786 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006787 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006788 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006789
Guido van Rossumd57fd912000-03-10 22:53:23 +00006790 e = p + PyUnicode_GET_SIZE(self);
6791 cased = 0;
6792 previous_is_cased = 0;
6793 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006794 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006795
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006796 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6797 if (previous_is_cased)
6798 return PyBool_FromLong(0);
6799 previous_is_cased = 1;
6800 cased = 1;
6801 }
6802 else if (Py_UNICODE_ISLOWER(ch)) {
6803 if (!previous_is_cased)
6804 return PyBool_FromLong(0);
6805 previous_is_cased = 1;
6806 cased = 1;
6807 }
6808 else
6809 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006810 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006811 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006812}
6813
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006814PyDoc_STRVAR(isspace__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006815 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006816\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006817Return True if all characters in S are whitespace\n\
6818and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006819
6820static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006821unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006822{
6823 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6824 register const Py_UNICODE *e;
6825
Guido van Rossumd57fd912000-03-10 22:53:23 +00006826 /* Shortcut for single character strings */
6827 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006828 Py_UNICODE_ISSPACE(*p))
6829 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006830
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006831 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006832 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006833 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006834
Guido van Rossumd57fd912000-03-10 22:53:23 +00006835 e = p + PyUnicode_GET_SIZE(self);
6836 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006837 if (!Py_UNICODE_ISSPACE(*p))
6838 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006839 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006840 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006841}
6842
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006843PyDoc_STRVAR(isalpha__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006844 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006845\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006846Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006847and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006848
6849static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006850unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006851{
6852 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6853 register const Py_UNICODE *e;
6854
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006855 /* Shortcut for single character strings */
6856 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006857 Py_UNICODE_ISALPHA(*p))
6858 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006859
6860 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006861 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006862 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006863
6864 e = p + PyUnicode_GET_SIZE(self);
6865 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006866 if (!Py_UNICODE_ISALPHA(*p))
6867 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006868 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006869 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006870}
6871
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006872PyDoc_STRVAR(isalnum__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006873 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006874\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006875Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006876and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006877
6878static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006879unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006880{
6881 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6882 register const Py_UNICODE *e;
6883
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006884 /* Shortcut for single character strings */
6885 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006886 Py_UNICODE_ISALNUM(*p))
6887 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006888
6889 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006890 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006891 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006892
6893 e = p + PyUnicode_GET_SIZE(self);
6894 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006895 if (!Py_UNICODE_ISALNUM(*p))
6896 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006897 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006898 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006899}
6900
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006901PyDoc_STRVAR(isdecimal__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006902 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006903\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006904Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006905False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006906
6907static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006908unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006909{
6910 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6911 register const Py_UNICODE *e;
6912
Guido van Rossumd57fd912000-03-10 22:53:23 +00006913 /* Shortcut for single character strings */
6914 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006915 Py_UNICODE_ISDECIMAL(*p))
6916 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006917
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006918 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006919 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006920 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006921
Guido van Rossumd57fd912000-03-10 22:53:23 +00006922 e = p + PyUnicode_GET_SIZE(self);
6923 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006924 if (!Py_UNICODE_ISDECIMAL(*p))
6925 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006926 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006927 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006928}
6929
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006930PyDoc_STRVAR(isdigit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006931 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006932\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006933Return True if all characters in S are digits\n\
6934and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006935
6936static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006937unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006938{
6939 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6940 register const Py_UNICODE *e;
6941
Guido van Rossumd57fd912000-03-10 22:53:23 +00006942 /* Shortcut for single character strings */
6943 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006944 Py_UNICODE_ISDIGIT(*p))
6945 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006946
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006947 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006948 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006949 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006950
Guido van Rossumd57fd912000-03-10 22:53:23 +00006951 e = p + PyUnicode_GET_SIZE(self);
6952 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006953 if (!Py_UNICODE_ISDIGIT(*p))
6954 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006955 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006956 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006957}
6958
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006959PyDoc_STRVAR(isnumeric__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006960 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006961\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006962Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006963False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006964
6965static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006966unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006967{
6968 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6969 register const Py_UNICODE *e;
6970
Guido van Rossumd57fd912000-03-10 22:53:23 +00006971 /* Shortcut for single character strings */
6972 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006973 Py_UNICODE_ISNUMERIC(*p))
6974 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006975
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006976 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006977 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006978 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006979
Guido van Rossumd57fd912000-03-10 22:53:23 +00006980 e = p + PyUnicode_GET_SIZE(self);
6981 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006982 if (!Py_UNICODE_ISNUMERIC(*p))
6983 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006984 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006985 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006986}
6987
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006988PyDoc_STRVAR(join__doc__,
Georg Brandl9b4e5822009-10-14 18:48:32 +00006989 "S.join(iterable) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006990\n\
6991Return a string which is the concatenation of the strings in the\n\
Georg Brandl9b4e5822009-10-14 18:48:32 +00006992iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006993
6994static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006995unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006996{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006997 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006998}
6999
Martin v. Löwis18e16552006-02-15 17:27:45 +00007000static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007001unicode_length(PyUnicodeObject *self)
7002{
7003 return self->length;
7004}
7005
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007006PyDoc_STRVAR(ljust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007007 "S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007008\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007009Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007010done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007011
7012static PyObject *
7013unicode_ljust(PyUnicodeObject *self, PyObject *args)
7014{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007015 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007016 Py_UNICODE fillchar = ' ';
7017
Martin v. Löwis412fb672006-04-13 06:34:32 +00007018 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007019 return NULL;
7020
Tim Peters7a29bd52001-09-12 03:03:31 +00007021 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007022 Py_INCREF(self);
7023 return (PyObject*) self;
7024 }
7025
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007026 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007027}
7028
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007029PyDoc_STRVAR(lower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007030 "S.lower() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007031\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007032Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007033
7034static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007035unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007036{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007037 return fixup(self, fixlower);
7038}
7039
/* Strip modes shared by the strip helpers below.  The values double as
   indices into stripformat[], so they must stay 0, 1, 2 in this order. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
/* Each entry is an argument-parsing format string: one optional object
   argument followed by ':' and the method name. */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: the format string with its leading
   three characters ("|O:") skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
7048
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007049/* externally visible for str.strip(unicode) */
7050PyObject *
7051_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7052{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007053 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7054 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7055 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7056 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7057 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007058
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007059 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007060
Benjamin Peterson857ce152009-01-31 16:29:18 +00007061 i = 0;
7062 if (striptype != RIGHTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007063 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7064 i++;
7065 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00007066 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007067
Benjamin Peterson857ce152009-01-31 16:29:18 +00007068 j = len;
7069 if (striptype != LEFTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007070 do {
7071 j--;
7072 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7073 j++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007074 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007075
Benjamin Peterson857ce152009-01-31 16:29:18 +00007076 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007077 Py_INCREF(self);
7078 return (PyObject*)self;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007079 }
7080 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007081 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007082}
7083
Guido van Rossumd57fd912000-03-10 22:53:23 +00007084
7085static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007086do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007087{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007088 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7089 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007090
Benjamin Peterson857ce152009-01-31 16:29:18 +00007091 i = 0;
7092 if (striptype != RIGHTSTRIP) {
7093 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7094 i++;
7095 }
7096 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007097
Benjamin Peterson857ce152009-01-31 16:29:18 +00007098 j = len;
7099 if (striptype != LEFTSTRIP) {
7100 do {
7101 j--;
7102 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7103 j++;
7104 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007105
Benjamin Peterson857ce152009-01-31 16:29:18 +00007106 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7107 Py_INCREF(self);
7108 return (PyObject*)self;
7109 }
7110 else
7111 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007112}
7113
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007114
7115static PyObject *
7116do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7117{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007118 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007119
Benjamin Peterson857ce152009-01-31 16:29:18 +00007120 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7121 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007122
Benjamin Peterson857ce152009-01-31 16:29:18 +00007123 if (sep != NULL && sep != Py_None) {
7124 if (PyUnicode_Check(sep))
7125 return _PyUnicode_XStrip(self, striptype, sep);
7126 else if (PyString_Check(sep)) {
7127 PyObject *res;
7128 sep = PyUnicode_FromObject(sep);
7129 if (sep==NULL)
7130 return NULL;
7131 res = _PyUnicode_XStrip(self, striptype, sep);
7132 Py_DECREF(sep);
7133 return res;
7134 }
7135 else {
7136 PyErr_Format(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007137 "%s arg must be None, unicode or str",
7138 STRIPNAME(striptype));
Benjamin Peterson857ce152009-01-31 16:29:18 +00007139 return NULL;
7140 }
7141 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007142
Benjamin Peterson857ce152009-01-31 16:29:18 +00007143 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007144}
7145
7146
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007147PyDoc_STRVAR(strip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007148 "S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007149\n\
7150Return a copy of the string S with leading and trailing\n\
7151whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007152If chars is given and not None, remove characters in chars instead.\n\
7153If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007154
7155static PyObject *
7156unicode_strip(PyUnicodeObject *self, PyObject *args)
7157{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007158 if (PyTuple_GET_SIZE(args) == 0)
7159 return do_strip(self, BOTHSTRIP); /* Common case */
7160 else
7161 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007162}
7163
7164
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007165PyDoc_STRVAR(lstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007166 "S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007167\n\
7168Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007169If chars is given and not None, remove characters in chars instead.\n\
7170If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007171
7172static PyObject *
7173unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7174{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007175 if (PyTuple_GET_SIZE(args) == 0)
7176 return do_strip(self, LEFTSTRIP); /* Common case */
7177 else
7178 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007179}
7180
7181
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007182PyDoc_STRVAR(rstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007183 "S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007184\n\
7185Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007186If chars is given and not None, remove characters in chars instead.\n\
7187If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007188
7189static PyObject *
7190unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7191{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007192 if (PyTuple_GET_SIZE(args) == 0)
7193 return do_strip(self, RIGHTSTRIP); /* Common case */
7194 else
7195 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007196}
7197
7198
Guido van Rossumd57fd912000-03-10 22:53:23 +00007199static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007200unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007201{
7202 PyUnicodeObject *u;
7203 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007204 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007205 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007206
7207 if (len < 0)
7208 len = 0;
7209
Tim Peters7a29bd52001-09-12 03:03:31 +00007210 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007211 /* no repeat, return original string */
7212 Py_INCREF(str);
7213 return (PyObject*) str;
7214 }
Tim Peters8f422462000-09-09 06:13:41 +00007215
7216 /* ensure # of chars needed doesn't overflow int and # of bytes
7217 * needed doesn't overflow size_t
7218 */
7219 nchars = len * str->length;
7220 if (len && nchars / len != str->length) {
7221 PyErr_SetString(PyExc_OverflowError,
7222 "repeated string is too long");
7223 return NULL;
7224 }
7225 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7226 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7227 PyErr_SetString(PyExc_OverflowError,
7228 "repeated string is too long");
7229 return NULL;
7230 }
7231 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007232 if (!u)
7233 return NULL;
7234
7235 p = u->str;
7236
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007237 if (str->length == 1 && len > 0) {
7238 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007239 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007240 Py_ssize_t done = 0; /* number of characters copied this far */
7241 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007242 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007243 done = str->length;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007244 }
7245 while (done < nchars) {
Neal Norwitz4677fbf72008-03-25 04:18:18 +00007246 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007247 Py_UNICODE_COPY(p+done, p, n);
7248 done += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007249 }
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007250 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007251
7252 return (PyObject*) u;
7253}
7254
7255PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007256 PyObject *subobj,
7257 PyObject *replobj,
7258 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007259{
7260 PyObject *self;
7261 PyObject *str1;
7262 PyObject *str2;
7263 PyObject *result;
7264
7265 self = PyUnicode_FromObject(obj);
7266 if (self == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007267 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007268 str1 = PyUnicode_FromObject(subobj);
7269 if (str1 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007270 Py_DECREF(self);
7271 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007272 }
7273 str2 = PyUnicode_FromObject(replobj);
7274 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007275 Py_DECREF(self);
7276 Py_DECREF(str1);
7277 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007278 }
Tim Petersced69f82003-09-16 20:30:58 +00007279 result = replace((PyUnicodeObject *)self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007280 (PyUnicodeObject *)str1,
7281 (PyUnicodeObject *)str2,
7282 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007283 Py_DECREF(self);
7284 Py_DECREF(str1);
7285 Py_DECREF(str2);
7286 return result;
7287}
7288
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007289PyDoc_STRVAR(replace__doc__,
Ezio Melotti2f06b782010-06-26 18:44:42 +00007290 "S.replace(old, new[, count]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007291\n\
7292Return a copy of S with all occurrences of substring\n\
Georg Brandl30fadc12008-05-30 07:54:16 +00007293old replaced by new. If the optional argument count is\n\
7294given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007295
7296static PyObject*
7297unicode_replace(PyUnicodeObject *self, PyObject *args)
7298{
7299 PyUnicodeObject *str1;
7300 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007301 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007302 PyObject *result;
7303
Martin v. Löwis18e16552006-02-15 17:27:45 +00007304 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007305 return NULL;
7306 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7307 if (str1 == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007308 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007309 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007310 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007311 Py_DECREF(str1);
7312 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007313 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007314
7315 result = replace(self, str1, str2, maxcount);
7316
7317 Py_DECREF(str1);
7318 Py_DECREF(str2);
7319 return result;
7320}
7321
7322static
7323PyObject *unicode_repr(PyObject *unicode)
7324{
7325 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007326 PyUnicode_GET_SIZE(unicode),
7327 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007328}
7329
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007330PyDoc_STRVAR(rfind__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007331 "S.rfind(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007332\n\
7333Return the highest index in S where substring sub is found,\n\
Senthil Kumaran5e3a19d2011-07-27 23:36:51 +08007334such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007335arguments start and end are interpreted as in slice notation.\n\
7336\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007337Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007338
7339static PyObject *
7340unicode_rfind(PyUnicodeObject *self, PyObject *args)
7341{
Jesus Cea44e81682011-04-20 16:39:15 +02007342 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007343 Py_ssize_t start;
7344 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007345 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007346
Jesus Cea44e81682011-04-20 16:39:15 +02007347 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
7348 &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007349 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007350
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007351 result = stringlib_rfind_slice(
7352 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7353 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7354 start, end
7355 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007356
7357 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007358
7359 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007360}
7361
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007362PyDoc_STRVAR(rindex__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007363 "S.rindex(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007364\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007365Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007366
7367static PyObject *
7368unicode_rindex(PyUnicodeObject *self, PyObject *args)
7369{
Jesus Cea44e81682011-04-20 16:39:15 +02007370 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007371 Py_ssize_t start;
7372 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007373 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007374
Jesus Cea44e81682011-04-20 16:39:15 +02007375 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
7376 &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007377 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007378
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007379 result = stringlib_rfind_slice(
7380 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7381 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7382 start, end
7383 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007384
7385 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007386
Guido van Rossumd57fd912000-03-10 22:53:23 +00007387 if (result < 0) {
7388 PyErr_SetString(PyExc_ValueError, "substring not found");
7389 return NULL;
7390 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007391 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007392}
7393
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007394PyDoc_STRVAR(rjust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007395 "S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007396\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007397Return S right-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007398done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007399
7400static PyObject *
7401unicode_rjust(PyUnicodeObject *self, PyObject *args)
7402{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007403 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007404 Py_UNICODE fillchar = ' ';
7405
Martin v. Löwis412fb672006-04-13 06:34:32 +00007406 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007407 return NULL;
7408
Tim Peters7a29bd52001-09-12 03:03:31 +00007409 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007410 Py_INCREF(self);
7411 return (PyObject*) self;
7412 }
7413
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007414 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007415}
7416
Guido van Rossumd57fd912000-03-10 22:53:23 +00007417static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007418unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007419{
7420 /* standard clamping */
7421 if (start < 0)
7422 start = 0;
7423 if (end < 0)
7424 end = 0;
7425 if (end > self->length)
7426 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007427 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007428 /* full slice, return original string */
7429 Py_INCREF(self);
7430 return (PyObject*) self;
7431 }
7432 if (start > end)
7433 start = end;
7434 /* copy slice */
7435 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007436 end - start);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007437}
7438
7439PyObject *PyUnicode_Split(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007440 PyObject *sep,
7441 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007442{
7443 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007444
Guido van Rossumd57fd912000-03-10 22:53:23 +00007445 s = PyUnicode_FromObject(s);
7446 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007447 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007448 if (sep != NULL) {
7449 sep = PyUnicode_FromObject(sep);
7450 if (sep == NULL) {
7451 Py_DECREF(s);
7452 return NULL;
7453 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007454 }
7455
7456 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7457
7458 Py_DECREF(s);
7459 Py_XDECREF(sep);
7460 return result;
7461}
7462
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007463PyDoc_STRVAR(split__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007464 "S.split([sep [,maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007465\n\
7466Return a list of the words in S, using sep as the\n\
7467delimiter string. If maxsplit is given, at most maxsplit\n\
Georg Brandldfb77db2008-05-11 09:11:40 +00007468splits are done. If sep is not specified or is None, any\n\
Georg Brandlecbbd942008-05-11 20:53:55 +00007469whitespace string is a separator and empty strings are\n\
7470removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007471
7472static PyObject*
7473unicode_split(PyUnicodeObject *self, PyObject *args)
7474{
7475 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007476 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007477
Martin v. Löwis18e16552006-02-15 17:27:45 +00007478 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007479 return NULL;
7480
7481 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007482 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007483 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007484 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007485 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007486 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007487}
7488
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007489PyObject *
7490PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7491{
7492 PyObject* str_obj;
7493 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007494 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00007495
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007496 str_obj = PyUnicode_FromObject(str_in);
7497 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007498 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007499 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00007500 if (!sep_obj) {
7501 Py_DECREF(str_obj);
7502 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007503 }
7504
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007505 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00007506 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7507 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7508 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007509
Fredrik Lundhb9479482006-05-26 17:22:38 +00007510 Py_DECREF(sep_obj);
7511 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007512
7513 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007514}
7515
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007516
7517PyObject *
7518PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7519{
7520 PyObject* str_obj;
7521 PyObject* sep_obj;
7522 PyObject* out;
7523
7524 str_obj = PyUnicode_FromObject(str_in);
7525 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007526 return NULL;
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007527 sep_obj = PyUnicode_FromObject(sep_in);
7528 if (!sep_obj) {
7529 Py_DECREF(str_obj);
7530 return NULL;
7531 }
7532
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007533 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007534 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7535 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7536 );
7537
7538 Py_DECREF(sep_obj);
7539 Py_DECREF(str_obj);
7540
7541 return out;
7542}
7543
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007544PyDoc_STRVAR(partition__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007545 "S.partition(sep) -> (head, sep, tail)\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007546\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007547Search for the separator sep in S, and return the part before it,\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007548the separator itself, and the part after it. If the separator is not\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007549found, return S and two empty strings.");
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007550
7551static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00007552unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007553{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007554 return PyUnicode_Partition((PyObject *)self, separator);
7555}
7556
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007557PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti1fafaab2010-01-25 11:24:37 +00007558 "S.rpartition(sep) -> (head, sep, tail)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007559\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007560Search for the separator sep in S, starting at the end of S, and return\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007561the part before it, the separator itself, and the part after it. If the\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007562separator is not found, return two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007563
7564static PyObject*
7565unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7566{
7567 return PyUnicode_RPartition((PyObject *)self, separator);
7568}
7569
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007570PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007571 PyObject *sep,
7572 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007573{
7574 PyObject *result;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007575
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007576 s = PyUnicode_FromObject(s);
7577 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007578 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007579 if (sep != NULL) {
7580 sep = PyUnicode_FromObject(sep);
7581 if (sep == NULL) {
7582 Py_DECREF(s);
7583 return NULL;
7584 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007585 }
7586
7587 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7588
7589 Py_DECREF(s);
7590 Py_XDECREF(sep);
7591 return result;
7592}
7593
7594PyDoc_STRVAR(rsplit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007595 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007596\n\
7597Return a list of the words in S, using sep as the\n\
7598delimiter string, starting at the end of the string and\n\
7599working to the front. If maxsplit is given, at most maxsplit\n\
7600splits are done. If sep is not specified, any whitespace string\n\
7601is a separator.");
7602
7603static PyObject*
7604unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7605{
7606 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007607 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007608
Martin v. Löwis18e16552006-02-15 17:27:45 +00007609 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007610 return NULL;
7611
7612 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007613 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007614 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007615 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007616 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007617 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007618}
7619
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007620PyDoc_STRVAR(splitlines__doc__,
Raymond Hettingeraad5b022012-06-02 01:42:58 -04007621 "S.splitlines(keepends=False) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007622\n\
7623Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007624Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007625is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007626
7627static PyObject*
7628unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7629{
Guido van Rossum86662912000-04-11 15:38:46 +00007630 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007631
Guido van Rossum86662912000-04-11 15:38:46 +00007632 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007633 return NULL;
7634
Guido van Rossum86662912000-04-11 15:38:46 +00007635 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007636}
7637
7638static
7639PyObject *unicode_str(PyUnicodeObject *self)
7640{
Fred Drakee4315f52000-05-09 19:53:39 +00007641 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007642}
7643
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007644PyDoc_STRVAR(swapcase__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007645 "S.swapcase() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007646\n\
7647Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007648and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007649
7650static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007651unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007652{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007653 return fixup(self, fixswapcase);
7654}
7655
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007656PyDoc_STRVAR(translate__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007657 "S.translate(table) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007658\n\
7659Return a copy of the string S, where all characters have been mapped\n\
7660through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007661Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7662Unmapped characters are left untouched. Characters mapped to None\n\
7663are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007664
7665static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007666unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007667{
Tim Petersced69f82003-09-16 20:30:58 +00007668 return PyUnicode_TranslateCharmap(self->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007669 self->length,
7670 table,
7671 "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007672}
7673
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007674PyDoc_STRVAR(upper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007675 "S.upper() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007676\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007677Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007678
7679static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007680unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007681{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007682 return fixup(self, fixupper);
7683}
7684
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007685PyDoc_STRVAR(zfill__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007686 "S.zfill(width) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007687\n\
Georg Brandl98064072008-09-09 19:26:00 +00007688Pad a numeric string S with zeros on the left, to fill a field\n\
7689of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007690
7691static PyObject *
7692unicode_zfill(PyUnicodeObject *self, PyObject *args)
7693{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007694 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007695 PyUnicodeObject *u;
7696
Martin v. Löwis18e16552006-02-15 17:27:45 +00007697 Py_ssize_t width;
7698 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007699 return NULL;
7700
7701 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007702 if (PyUnicode_CheckExact(self)) {
7703 Py_INCREF(self);
7704 return (PyObject*) self;
7705 }
7706 else
7707 return PyUnicode_FromUnicode(
7708 PyUnicode_AS_UNICODE(self),
7709 PyUnicode_GET_SIZE(self)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007710 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007711 }
7712
7713 fill = width - self->length;
7714
7715 u = pad(self, fill, 0, '0');
7716
Walter Dörwald068325e2002-04-15 13:36:47 +00007717 if (u == NULL)
7718 return NULL;
7719
Guido van Rossumd57fd912000-03-10 22:53:23 +00007720 if (u->str[fill] == '+' || u->str[fill] == '-') {
7721 /* move sign to beginning of string */
7722 u->str[0] = u->str[fill];
7723 u->str[fill] = '0';
7724 }
7725
7726 return (PyObject*) u;
7727}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007728
#if 0
/* Compiled out: would return the value of numfree as a Python int. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyInt_FromLong(count);
}
#endif
7736
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007737PyDoc_STRVAR(startswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007738 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007739\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007740Return True if S starts with the specified prefix, False otherwise.\n\
7741With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007742With optional end, stop comparing S at that position.\n\
7743prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007744
7745static PyObject *
7746unicode_startswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007747 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007748{
Georg Brandl24250812006-06-09 18:45:48 +00007749 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007750 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007751 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007752 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007753 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007754
Jesus Cea44e81682011-04-20 16:39:15 +02007755 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007756 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007757 if (PyTuple_Check(subobj)) {
7758 Py_ssize_t i;
7759 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7760 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007761 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007762 if (substring == NULL)
7763 return NULL;
7764 result = tailmatch(self, substring, start, end, -1);
7765 Py_DECREF(substring);
7766 if (result) {
7767 Py_RETURN_TRUE;
7768 }
7769 }
7770 /* nothing matched */
7771 Py_RETURN_FALSE;
7772 }
7773 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottie3685f62011-04-26 05:12:51 +03007774 if (substring == NULL) {
7775 if (PyErr_ExceptionMatches(PyExc_TypeError))
7776 PyErr_Format(PyExc_TypeError, "startswith first arg must be str, "
7777 "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007778 return NULL;
Ezio Melottie3685f62011-04-26 05:12:51 +03007779 }
Georg Brandl24250812006-06-09 18:45:48 +00007780 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007781 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007782 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007783}
7784
7785
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007786PyDoc_STRVAR(endswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007787 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007788\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007789Return True if S ends with the specified suffix, False otherwise.\n\
7790With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007791With optional end, stop comparing S at that position.\n\
7792suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007793
7794static PyObject *
7795unicode_endswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007796 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007797{
Georg Brandl24250812006-06-09 18:45:48 +00007798 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007799 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007800 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007801 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007802 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007803
Jesus Cea44e81682011-04-20 16:39:15 +02007804 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007805 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007806 if (PyTuple_Check(subobj)) {
7807 Py_ssize_t i;
7808 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7809 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007810 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007811 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007812 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007813 result = tailmatch(self, substring, start, end, +1);
7814 Py_DECREF(substring);
7815 if (result) {
7816 Py_RETURN_TRUE;
7817 }
7818 }
7819 Py_RETURN_FALSE;
7820 }
7821 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottie3685f62011-04-26 05:12:51 +03007822 if (substring == NULL) {
7823 if (PyErr_ExceptionMatches(PyExc_TypeError))
7824 PyErr_Format(PyExc_TypeError, "endswith first arg must be str, "
7825 "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007826 return NULL;
Ezio Melottie3685f62011-04-26 05:12:51 +03007827 }
Georg Brandl24250812006-06-09 18:45:48 +00007828 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007829 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007830 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007831}
7832
7833
Eric Smitha9f7d622008-02-17 19:46:49 +00007834/* Implements do_string_format, which is unicode because of stringlib */
7835#include "stringlib/string_format.h"
7836
7837PyDoc_STRVAR(format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007838 "S.format(*args, **kwargs) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007839\n\
Eric Smith6c840852010-11-06 19:43:44 +00007840Return a formatted version of S, using substitutions from args and kwargs.\n\
7841The substitutions are identified by braces ('{' and '}').");
Eric Smitha9f7d622008-02-17 19:46:49 +00007842
Eric Smithdc13b792008-05-30 18:10:04 +00007843static PyObject *
7844unicode__format__(PyObject *self, PyObject *args)
7845{
7846 PyObject *format_spec;
7847 PyObject *result = NULL;
7848 PyObject *tmp = NULL;
7849
7850 /* If 2.x, convert format_spec to the same type as value */
7851 /* This is to allow things like u''.format('') */
7852 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7853 goto done;
7854 if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7855 PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007856 "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
Eric Smithdc13b792008-05-30 18:10:04 +00007857 goto done;
7858 }
7859 tmp = PyObject_Unicode(format_spec);
7860 if (tmp == NULL)
7861 goto done;
7862 format_spec = tmp;
7863
7864 result = _PyUnicode_FormatAdvanced(self,
7865 PyUnicode_AS_UNICODE(format_spec),
7866 PyUnicode_GET_SIZE(format_spec));
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007867 done:
Eric Smithdc13b792008-05-30 18:10:04 +00007868 Py_XDECREF(tmp);
7869 return result;
7870}
7871
Eric Smitha9f7d622008-02-17 19:46:49 +00007872PyDoc_STRVAR(p_format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007873 "S.__format__(format_spec) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007874\n\
Eric Smith6c840852010-11-06 19:43:44 +00007875Return a formatted version of S as described by format_spec.");
Eric Smitha9f7d622008-02-17 19:46:49 +00007876
Robert Schuppenies901c9972008-06-10 10:10:31 +00007877static PyObject *
7878unicode__sizeof__(PyUnicodeObject *v)
7879{
Robert Schuppenies9be2ec12008-07-10 15:24:04 +00007880 return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7881 sizeof(Py_UNICODE) * (v->length + 1));
Robert Schuppenies901c9972008-06-10 10:10:31 +00007882}
7883
7884PyDoc_STRVAR(sizeof__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007885 "S.__sizeof__() -> size of S in memory, in bytes\n\
Robert Schuppenies901c9972008-06-10 10:10:31 +00007886\n\
7887");
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007888
7889static PyObject *
7890unicode_getnewargs(PyUnicodeObject *v)
7891{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007892 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007893}
7894
7895
Guido van Rossumd57fd912000-03-10 22:53:23 +00007896static PyMethodDef unicode_methods[] = {
Benjamin Peterson332d7212009-09-18 21:14:55 +00007897 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007898 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7899 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007900 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007901 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7902 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7903 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7904 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7905 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7906 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7907 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00007908 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007909 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7910 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7911 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007912 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Benjamin Peterson332d7212009-09-18 21:14:55 +00007913 {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007914/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7915 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7916 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7917 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007918 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007919 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007920 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007921 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007922 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7923 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7924 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7925 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7926 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7927 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7928 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7929 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7930 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7931 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7932 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7933 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7934 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7935 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007936 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smitha9f7d622008-02-17 19:46:49 +00007937 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
7938 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
7939 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
7940 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Robert Schuppenies901c9972008-06-10 10:10:31 +00007941 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007942#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007943 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007944#endif
7945
7946#if 0
7947 /* This one is just used for debugging the implementation. */
Christian Heimes5b970ad2008-02-06 13:33:44 +00007948 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007949#endif
7950
Benjamin Peterson857ce152009-01-31 16:29:18 +00007951 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007952 {NULL, NULL}
7953};
7954
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007955static PyObject *
7956unicode_mod(PyObject *v, PyObject *w)
7957{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007958 if (!PyUnicode_Check(v)) {
7959 Py_INCREF(Py_NotImplemented);
7960 return Py_NotImplemented;
7961 }
7962 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007963}
7964
7965static PyNumberMethods unicode_as_number = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007966 0, /*nb_add*/
7967 0, /*nb_subtract*/
7968 0, /*nb_multiply*/
7969 0, /*nb_divide*/
7970 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007971};
7972
Guido van Rossumd57fd912000-03-10 22:53:23 +00007973static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007974 (lenfunc) unicode_length, /* sq_length */
7975 PyUnicode_Concat, /* sq_concat */
7976 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7977 (ssizeargfunc) unicode_getitem, /* sq_item */
7978 (ssizessizeargfunc) unicode_slice, /* sq_slice */
7979 0, /* sq_ass_item */
7980 0, /* sq_ass_slice */
7981 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007982};
7983
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007984static PyObject*
7985unicode_subscript(PyUnicodeObject* self, PyObject* item)
7986{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00007987 if (PyIndex_Check(item)) {
7988 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007989 if (i == -1 && PyErr_Occurred())
7990 return NULL;
7991 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007992 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007993 return unicode_getitem(self, i);
7994 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007995 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007996 Py_UNICODE* source_buf;
7997 Py_UNICODE* result_buf;
7998 PyObject* result;
7999
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008000 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008001 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008002 return NULL;
8003 }
8004
8005 if (slicelength <= 0) {
8006 return PyUnicode_FromUnicode(NULL, 0);
Thomas Wouters3ccec682007-08-28 15:28:19 +00008007 } else if (start == 0 && step == 1 && slicelength == self->length &&
8008 PyUnicode_CheckExact(self)) {
8009 Py_INCREF(self);
8010 return (PyObject *)self;
8011 } else if (step == 1) {
8012 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008013 } else {
8014 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Neal Norwitz419fd492008-03-17 20:22:43 +00008015 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8016 sizeof(Py_UNICODE));
Benjamin Peterson857ce152009-01-31 16:29:18 +00008017
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008018 if (result_buf == NULL)
8019 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008020
8021 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8022 result_buf[i] = source_buf[cur];
8023 }
Tim Petersced69f82003-09-16 20:30:58 +00008024
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008025 result = PyUnicode_FromUnicode(result_buf, slicelength);
Neal Norwitz419fd492008-03-17 20:22:43 +00008026 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008027 return result;
8028 }
8029 } else {
8030 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8031 return NULL;
8032 }
8033}
8034
8035static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00008036 (lenfunc)unicode_length, /* mp_length */
8037 (binaryfunc)unicode_subscript, /* mp_subscript */
8038 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008039};
8040
Martin v. Löwis18e16552006-02-15 17:27:45 +00008041static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008042unicode_buffer_getreadbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008043 Py_ssize_t index,
8044 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008045{
8046 if (index != 0) {
8047 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008048 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008049 return -1;
8050 }
8051 *ptr = (void *) self->str;
8052 return PyUnicode_GET_DATA_SIZE(self);
8053}
8054
Martin v. Löwis18e16552006-02-15 17:27:45 +00008055static Py_ssize_t
8056unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008057 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008058{
8059 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008060 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008061 return -1;
8062}
8063
8064static int
8065unicode_buffer_getsegcount(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008066 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008067{
8068 if (lenp)
8069 *lenp = PyUnicode_GET_DATA_SIZE(self);
8070 return 1;
8071}
8072
Martin v. Löwiseb079f12006-02-16 14:32:27 +00008073static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008074unicode_buffer_getcharbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008075 Py_ssize_t index,
8076 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008077{
8078 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00008079
Guido van Rossumd57fd912000-03-10 22:53:23 +00008080 if (index != 0) {
8081 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008082 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008083 return -1;
8084 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008085 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008086 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008087 return -1;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008088 *ptr = (void *) PyString_AS_STRING(str);
8089 return PyString_GET_SIZE(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008090}
8091
8092/* Helpers for PyUnicode_Format() */
8093
8094static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008095getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008096{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008097 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008098 if (argidx < arglen) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008099 (*p_argidx)++;
8100 if (arglen < 0)
8101 return args;
8102 else
8103 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008104 }
8105 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008106 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008107 return NULL;
8108}
8109
/* Bit flags passed as `flags` to the format helpers.  Each occupies
   its own bit so they can be OR'ed together; F_ALT selects the '#'
   (alternate) form. */
#define F_LJUST 0x01
#define F_SIGN  0x02
#define F_BLANK 0x04
#define F_ALT   0x08
#define F_ZERO  0x10
Guido van Rossumd57fd912000-03-10 22:53:23 +00008115
Martin v. Löwis18e16552006-02-15 17:27:45 +00008116static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008117strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008118{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008119 register Py_ssize_t i;
8120 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008121 for (i = len - 1; i >= 0; i--)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008122 buffer[i] = (Py_UNICODE) charbuffer[i];
Guido van Rossumd57fd912000-03-10 22:53:23 +00008123
Guido van Rossumd57fd912000-03-10 22:53:23 +00008124 return len;
8125}
8126
Neal Norwitzfc76d632006-01-10 06:03:13 +00008127static int
Neal Norwitzfc76d632006-01-10 06:03:13 +00008128longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8129{
Tim Peters15231542006-02-16 01:08:01 +00008130 Py_ssize_t result;
8131
Neal Norwitzfc76d632006-01-10 06:03:13 +00008132 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008133 result = strtounicode(buffer, (char *)buffer);
8134 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008135}
8136
Guido van Rossum078151d2002-08-11 04:24:12 +00008137/* XXX To save some code duplication, formatfloat/long/int could have been
8138 shared with stringobject.c, converting from 8-bit to Unicode after the
8139 formatting is done. */
8140
Mark Dickinson18cfada2009-11-23 18:46:41 +00008141/* Returns a new reference to a PyUnicode object, or NULL on failure. */
8142
8143static PyObject *
8144formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008145{
Mark Dickinson18cfada2009-11-23 18:46:41 +00008146 char *p;
8147 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008148 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008149
Guido van Rossumd57fd912000-03-10 22:53:23 +00008150 x = PyFloat_AsDouble(v);
8151 if (x == -1.0 && PyErr_Occurred())
Mark Dickinson18cfada2009-11-23 18:46:41 +00008152 return NULL;
8153
Guido van Rossumd57fd912000-03-10 22:53:23 +00008154 if (prec < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008155 prec = 6;
Mark Dickinsond4814bf2009-03-29 16:24:29 +00008156
Mark Dickinson18cfada2009-11-23 18:46:41 +00008157 p = PyOS_double_to_string(x, type, prec,
8158 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
8159 if (p == NULL)
8160 return NULL;
8161 result = PyUnicode_FromStringAndSize(p, strlen(p));
8162 PyMem_Free(p);
8163 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008164}
8165
Tim Peters38fd5b62000-09-21 05:43:11 +00008166static PyObject*
8167formatlong(PyObject *val, int flags, int prec, int type)
8168{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008169 char *buf;
8170 int i, len;
8171 PyObject *str; /* temporary string object. */
8172 PyUnicodeObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008173
Benjamin Peterson857ce152009-01-31 16:29:18 +00008174 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8175 if (!str)
8176 return NULL;
8177 result = _PyUnicode_New(len);
8178 if (!result) {
8179 Py_DECREF(str);
8180 return NULL;
8181 }
8182 for (i = 0; i < len; i++)
8183 result->str[i] = buf[i];
8184 result->str[len] = 0;
8185 Py_DECREF(str);
8186 return (PyObject*)result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008187}
8188
Guido van Rossumd57fd912000-03-10 22:53:23 +00008189static int
8190formatint(Py_UNICODE *buf,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008191 size_t buflen,
8192 int flags,
8193 int prec,
8194 int type,
8195 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008196{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008197 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008198 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8199 * + 1 + 1
8200 * = 24
8201 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008202 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008203 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008204 long x;
8205
8206 x = PyInt_AsLong(v);
8207 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008208 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008209 if (x < 0 && type == 'u') {
8210 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008211 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008212 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8213 sign = "-";
8214 else
8215 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008216 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008217 prec = 1;
8218
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008219 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8220 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008221 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008222 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008223 PyErr_SetString(PyExc_OverflowError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008224 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008225 return -1;
8226 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008227
8228 if ((flags & F_ALT) &&
8229 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00008230 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008231 * of issues that cause pain:
8232 * - when 0 is being converted, the C standard leaves off
8233 * the '0x' or '0X', which is inconsistent with other
8234 * %#x/%#X conversions and inconsistent with Python's
8235 * hex() function
8236 * - there are platforms that violate the standard and
8237 * convert 0 with the '0x' or '0X'
8238 * (Metrowerks, Compaq Tru64)
8239 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008240 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008241 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008242 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008243 * We can achieve the desired consistency by inserting our
8244 * own '0x' or '0X' prefix, and substituting %x/%X in place
8245 * of %#x/%#X.
8246 *
8247 * Note that this is the same approach as used in
8248 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008249 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008250 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8251 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008252 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008253 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008254 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8255 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008256 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008257 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008258 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008259 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008260 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008261 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008262}
8263
8264static int
8265formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008266 size_t buflen,
8267 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008268{
Ezio Melotti32125152010-02-25 17:36:04 +00008269 PyObject *unistr;
8270 char *str;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008271 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008272 if (PyUnicode_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008273 if (PyUnicode_GET_SIZE(v) != 1)
8274 goto onError;
8275 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008276 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008277
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008278 else if (PyString_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008279 if (PyString_GET_SIZE(v) != 1)
8280 goto onError;
Ezio Melotti32125152010-02-25 17:36:04 +00008281 /* #7649: "u'%c' % char" should behave like "u'%s' % char" and fail
8282 with a UnicodeDecodeError if 'char' is not decodable with the
8283 default encoding (usually ASCII, but it might be something else) */
8284 str = PyString_AS_STRING(v);
8285 if ((unsigned char)str[0] > 0x7F) {
8286 /* the char is not ASCII; try to decode the string using the
8287 default encoding and return -1 to let the UnicodeDecodeError
8288 be raised if the string can't be decoded */
8289 unistr = PyUnicode_Decode(str, 1, NULL, "strict");
8290 if (unistr == NULL)
8291 return -1;
8292 buf[0] = PyUnicode_AS_UNICODE(unistr)[0];
8293 Py_DECREF(unistr);
8294 }
8295 else
8296 buf[0] = (Py_UNICODE)str[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008297 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008298
8299 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008300 /* Integer input truncated to a character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008301 long x;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008302 x = PyInt_AsLong(v);
8303 if (x == -1 && PyErr_Occurred())
8304 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008305#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008306 if (x < 0 || x > 0x10ffff) {
8307 PyErr_SetString(PyExc_OverflowError,
8308 "%c arg not in range(0x110000) "
8309 "(wide Python build)");
8310 return -1;
8311 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008312#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008313 if (x < 0 || x > 0xffff) {
8314 PyErr_SetString(PyExc_OverflowError,
8315 "%c arg not in range(0x10000) "
8316 "(narrow Python build)");
8317 return -1;
8318 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008319#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008320 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008321 }
8322 buf[1] = '\0';
8323 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008324
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008325 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008326 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008327 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008328 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008329}
8330
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008331/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8332
Mark Dickinson18cfada2009-11-23 18:46:41 +00008333 FORMATBUFLEN is the length of the buffer in which the ints &
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008334 chars are formatted. XXX This is a magic number. Each formatting
8335 routine does bounds checking to ensure no overflow, but a better
8336 solution may be to malloc a buffer of appropriate size for each
8337 format. For now, the current solution is sufficient.
8338*/
/* Length of the scratch buffer used for formatting ints and chars.
   The whole replacement list is parenthesized so the cast cannot be
   torn apart by an adjacent operator (e.g. `sizeof FORMATBUFLEN`). */
#define FORMATBUFLEN ((size_t)120)
8340
Guido van Rossumd57fd912000-03-10 22:53:23 +00008341PyObject *PyUnicode_Format(PyObject *format,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008342 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008343{
8344 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008345 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008346 int args_owned = 0;
8347 PyUnicodeObject *result = NULL;
8348 PyObject *dict = NULL;
8349 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008350
Guido van Rossumd57fd912000-03-10 22:53:23 +00008351 if (format == NULL || args == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008352 PyErr_BadInternalCall();
8353 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008354 }
8355 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008356 if (uformat == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008357 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008358 fmt = PyUnicode_AS_UNICODE(uformat);
8359 fmtcnt = PyUnicode_GET_SIZE(uformat);
8360
8361 reslen = rescnt = fmtcnt + 100;
8362 result = _PyUnicode_New(reslen);
8363 if (result == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008364 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008365 res = PyUnicode_AS_UNICODE(result);
8366
8367 if (PyTuple_Check(args)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008368 arglen = PyTuple_Size(args);
8369 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008370 }
8371 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008372 arglen = -1;
8373 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008374 }
Benjamin Petersonda2c7eb2013-03-23 22:32:00 -05008375 if (Py_TYPE(args)->tp_as_mapping && Py_TYPE(args)->tp_as_mapping->mp_subscript &&
8376 !PyTuple_Check(args) && !PyObject_TypeCheck(args, &PyBaseString_Type))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008377 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008378
8379 while (--fmtcnt >= 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008380 if (*fmt != '%') {
8381 if (--rescnt < 0) {
8382 rescnt = fmtcnt + 100;
8383 reslen += rescnt;
8384 if (_PyUnicode_Resize(&result, reslen) < 0)
8385 goto onError;
8386 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8387 --rescnt;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008388 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008389 *res++ = *fmt++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008390 }
8391 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008392 /* Got a format specifier */
8393 int flags = 0;
8394 Py_ssize_t width = -1;
8395 int prec = -1;
8396 Py_UNICODE c = '\0';
8397 Py_UNICODE fill;
8398 int isnumok;
8399 PyObject *v = NULL;
8400 PyObject *temp = NULL;
8401 Py_UNICODE *pbuf;
8402 Py_UNICODE sign;
8403 Py_ssize_t len;
Mark Dickinson18cfada2009-11-23 18:46:41 +00008404 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008405
8406 fmt++;
8407 if (*fmt == '(') {
8408 Py_UNICODE *keystart;
8409 Py_ssize_t keylen;
8410 PyObject *key;
8411 int pcount = 1;
8412
8413 if (dict == NULL) {
8414 PyErr_SetString(PyExc_TypeError,
8415 "format requires a mapping");
8416 goto onError;
8417 }
8418 ++fmt;
8419 --fmtcnt;
8420 keystart = fmt;
8421 /* Skip over balanced parentheses */
8422 while (pcount > 0 && --fmtcnt >= 0) {
8423 if (*fmt == ')')
8424 --pcount;
8425 else if (*fmt == '(')
8426 ++pcount;
8427 fmt++;
8428 }
8429 keylen = fmt - keystart - 1;
8430 if (fmtcnt < 0 || pcount > 0) {
8431 PyErr_SetString(PyExc_ValueError,
8432 "incomplete format key");
8433 goto onError;
8434 }
8435#if 0
8436 /* keys are converted to strings using UTF-8 and
8437 then looked up since Python uses strings to hold
8438 variables names etc. in its namespaces and we
8439 wouldn't want to break common idioms. */
8440 key = PyUnicode_EncodeUTF8(keystart,
8441 keylen,
8442 NULL);
8443#else
8444 key = PyUnicode_FromUnicode(keystart, keylen);
8445#endif
8446 if (key == NULL)
8447 goto onError;
8448 if (args_owned) {
8449 Py_DECREF(args);
8450 args_owned = 0;
8451 }
8452 args = PyObject_GetItem(dict, key);
8453 Py_DECREF(key);
8454 if (args == NULL) {
8455 goto onError;
8456 }
8457 args_owned = 1;
8458 arglen = -1;
8459 argidx = -2;
8460 }
8461 while (--fmtcnt >= 0) {
8462 switch (c = *fmt++) {
8463 case '-': flags |= F_LJUST; continue;
8464 case '+': flags |= F_SIGN; continue;
8465 case ' ': flags |= F_BLANK; continue;
8466 case '#': flags |= F_ALT; continue;
8467 case '0': flags |= F_ZERO; continue;
8468 }
8469 break;
8470 }
8471 if (c == '*') {
8472 v = getnextarg(args, arglen, &argidx);
8473 if (v == NULL)
8474 goto onError;
8475 if (!PyInt_Check(v)) {
8476 PyErr_SetString(PyExc_TypeError,
8477 "* wants int");
8478 goto onError;
8479 }
Serhiy Storchaka74f49ab2013-01-19 12:55:39 +02008480 width = PyInt_AsSsize_t(v);
8481 if (width == -1 && PyErr_Occurred())
8482 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008483 if (width < 0) {
8484 flags |= F_LJUST;
8485 width = -width;
8486 }
8487 if (--fmtcnt >= 0)
8488 c = *fmt++;
8489 }
8490 else if (c >= '0' && c <= '9') {
8491 width = c - '0';
8492 while (--fmtcnt >= 0) {
8493 c = *fmt++;
8494 if (c < '0' || c > '9')
8495 break;
Mark Dickinson75d36002012-10-28 10:00:46 +00008496 if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008497 PyErr_SetString(PyExc_ValueError,
8498 "width too big");
8499 goto onError;
8500 }
8501 width = width*10 + (c - '0');
8502 }
8503 }
8504 if (c == '.') {
8505 prec = 0;
8506 if (--fmtcnt >= 0)
8507 c = *fmt++;
8508 if (c == '*') {
8509 v = getnextarg(args, arglen, &argidx);
8510 if (v == NULL)
8511 goto onError;
8512 if (!PyInt_Check(v)) {
8513 PyErr_SetString(PyExc_TypeError,
8514 "* wants int");
8515 goto onError;
8516 }
Serhiy Storchaka74f49ab2013-01-19 12:55:39 +02008517 prec = _PyInt_AsInt(v);
8518 if (prec == -1 && PyErr_Occurred())
8519 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008520 if (prec < 0)
8521 prec = 0;
8522 if (--fmtcnt >= 0)
8523 c = *fmt++;
8524 }
8525 else if (c >= '0' && c <= '9') {
8526 prec = c - '0';
8527 while (--fmtcnt >= 0) {
Stefan Krah0b9201f2010-07-19 18:06:46 +00008528 c = *fmt++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008529 if (c < '0' || c > '9')
8530 break;
Mark Dickinson75d36002012-10-28 10:00:46 +00008531 if (prec > (INT_MAX - ((int)c - '0')) / 10) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008532 PyErr_SetString(PyExc_ValueError,
8533 "prec too big");
8534 goto onError;
8535 }
8536 prec = prec*10 + (c - '0');
8537 }
8538 }
8539 } /* prec */
8540 if (fmtcnt >= 0) {
8541 if (c == 'h' || c == 'l' || c == 'L') {
8542 if (--fmtcnt >= 0)
8543 c = *fmt++;
8544 }
8545 }
8546 if (fmtcnt < 0) {
8547 PyErr_SetString(PyExc_ValueError,
8548 "incomplete format");
8549 goto onError;
8550 }
8551 if (c != '%') {
8552 v = getnextarg(args, arglen, &argidx);
8553 if (v == NULL)
8554 goto onError;
8555 }
8556 sign = 0;
8557 fill = ' ';
8558 switch (c) {
8559
8560 case '%':
8561 pbuf = formatbuf;
8562 /* presume that buffer length is at least 1 */
8563 pbuf[0] = '%';
8564 len = 1;
8565 break;
8566
8567 case 's':
8568 case 'r':
Victor Stinner95affc42010-03-22 12:24:37 +00008569 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008570 temp = v;
8571 Py_INCREF(temp);
8572 }
8573 else {
8574 PyObject *unicode;
8575 if (c == 's')
8576 temp = PyObject_Unicode(v);
8577 else
8578 temp = PyObject_Repr(v);
8579 if (temp == NULL)
8580 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008581 if (PyUnicode_Check(temp))
8582 /* nothing to do */;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008583 else if (PyString_Check(temp)) {
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008584 /* convert to string to Unicode */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008585 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8586 PyString_GET_SIZE(temp),
8587 NULL,
8588 "strict");
8589 Py_DECREF(temp);
8590 temp = unicode;
8591 if (temp == NULL)
8592 goto onError;
8593 }
8594 else {
8595 Py_DECREF(temp);
8596 PyErr_SetString(PyExc_TypeError,
8597 "%s argument has non-string str()");
8598 goto onError;
8599 }
8600 }
8601 pbuf = PyUnicode_AS_UNICODE(temp);
8602 len = PyUnicode_GET_SIZE(temp);
8603 if (prec >= 0 && len > prec)
8604 len = prec;
8605 break;
8606
8607 case 'i':
8608 case 'd':
8609 case 'u':
8610 case 'o':
8611 case 'x':
8612 case 'X':
8613 if (c == 'i')
8614 c = 'd';
8615 isnumok = 0;
8616 if (PyNumber_Check(v)) {
8617 PyObject *iobj=NULL;
8618
8619 if (PyInt_Check(v) || (PyLong_Check(v))) {
8620 iobj = v;
8621 Py_INCREF(iobj);
8622 }
8623 else {
8624 iobj = PyNumber_Int(v);
8625 if (iobj==NULL) iobj = PyNumber_Long(v);
8626 }
8627 if (iobj!=NULL) {
8628 if (PyInt_Check(iobj)) {
8629 isnumok = 1;
8630 pbuf = formatbuf;
8631 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8632 flags, prec, c, iobj);
8633 Py_DECREF(iobj);
8634 if (len < 0)
8635 goto onError;
8636 sign = 1;
8637 }
8638 else if (PyLong_Check(iobj)) {
8639 isnumok = 1;
8640 temp = formatlong(iobj, flags, prec, c);
8641 Py_DECREF(iobj);
8642 if (!temp)
8643 goto onError;
8644 pbuf = PyUnicode_AS_UNICODE(temp);
8645 len = PyUnicode_GET_SIZE(temp);
8646 sign = 1;
8647 }
8648 else {
8649 Py_DECREF(iobj);
8650 }
8651 }
8652 }
8653 if (!isnumok) {
8654 PyErr_Format(PyExc_TypeError,
8655 "%%%c format: a number is required, "
8656 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8657 goto onError;
8658 }
8659 if (flags & F_ZERO)
8660 fill = '0';
8661 break;
8662
8663 case 'e':
8664 case 'E':
8665 case 'f':
8666 case 'F':
8667 case 'g':
8668 case 'G':
Mark Dickinson18cfada2009-11-23 18:46:41 +00008669 temp = formatfloat(v, flags, prec, c);
8670 if (temp == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008671 goto onError;
Mark Dickinson18cfada2009-11-23 18:46:41 +00008672 pbuf = PyUnicode_AS_UNICODE(temp);
8673 len = PyUnicode_GET_SIZE(temp);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008674 sign = 1;
8675 if (flags & F_ZERO)
8676 fill = '0';
8677 break;
8678
8679 case 'c':
8680 pbuf = formatbuf;
8681 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8682 if (len < 0)
8683 goto onError;
8684 break;
8685
8686 default:
8687 PyErr_Format(PyExc_ValueError,
8688 "unsupported format character '%c' (0x%x) "
8689 "at index %zd",
8690 (31<=c && c<=126) ? (char)c : '?',
8691 (int)c,
8692 (Py_ssize_t)(fmt - 1 -
8693 PyUnicode_AS_UNICODE(uformat)));
Benjamin Peterson857ce152009-01-31 16:29:18 +00008694 goto onError;
8695 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008696 if (sign) {
8697 if (*pbuf == '-' || *pbuf == '+') {
8698 sign = *pbuf++;
8699 len--;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008700 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008701 else if (flags & F_SIGN)
8702 sign = '+';
8703 else if (flags & F_BLANK)
8704 sign = ' ';
8705 else
8706 sign = 0;
8707 }
8708 if (width < len)
8709 width = len;
8710 if (rescnt - (sign != 0) < width) {
8711 reslen -= rescnt;
8712 rescnt = width + fmtcnt + 100;
8713 reslen += rescnt;
8714 if (reslen < 0) {
8715 Py_XDECREF(temp);
8716 PyErr_NoMemory();
8717 goto onError;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008718 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008719 if (_PyUnicode_Resize(&result, reslen) < 0) {
8720 Py_XDECREF(temp);
8721 goto onError;
8722 }
8723 res = PyUnicode_AS_UNICODE(result)
8724 + reslen - rescnt;
8725 }
8726 if (sign) {
8727 if (fill != ' ')
8728 *res++ = sign;
8729 rescnt--;
8730 if (width > len)
8731 width--;
8732 }
8733 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8734 assert(pbuf[0] == '0');
8735 assert(pbuf[1] == c);
8736 if (fill != ' ') {
8737 *res++ = *pbuf++;
8738 *res++ = *pbuf++;
8739 }
8740 rescnt -= 2;
8741 width -= 2;
8742 if (width < 0)
8743 width = 0;
8744 len -= 2;
8745 }
8746 if (width > len && !(flags & F_LJUST)) {
8747 do {
8748 --rescnt;
8749 *res++ = fill;
8750 } while (--width > len);
8751 }
8752 if (fill == ' ') {
8753 if (sign)
8754 *res++ = sign;
8755 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8756 assert(pbuf[0] == '0');
8757 assert(pbuf[1] == c);
8758 *res++ = *pbuf++;
8759 *res++ = *pbuf++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008760 }
8761 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008762 Py_UNICODE_COPY(res, pbuf, len);
8763 res += len;
8764 rescnt -= len;
8765 while (--width >= len) {
8766 --rescnt;
8767 *res++ = ' ';
8768 }
8769 if (dict && (argidx < arglen) && c != '%') {
8770 PyErr_SetString(PyExc_TypeError,
8771 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008772 Py_XDECREF(temp);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008773 goto onError;
8774 }
8775 Py_XDECREF(temp);
8776 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008777 } /* until end */
8778 if (argidx < arglen && !dict) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008779 PyErr_SetString(PyExc_TypeError,
8780 "not all arguments converted during string formatting");
8781 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008782 }
8783
Thomas Woutersa96affe2006-03-12 00:29:36 +00008784 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008785 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008786 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008787 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008788 }
8789 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008790 return (PyObject *)result;
8791
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008792 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008793 Py_XDECREF(result);
8794 Py_DECREF(uformat);
8795 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008796 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008797 }
8798 return NULL;
8799}
8800
8801static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008802 (readbufferproc) unicode_buffer_getreadbuf,
8803 (writebufferproc) unicode_buffer_getwritebuf,
8804 (segcountproc) unicode_buffer_getsegcount,
8805 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008806};
8807
Jeremy Hylton938ace62002-07-17 16:30:39 +00008808static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008809unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8810
Tim Peters6d6c1a32001-08-02 04:15:00 +00008811static PyObject *
8812unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8813{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008814 PyObject *x = NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008815 static char *kwlist[] = {"string", "encoding", "errors", 0};
8816 char *encoding = NULL;
8817 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00008818
Benjamin Peterson857ce152009-01-31 16:29:18 +00008819 if (type != &PyUnicode_Type)
8820 return unicode_subtype_new(type, args, kwds);
8821 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008822 kwlist, &x, &encoding, &errors))
Benjamin Peterson857ce152009-01-31 16:29:18 +00008823 return NULL;
8824 if (x == NULL)
8825 return (PyObject *)_PyUnicode_New(0);
8826 if (encoding == NULL && errors == NULL)
8827 return PyObject_Unicode(x);
8828 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008829 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008830}
8831
Guido van Rossume023fe02001-08-30 03:12:59 +00008832static PyObject *
8833unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8834{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008835 PyUnicodeObject *tmp, *pnew;
8836 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008837
Benjamin Peterson857ce152009-01-31 16:29:18 +00008838 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8839 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8840 if (tmp == NULL)
8841 return NULL;
8842 assert(PyUnicode_Check(tmp));
8843 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8844 if (pnew == NULL) {
8845 Py_DECREF(tmp);
8846 return NULL;
8847 }
8848 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8849 if (pnew->str == NULL) {
8850 _Py_ForgetReference((PyObject *)pnew);
8851 PyObject_Del(pnew);
8852 Py_DECREF(tmp);
8853 return PyErr_NoMemory();
8854 }
8855 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8856 pnew->length = n;
8857 pnew->hash = tmp->hash;
8858 Py_DECREF(tmp);
8859 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008860}
8861
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008862PyDoc_STRVAR(unicode_doc,
Chris Jerdonekad4b0002012-10-07 20:37:54 -07008863 "unicode(object='') -> unicode object\n\
8864unicode(string[, encoding[, errors]]) -> unicode object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00008865\n\
8866Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008867encoding defaults to the current default string encoding.\n\
8868errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008869
Guido van Rossumd57fd912000-03-10 22:53:23 +00008870PyTypeObject PyUnicode_Type = {
Martin v. Löwis68192102007-07-21 06:55:02 +00008871 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00008872 "unicode", /* tp_name */
8873 sizeof(PyUnicodeObject), /* tp_size */
8874 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008875 /* Slots */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008876 (destructor)unicode_dealloc, /* tp_dealloc */
8877 0, /* tp_print */
8878 0, /* tp_getattr */
8879 0, /* tp_setattr */
8880 0, /* tp_compare */
8881 unicode_repr, /* tp_repr */
8882 &unicode_as_number, /* tp_as_number */
8883 &unicode_as_sequence, /* tp_as_sequence */
8884 &unicode_as_mapping, /* tp_as_mapping */
8885 (hashfunc) unicode_hash, /* tp_hash*/
8886 0, /* tp_call*/
8887 (reprfunc) unicode_str, /* tp_str */
8888 PyObject_GenericGetAttr, /* tp_getattro */
8889 0, /* tp_setattro */
8890 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008891 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008892 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008893 unicode_doc, /* tp_doc */
8894 0, /* tp_traverse */
8895 0, /* tp_clear */
8896 PyUnicode_RichCompare, /* tp_richcompare */
8897 0, /* tp_weaklistoffset */
8898 0, /* tp_iter */
8899 0, /* tp_iternext */
8900 unicode_methods, /* tp_methods */
8901 0, /* tp_members */
8902 0, /* tp_getset */
8903 &PyBaseString_Type, /* tp_base */
8904 0, /* tp_dict */
8905 0, /* tp_descr_get */
8906 0, /* tp_descr_set */
8907 0, /* tp_dictoffset */
8908 0, /* tp_init */
8909 0, /* tp_alloc */
8910 unicode_new, /* tp_new */
8911 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008912};
8913
8914/* Initialize the Unicode implementation */
8915
Thomas Wouters78890102000-07-22 19:25:51 +00008916void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008917{
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008918 /* XXX - move this array to unicodectype.c ? */
8919 Py_UNICODE linebreak[] = {
8920 0x000A, /* LINE FEED */
8921 0x000D, /* CARRIAGE RETURN */
8922 0x001C, /* FILE SEPARATOR */
8923 0x001D, /* GROUP SEPARATOR */
8924 0x001E, /* RECORD SEPARATOR */
8925 0x0085, /* NEXT LINE */
8926 0x2028, /* LINE SEPARATOR */
8927 0x2029, /* PARAGRAPH SEPARATOR */
8928 };
8929
Fred Drakee4315f52000-05-09 19:53:39 +00008930 /* Init the implementation */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02008931 if (!unicode_empty) {
8932 unicode_empty = _PyUnicode_New(0);
8933 if (!unicode_empty)
8934 return;
8935 }
Neal Norwitze1fdb322006-07-21 05:32:28 +00008936
Guido van Rossumcacfc072002-05-24 19:01:59 +00008937 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008938 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008939
8940 /* initialize the linebreak bloom filter */
8941 bloom_linebreak = make_bloom_mask(
8942 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8943 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00008944
8945 PyType_Ready(&EncodingMapType);
Benjamin Peterson6da3ed62012-10-30 23:21:10 -04008946
8947 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
8948 Py_FatalError("Can't initialize field name iterator type");
8949
8950 if (PyType_Ready(&PyFormatterIter_Type) < 0)
8951 Py_FatalError("Can't initialize formatter iter type");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008952}
8953
8954/* Finalize the Unicode implementation */
8955
Christian Heimes3b718a72008-02-14 12:47:33 +00008956int
8957PyUnicode_ClearFreeList(void)
8958{
8959 int freelist_size = numfree;
8960 PyUnicodeObject *u;
8961
8962 for (u = free_list; u != NULL;) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008963 PyUnicodeObject *v = u;
8964 u = *(PyUnicodeObject **)u;
8965 if (v->str)
8966 PyObject_DEL(v->str);
8967 Py_XDECREF(v->defenc);
8968 PyObject_Del(v);
8969 numfree--;
Christian Heimes3b718a72008-02-14 12:47:33 +00008970 }
8971 free_list = NULL;
8972 assert(numfree == 0);
8973 return freelist_size;
8974}
8975
Guido van Rossumd57fd912000-03-10 22:53:23 +00008976void
Thomas Wouters78890102000-07-22 19:25:51 +00008977_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008978{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008979 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008980
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02008981 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008982
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02008983 for (i = 0; i < 256; i++)
8984 Py_CLEAR(unicode_latin1[i]);
8985
Christian Heimes3b718a72008-02-14 12:47:33 +00008986 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00008987}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008988
Anthony Baxterac6bd462006-04-13 02:06:09 +00008989#ifdef __cplusplus
8990}
8991#endif