blob: 5ce9c88efa28ced6582ab2c6b26c6793ff1c6535 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson1c5d21d2009-01-31 22:33:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Guido van Rossumd57fd912000-03-10 22:53:23 +000052/* Limit for the Unicode object free list */
53
Christian Heimes5b970ad2008-02-06 13:33:44 +000054#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000055
56/* Limit for the Unicode object free list stay alive optimization.
57
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000060 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000061
Christian Heimes5b970ad2008-02-06 13:33:44 +000062 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000063 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000064 malloc()-overhead) bytes of unused garbage.
65
66 Setting the limit to 0 effectively turns the feature off.
67
Guido van Rossumfd4b9572000-04-10 13:51:10 +000068 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71*/
72
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
75/* Endianness switches; defaults to little endian */
76
77#ifdef WORDS_BIGENDIAN
78# define BYTEORDER_IS_BIG_ENDIAN
79#else
80# define BYTEORDER_IS_LITTLE_ENDIAN
81#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
Serhiy Storchakac59c85c2013-01-26 12:13:40 +020085NOTE: In the interpreter's initialization phase, some globals are currently
86 initialized dynamically as needed. In the process Unicode objects may
87 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000088
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Anthony Baxterac6bd462006-04-13 02:06:09 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Guido van Rossumd57fd912000-03-10 22:53:23 +000096/* Free list for Unicode objects */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +020097static PyUnicodeObject *free_list = NULL;
98static int numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000099
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000100/* The empty Unicode object is shared to improve performance. */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200101static PyUnicodeObject *unicode_empty = NULL;
102
/* Return a new reference to the shared empty Unicode object from the
   enclosing function.  The singleton is created on first use; if that
   creation fails unicode_empty stays NULL and NULL is returned. */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        if (unicode_empty == NULL)                      \
            unicode_empty = _PyUnicode_New(0);          \
        if (unicode_empty != NULL)                      \
            Py_INCREF(unicode_empty);                   \
        return (PyObject *)unicode_empty;               \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000114
115/* Single character Unicode strings in the Latin-1 range are being
116 shared as well. */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200117static PyUnicodeObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000118
Fred Drakee4315f52000-05-09 19:53:39 +0000119/* Default encoding to use and assume when NULL is passed as encoding
120 parameter; it is initialized by _PyUnicode_Init().
121
122 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000123 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000124
125*/
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200126static char unicode_default_encoding[100 + 1] = "ascii";
Fred Drakee4315f52000-05-09 19:53:39 +0000127
Christian Heimes4d4f2702008-01-30 11:32:37 +0000128/* Fast detection of the most frequent whitespace characters */
/* Lookup table indexed by code point (0x00 - 0x7F); an entry is 1 when the
   character is treated as whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F: CHARACTER TABULATION (0x09), LINE FEED (0x0A),
       LINE TABULATION (0x0B), FORM FEED (0x0C), CARRIAGE RETURN (0x0D) */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F: FILE SEPARATOR (0x1C), GROUP SEPARATOR (0x1D),
       RECORD SEPARATOR (0x1E), UNIT SEPARATOR (0x1F) */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F: SPACE (0x20) */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x7F: no whitespace characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
158
159/* Same for linebreaks */
/* Lookup table indexed by code point (0x00 - 0x7F); an entry is 1 when the
   character is treated as a line break and 0 otherwise. */
static unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x0F: LINE FEED (0x0A), LINE TABULATION (0x0B),
       FORM FEED (0x0C), CARRIAGE RETURN (0x0D) */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F: FILE SEPARATOR (0x1C), GROUP SEPARATOR (0x1D),
       RECORD SEPARATOR (0x1E) */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x7F: no line break characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
186
187
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000188Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000189PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000190{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000191#ifdef Py_UNICODE_WIDE
Benjamin Peterson857ce152009-01-31 16:29:18 +0000192 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000193#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000194 /* This is actually an illegal character, so it should
195 not be passed to unichr. */
196 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000197#endif
198}
199
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000200/* --- Bloom Filters ----------------------------------------------------- */
201
202/* stuff to implement simple "bloom filters" for Unicode characters.
203 to keep things simple, we use a single bitmask, using the least 5
204 bits from each unicode characters as the bit index. */
205
206/* the linebreak mask is set up by Unicode_Init below */
207
Antoine Pitrou10042922010-01-13 14:01:26 +0000208#if LONG_BIT >= 128
209#define BLOOM_WIDTH 128
210#elif LONG_BIT >= 64
211#define BLOOM_WIDTH 64
212#elif LONG_BIT >= 32
213#define BLOOM_WIDTH 32
214#else
215#error "LONG_BIT is smaller than 32"
216#endif
217
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000218#define BLOOM_MASK unsigned long
219
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200220static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000221
Antoine Pitrou10042922010-01-13 14:01:26 +0000222#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
223#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000224
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000225#define BLOOM_LINEBREAK(ch) \
226 ((ch) < 128U ? ascii_linebreak[(ch)] : \
227 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000228
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000229Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000230{
231 /* calculate simple bloom-style bitmask for a given unicode string */
232
Antoine Pitrou10042922010-01-13 14:01:26 +0000233 BLOOM_MASK mask;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000234 Py_ssize_t i;
235
236 mask = 0;
237 for (i = 0; i < len; i++)
Antoine Pitrou64672132010-01-13 07:55:48 +0000238 BLOOM_ADD(mask, ptr[i]);
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000239
240 return mask;
241}
242
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000243Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000244{
245 Py_ssize_t i;
246
247 for (i = 0; i < setlen; i++)
248 if (set[i] == chr)
249 return 1;
250
Fredrik Lundh77633512006-05-23 19:47:35 +0000251 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000252}
253
/* Membership test for chr in set: the bloom mask rejects most
   non-members cheaply, unicode_member() confirms the rest.  The expansion
   is parenthesized so the macro can be negated or combined with other
   operators safely. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
256
Guido van Rossumd57fd912000-03-10 22:53:23 +0000257/* --- Unicode Object ----------------------------------------------------- */
258
259static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000260int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000261 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262{
263 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000264
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000265 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 if (unicode->length == length)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000267 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000268
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000269 /* Resizing shared object (unicode_empty or single character
270 objects) in-place is not allowed. Use PyUnicode_Resize()
271 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000272
Benjamin Peterson857ce152009-01-31 16:29:18 +0000273 if (unicode == unicode_empty ||
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000274 (unicode->length == 1 &&
275 unicode->str[0] < 256U &&
276 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000278 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279 return -1;
280 }
281
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000282 /* We allocate one more byte to make sure the string is Ux0000 terminated.
283 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000284 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000285 it contains). */
286
Guido van Rossumd57fd912000-03-10 22:53:23 +0000287 oldstr = unicode->str;
Neal Norwitz419fd492008-03-17 20:22:43 +0000288 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000289 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000291 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000292 PyErr_NoMemory();
293 return -1;
294 }
295 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000296 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000298 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000300 if (unicode->defenc) {
Georg Brandld070cc52010-08-01 21:06:46 +0000301 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000302 }
303 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000304
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305 return 0;
306}
307
308/* We allocate one more byte to make sure the string is
Georg Brandle27d0442010-08-01 20:54:30 +0000309 Ux0000 terminated; some code relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000310
311 XXX This allocator could further be enhanced by assuring that the
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000312 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000313
314*/
315
316static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000317PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000318{
319 register PyUnicodeObject *unicode;
320
Andrew Dalkee0df7622006-05-27 11:04:36 +0000321 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000322 if (length == 0 && unicode_empty != NULL) {
323 Py_INCREF(unicode_empty);
324 return unicode_empty;
325 }
326
Neal Norwitze7d8be82008-07-31 17:17:14 +0000327 /* Ensure we won't overflow the size. */
328 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
329 return (PyUnicodeObject *)PyErr_NoMemory();
330 }
331
Guido van Rossumd57fd912000-03-10 22:53:23 +0000332 /* Unicode freelist & memory allocation */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000333 if (free_list) {
334 unicode = free_list;
335 free_list = *(PyUnicodeObject **)unicode;
336 numfree--;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000337 if (unicode->str) {
338 /* Keep-Alive optimization: we only upsize the buffer,
339 never downsize it. */
340 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000341 unicode_resize(unicode, length) < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000342 PyObject_DEL(unicode->str);
343 unicode->str = NULL;
344 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000345 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000346 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000347 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
348 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000349 }
350 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000351 }
352 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000353 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000354 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000355 if (unicode == NULL)
356 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000357 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
358 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000359 }
360
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000361 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000362 PyErr_NoMemory();
363 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000364 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000365 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000366 * the caller fails before initializing str -- unicode_resize()
367 * reads str[0], and the Keep-Alive optimization can keep memory
368 * allocated for str alive across a call to unicode_dealloc(unicode).
369 * We don't want unicode_resize to read uninitialized memory in
370 * that case.
371 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000372 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000373 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000374 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000375 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000376 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000377 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000378
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000379 onError:
Amaury Forgeot d'Arc06847b12008-07-31 23:39:05 +0000380 /* XXX UNREF/NEWREF interface should be more symmetrical */
381 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000382 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000383 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000384 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000385}
386
387static
Guido van Rossum9475a232001-10-05 20:51:39 +0000388void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000390 if (PyUnicode_CheckExact(unicode) &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000391 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000392 /* Keep-Alive optimization */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000393 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
394 PyObject_DEL(unicode->str);
395 unicode->str = NULL;
396 unicode->length = 0;
397 }
398 if (unicode->defenc) {
Georg Brandld070cc52010-08-01 21:06:46 +0000399 Py_CLEAR(unicode->defenc);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000400 }
401 /* Add to free list */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000402 *(PyUnicodeObject **)unicode = free_list;
403 free_list = unicode;
404 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000405 }
406 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000407 PyObject_DEL(unicode->str);
408 Py_XDECREF(unicode->defenc);
409 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000410 }
411}
412
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000413static
414int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000415{
416 register PyUnicodeObject *v;
417
418 /* Argument checks */
419 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000420 PyErr_BadInternalCall();
421 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000422 }
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000423 v = *unicode;
Christian Heimese93237d2007-12-19 02:37:44 +0000424 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000425 PyErr_BadInternalCall();
426 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000427 }
428
429 /* Resizing unicode_empty and single character objects is not
430 possible since these are being shared. We simply return a fresh
431 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000432 if (v->length != length &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000433 (v == unicode_empty || v->length == 1)) {
434 PyUnicodeObject *w = _PyUnicode_New(length);
435 if (w == NULL)
436 return -1;
437 Py_UNICODE_COPY(w->str, v->str,
438 length < v->length ? length : v->length);
439 Py_DECREF(*unicode);
440 *unicode = w;
441 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000442 }
443
444 /* Note that we don't have to modify *unicode for unshared Unicode
445 objects, since we can modify them in-place. */
446 return unicode_resize(v, length);
447}
448
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000449int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
450{
451 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
452}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000453
Guido van Rossumd57fd912000-03-10 22:53:23 +0000454PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000455 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000456{
457 PyUnicodeObject *unicode;
458
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000459 /* If the Unicode data is known at construction time, we can apply
460 some optimizations which share commonly used objects. */
461 if (u != NULL) {
462
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000463 /* Optimization for empty strings */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200464 if (size == 0)
465 _Py_RETURN_UNICODE_EMPTY();
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000466
467 /* Single character Unicode objects in the Latin-1 range are
468 shared when using this constructor */
469 if (size == 1 && *u < 256) {
470 unicode = unicode_latin1[*u];
471 if (!unicode) {
472 unicode = _PyUnicode_New(1);
473 if (!unicode)
474 return NULL;
475 unicode->str[0] = *u;
476 unicode_latin1[*u] = unicode;
477 }
478 Py_INCREF(unicode);
479 return (PyObject *)unicode;
480 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000481 }
Tim Petersced69f82003-09-16 20:30:58 +0000482
Guido van Rossumd57fd912000-03-10 22:53:23 +0000483 unicode = _PyUnicode_New(size);
484 if (!unicode)
485 return NULL;
486
487 /* Copy the Unicode data into the new object */
488 if (u != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000489 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000490
491 return (PyObject *)unicode;
492}
493
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000494PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
495{
496 PyUnicodeObject *unicode;
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000497
Benjamin Peterson857ce152009-01-31 16:29:18 +0000498 if (size < 0) {
499 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000500 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson857ce152009-01-31 16:29:18 +0000501 return NULL;
502 }
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000503
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000504 /* If the Unicode data is known at construction time, we can apply
505 some optimizations which share commonly used objects.
506 Also, this means the input must be UTF-8, so fall back to the
507 UTF-8 decoder at the end. */
508 if (u != NULL) {
509
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000510 /* Optimization for empty strings */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200511 if (size == 0)
512 _Py_RETURN_UNICODE_EMPTY();
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000513
514 /* Single characters are shared when using this constructor.
515 Restrict to ASCII, since the input must be UTF-8. */
516 if (size == 1 && Py_CHARMASK(*u) < 128) {
517 unicode = unicode_latin1[Py_CHARMASK(*u)];
518 if (!unicode) {
519 unicode = _PyUnicode_New(1);
520 if (!unicode)
521 return NULL;
522 unicode->str[0] = Py_CHARMASK(*u);
523 unicode_latin1[Py_CHARMASK(*u)] = unicode;
524 }
525 Py_INCREF(unicode);
526 return (PyObject *)unicode;
527 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000528
529 return PyUnicode_DecodeUTF8(u, size, NULL);
530 }
531
532 unicode = _PyUnicode_New(size);
533 if (!unicode)
534 return NULL;
535
536 return (PyObject *)unicode;
537}
538
539PyObject *PyUnicode_FromString(const char *u)
540{
541 size_t size = strlen(u);
542 if (size > PY_SSIZE_T_MAX) {
543 PyErr_SetString(PyExc_OverflowError, "input too long");
544 return NULL;
545 }
546
547 return PyUnicode_FromStringAndSize(u, size);
548}
549
/* _Py_UNICODE_NEXT is a private macro used to retrieve the character pointed
 * by 'ptr', possibly combining surrogate pairs on narrow builds.
 * 'ptr' and 'end' must be Py_UNICODE*, with 'ptr' pointing at the character
 * that should be returned and 'end' pointing to the end of the buffer.
 * ('end' is used on narrow builds to detect a lone surrogate at the
 * end of the buffer that should be returned unchanged.)
 * The ptr and end arguments should be side-effect free and ptr must be an
 * lvalue.
 * On narrow builds the type of the returned char is Py_UCS4.
 *
 * Note: the macro advances ptr to next char, so it might have side-effects
 * (especially if used with other macros).
 *
 * NOTE(review): on narrow builds (ptr)[1] is read whenever ptr < end, so for
 * the last character before 'end' the element at 'end' itself is inspected;
 * callers presumably rely on it being readable (e.g. the terminating null) --
 * confirm before using the macro on arbitrary sub-buffers.
 */

/* helper macros used by _Py_UNICODE_NEXT; the argument is evaluated twice */
#define _Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define _Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value. */
#define _Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)

#ifdef Py_UNICODE_WIDE
#define _Py_UNICODE_NEXT(ptr, end) (*(ptr)++)
#else
#define _Py_UNICODE_NEXT(ptr, end)                                      \
     (((_Py_UNICODE_IS_HIGH_SURROGATE(*(ptr)) && (ptr) < (end)) &&      \
        _Py_UNICODE_IS_LOW_SURROGATE((ptr)[1])) ?                       \
       ((ptr) += 2,_Py_UNICODE_JOIN_SURROGATES((ptr)[-2], (ptr)[-1])) : \
       (Py_UCS4)*(ptr)++)
#endif
580
Guido van Rossumd57fd912000-03-10 22:53:23 +0000581#ifdef HAVE_WCHAR_H
582
Mark Dickinson6b265f12009-03-18 16:07:26 +0000583#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
584# define CONVERT_WCHAR_TO_SURROGATES
585#endif
586
587#ifdef CONVERT_WCHAR_TO_SURROGATES
588
589/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
590 to convert from UTF32 to UTF16. */
591
592PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
593 Py_ssize_t size)
594{
595 PyUnicodeObject *unicode;
596 register Py_ssize_t i;
597 Py_ssize_t alloc;
598 const wchar_t *orig_w;
599
600 if (w == NULL) {
601 PyErr_BadInternalCall();
602 return NULL;
603 }
604
605 alloc = size;
606 orig_w = w;
607 for (i = size; i > 0; i--) {
608 if (*w > 0xFFFF)
609 alloc++;
610 w++;
611 }
612 w = orig_w;
613 unicode = _PyUnicode_New(alloc);
614 if (!unicode)
615 return NULL;
616
617 /* Copy the wchar_t data into the new object */
618 {
619 register Py_UNICODE *u;
620 u = PyUnicode_AS_UNICODE(unicode);
621 for (i = size; i > 0; i--) {
622 if (*w > 0xFFFF) {
623 wchar_t ordinal = *w++;
624 ordinal -= 0x10000;
625 *u++ = 0xD800 | (ordinal >> 10);
626 *u++ = 0xDC00 | (ordinal & 0x3FF);
627 }
628 else
629 *u++ = *w++;
630 }
631 }
632 return (PyObject *)unicode;
633}
634
635#else
636
Guido van Rossumd57fd912000-03-10 22:53:23 +0000637PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000638 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000639{
640 PyUnicodeObject *unicode;
641
642 if (w == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000643 PyErr_BadInternalCall();
644 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000645 }
646
647 unicode = _PyUnicode_New(size);
648 if (!unicode)
649 return NULL;
650
651 /* Copy the wchar_t data into the new object */
652#ifdef HAVE_USABLE_WCHAR_T
653 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000654#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000655 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000656 register Py_UNICODE *u;
657 register Py_ssize_t i;
658 u = PyUnicode_AS_UNICODE(unicode);
659 for (i = size; i > 0; i--)
660 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000661 }
662#endif
663
664 return (PyObject *)unicode;
665}
666
Mark Dickinson6b265f12009-03-18 16:07:26 +0000667#endif /* CONVERT_WCHAR_TO_SURROGATES */
668
669#undef CONVERT_WCHAR_TO_SURROGATES
670
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000671static void
672makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
673{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000674 *fmt++ = '%';
675 if (width) {
676 if (zeropad)
677 *fmt++ = '0';
678 fmt += sprintf(fmt, "%d", width);
679 }
680 if (precision)
681 fmt += sprintf(fmt, ".%d", precision);
682 if (longflag)
683 *fmt++ = 'l';
684 else if (size_tflag) {
685 char *f = PY_FORMAT_SIZE_T;
686 while (*f)
687 *fmt++ = *f++;
688 }
689 *fmt++ = c;
690 *fmt = '\0';
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000691}
692
/* Append the null-terminated `string` at the output cursor `s`, advancing
   `s` past the copied characters (the terminator is not copied).  Relies on
   the variables `s` and `copy` of the enclosing function. */
#define appendstring(string) \
    {copy = (string); while (*copy) *s++ = *copy++;}
694
695PyObject *
696PyUnicode_FromFormatV(const char *format, va_list vargs)
697{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000698 va_list count;
699 Py_ssize_t callcount = 0;
700 PyObject **callresults = NULL;
701 PyObject **callresult = NULL;
702 Py_ssize_t n = 0;
703 int width = 0;
704 int precision = 0;
705 int zeropad;
706 const char* f;
707 Py_UNICODE *s;
708 PyObject *string;
709 /* used by sprintf */
710 char buffer[21];
711 /* use abuffer instead of buffer, if we need more space
712 * (which can happen if there's a format specifier with width). */
713 char *abuffer = NULL;
714 char *realbuffer;
715 Py_ssize_t abuffersize = 0;
716 char fmt[60]; /* should be enough for %0width.precisionld */
717 const char *copy;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000718
719#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson857ce152009-01-31 16:29:18 +0000720 Py_MEMCPY(count, vargs, sizeof(va_list));
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000721#else
722#ifdef __va_copy
Benjamin Peterson857ce152009-01-31 16:29:18 +0000723 __va_copy(count, vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000724#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000725 count = vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000726#endif
727#endif
Walter Dörwalded960ac2009-05-03 22:36:33 +0000728 /* step 1: count the number of %S/%R/%s format specifications
729 * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
730 * objects once during step 3 and put the result in an array) */
Benjamin Peterson857ce152009-01-31 16:29:18 +0000731 for (f = format; *f; f++) {
Walter Dörwalded960ac2009-05-03 22:36:33 +0000732 if (*f == '%') {
733 if (*(f+1)=='%')
734 continue;
Walter Dörwald342c8db2009-05-03 22:46:07 +0000735 if (*(f+1)=='S' || *(f+1)=='R')
Walter Dörwalded960ac2009-05-03 22:36:33 +0000736 ++callcount;
737 while (isdigit((unsigned)*f))
738 width = (width*10) + *f++ - '0';
739 while (*++f && *f != '%' && !isalpha((unsigned)*f))
740 ;
741 if (*f == 's')
742 ++callcount;
743 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000744 }
745 /* step 2: allocate memory for the results of
Walter Dörwalded960ac2009-05-03 22:36:33 +0000746 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson857ce152009-01-31 16:29:18 +0000747 if (callcount) {
748 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
749 if (!callresults) {
750 PyErr_NoMemory();
751 return NULL;
752 }
753 callresult = callresults;
754 }
755 /* step 3: figure out how large a buffer we need */
756 for (f = format; *f; f++) {
757 if (*f == '%') {
758 const char* p = f;
759 width = 0;
760 while (isdigit((unsigned)*f))
761 width = (width*10) + *f++ - '0';
762 while (*++f && *f != '%' && !isalpha((unsigned)*f))
763 ;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000764
Benjamin Peterson857ce152009-01-31 16:29:18 +0000765 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
766 * they don't affect the amount of space we reserve.
767 */
768 if ((*f == 'l' || *f == 'z') &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000769 (f[1] == 'd' || f[1] == 'u'))
770 ++f;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000771
Benjamin Peterson857ce152009-01-31 16:29:18 +0000772 switch (*f) {
773 case 'c':
Serhiy Storchakaba908c72013-06-23 20:22:09 +0300774 {
775 int ordinal = va_arg(count, int);
776#ifdef Py_UNICODE_WIDE
777 if (ordinal < 0 || ordinal > 0x10ffff) {
778 PyErr_SetString(PyExc_OverflowError,
779 "%c arg not in range(0x110000) "
780 "(wide Python build)");
781 goto fail;
782 }
783#else
784 if (ordinal < 0 || ordinal > 0xffff) {
785 PyErr_SetString(PyExc_OverflowError,
786 "%c arg not in range(0x10000) "
787 "(narrow Python build)");
788 goto fail;
789 }
790#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +0000791 /* fall through... */
Serhiy Storchakaba908c72013-06-23 20:22:09 +0300792 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000793 case '%':
794 n++;
795 break;
796 case 'd': case 'u': case 'i': case 'x':
797 (void) va_arg(count, int);
798 /* 20 bytes is enough to hold a 64-bit
799 integer. Decimal takes the most space.
800 This isn't enough for octal.
801 If a width is specified we need more
802 (which we allocate later). */
803 if (width < 20)
804 width = 20;
805 n += width;
806 if (abuffersize < width)
807 abuffersize = width;
808 break;
809 case 's':
810 {
811 /* UTF-8 */
Georg Brandlba68a992009-05-05 09:19:43 +0000812 const char *s = va_arg(count, const char*);
Walter Dörwalded960ac2009-05-03 22:36:33 +0000813 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
814 if (!str)
815 goto fail;
816 n += PyUnicode_GET_SIZE(str);
817 /* Remember the str and switch to the next slot */
818 *callresult++ = str;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000819 break;
820 }
821 case 'U':
822 {
823 PyObject *obj = va_arg(count, PyObject *);
824 assert(obj && PyUnicode_Check(obj));
825 n += PyUnicode_GET_SIZE(obj);
826 break;
827 }
828 case 'V':
829 {
830 PyObject *obj = va_arg(count, PyObject *);
831 const char *str = va_arg(count, const char *);
832 assert(obj || str);
833 assert(!obj || PyUnicode_Check(obj));
834 if (obj)
835 n += PyUnicode_GET_SIZE(obj);
836 else
837 n += strlen(str);
838 break;
839 }
840 case 'S':
841 {
842 PyObject *obj = va_arg(count, PyObject *);
843 PyObject *str;
844 assert(obj);
845 str = PyObject_Str(obj);
846 if (!str)
847 goto fail;
848 n += PyUnicode_GET_SIZE(str);
849 /* Remember the str and switch to the next slot */
850 *callresult++ = str;
851 break;
852 }
853 case 'R':
854 {
855 PyObject *obj = va_arg(count, PyObject *);
856 PyObject *repr;
857 assert(obj);
858 repr = PyObject_Repr(obj);
859 if (!repr)
860 goto fail;
861 n += PyUnicode_GET_SIZE(repr);
862 /* Remember the repr and switch to the next slot */
863 *callresult++ = repr;
864 break;
865 }
866 case 'p':
867 (void) va_arg(count, int);
868 /* maximum 64-bit pointer representation:
869 * 0xffffffffffffffff
870 * so 19 characters is enough.
871 * XXX I count 18 -- what's the extra for?
872 */
873 n += 19;
874 break;
875 default:
876 /* if we stumble upon an unknown
877 formatting code, copy the rest of
878 the format string to the output
879 string. (we cannot just skip the
880 code, since there's no way to know
881 what's in the argument list) */
882 n += strlen(p);
883 goto expand;
884 }
885 } else
886 n++;
887 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000888 expand:
Benjamin Peterson857ce152009-01-31 16:29:18 +0000889 if (abuffersize > 20) {
890 abuffer = PyObject_Malloc(abuffersize);
891 if (!abuffer) {
892 PyErr_NoMemory();
893 goto fail;
894 }
895 realbuffer = abuffer;
896 }
897 else
898 realbuffer = buffer;
899 /* step 4: fill the buffer */
900 /* Since we've analyzed how much space we need for the worst case,
901 we don't have to resize the string.
902 There can be no errors beyond this point. */
903 string = PyUnicode_FromUnicode(NULL, n);
904 if (!string)
905 goto fail;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000906
Benjamin Peterson857ce152009-01-31 16:29:18 +0000907 s = PyUnicode_AS_UNICODE(string);
908 callresult = callresults;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000909
Benjamin Peterson857ce152009-01-31 16:29:18 +0000910 for (f = format; *f; f++) {
911 if (*f == '%') {
912 const char* p = f++;
913 int longflag = 0;
914 int size_tflag = 0;
915 zeropad = (*f == '0');
916 /* parse the width.precision part */
917 width = 0;
918 while (isdigit((unsigned)*f))
919 width = (width*10) + *f++ - '0';
920 precision = 0;
921 if (*f == '.') {
922 f++;
923 while (isdigit((unsigned)*f))
924 precision = (precision*10) + *f++ - '0';
925 }
926 /* handle the long flag, but only for %ld and %lu.
927 others can be added when necessary. */
928 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
929 longflag = 1;
930 ++f;
931 }
932 /* handle the size_t flag. */
933 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
934 size_tflag = 1;
935 ++f;
936 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000937
Benjamin Peterson857ce152009-01-31 16:29:18 +0000938 switch (*f) {
939 case 'c':
940 *s++ = va_arg(vargs, int);
941 break;
942 case 'd':
943 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
944 if (longflag)
945 sprintf(realbuffer, fmt, va_arg(vargs, long));
946 else if (size_tflag)
947 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
948 else
949 sprintf(realbuffer, fmt, va_arg(vargs, int));
950 appendstring(realbuffer);
951 break;
952 case 'u':
953 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
954 if (longflag)
955 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
956 else if (size_tflag)
957 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
958 else
959 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
960 appendstring(realbuffer);
961 break;
962 case 'i':
963 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
964 sprintf(realbuffer, fmt, va_arg(vargs, int));
965 appendstring(realbuffer);
966 break;
967 case 'x':
968 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
969 sprintf(realbuffer, fmt, va_arg(vargs, int));
970 appendstring(realbuffer);
971 break;
972 case 's':
973 {
Walter Dörwalded960ac2009-05-03 22:36:33 +0000974 /* unused, since we already have the result */
975 (void) va_arg(vargs, char *);
976 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
977 PyUnicode_GET_SIZE(*callresult));
978 s += PyUnicode_GET_SIZE(*callresult);
979 /* We're done with the unicode()/repr() => forget it */
980 Py_DECREF(*callresult);
981 /* switch to next unicode()/repr() result */
982 ++callresult;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000983 break;
984 }
985 case 'U':
986 {
987 PyObject *obj = va_arg(vargs, PyObject *);
988 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
989 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
990 s += size;
991 break;
992 }
993 case 'V':
994 {
995 PyObject *obj = va_arg(vargs, PyObject *);
996 const char *str = va_arg(vargs, const char *);
997 if (obj) {
998 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
999 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1000 s += size;
1001 } else {
1002 appendstring(str);
1003 }
1004 break;
1005 }
1006 case 'S':
1007 case 'R':
1008 {
1009 Py_UNICODE *ucopy;
1010 Py_ssize_t usize;
1011 Py_ssize_t upos;
1012 /* unused, since we already have the result */
1013 (void) va_arg(vargs, PyObject *);
1014 ucopy = PyUnicode_AS_UNICODE(*callresult);
1015 usize = PyUnicode_GET_SIZE(*callresult);
1016 for (upos = 0; upos<usize;)
1017 *s++ = ucopy[upos++];
1018 /* We're done with the unicode()/repr() => forget it */
1019 Py_DECREF(*callresult);
1020 /* switch to next unicode()/repr() result */
1021 ++callresult;
1022 break;
1023 }
1024 case 'p':
1025 sprintf(buffer, "%p", va_arg(vargs, void*));
1026 /* %p is ill-defined: ensure leading 0x. */
1027 if (buffer[1] == 'X')
1028 buffer[1] = 'x';
1029 else if (buffer[1] != 'x') {
1030 memmove(buffer+2, buffer, strlen(buffer)+1);
1031 buffer[0] = '0';
1032 buffer[1] = 'x';
1033 }
1034 appendstring(buffer);
1035 break;
1036 case '%':
1037 *s++ = '%';
1038 break;
1039 default:
1040 appendstring(p);
1041 goto end;
1042 }
1043 } else
1044 *s++ = *f;
1045 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001046
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001047 end:
Benjamin Peterson857ce152009-01-31 16:29:18 +00001048 if (callresults)
1049 PyObject_Free(callresults);
1050 if (abuffer)
1051 PyObject_Free(abuffer);
1052 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1053 return string;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001054 fail:
Benjamin Peterson857ce152009-01-31 16:29:18 +00001055 if (callresults) {
1056 PyObject **callresult2 = callresults;
1057 while (callresult2 < callresult) {
1058 Py_DECREF(*callresult2);
1059 ++callresult2;
1060 }
1061 PyObject_Free(callresults);
1062 }
1063 if (abuffer)
1064 PyObject_Free(abuffer);
1065 return NULL;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001066}
1067
1068#undef appendstring
1069
1070PyObject *
1071PyUnicode_FromFormat(const char *format, ...)
1072{
Benjamin Peterson857ce152009-01-31 16:29:18 +00001073 PyObject* ret;
1074 va_list vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001075
1076#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson857ce152009-01-31 16:29:18 +00001077 va_start(vargs, format);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001078#else
Benjamin Peterson857ce152009-01-31 16:29:18 +00001079 va_start(vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001080#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00001081 ret = PyUnicode_FromFormatV(format, vargs);
1082 va_end(vargs);
1083 return ret;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001084}
1085
Martin v. Löwis18e16552006-02-15 17:27:45 +00001086Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001087 wchar_t *w,
1088 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001089{
1090 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001091 PyErr_BadInternalCall();
1092 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001093 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001094
1095 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001096 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001097 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001098
Guido van Rossumd57fd912000-03-10 22:53:23 +00001099#ifdef HAVE_USABLE_WCHAR_T
1100 memcpy(w, unicode->str, size * sizeof(wchar_t));
1101#else
1102 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001103 register Py_UNICODE *u;
1104 register Py_ssize_t i;
1105 u = PyUnicode_AS_UNICODE(unicode);
1106 for (i = size; i > 0; i--)
1107 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001108 }
1109#endif
1110
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001111 if (size > PyUnicode_GET_SIZE(unicode))
1112 return PyUnicode_GET_SIZE(unicode);
1113 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001114 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001115}
1116
1117#endif
1118
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001119PyObject *PyUnicode_FromOrdinal(int ordinal)
1120{
Hye-Shik Chang40574832004-04-06 07:24:51 +00001121 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001122
1123#ifdef Py_UNICODE_WIDE
1124 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001125 PyErr_SetString(PyExc_ValueError,
1126 "unichr() arg not in range(0x110000) "
1127 "(wide Python build)");
1128 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001129 }
1130#else
1131 if (ordinal < 0 || ordinal > 0xffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001132 PyErr_SetString(PyExc_ValueError,
1133 "unichr() arg not in range(0x10000) "
1134 "(narrow Python build)");
1135 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001136 }
1137#endif
1138
Hye-Shik Chang40574832004-04-06 07:24:51 +00001139 s[0] = (Py_UNICODE)ordinal;
1140 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001141}
1142
Guido van Rossumd57fd912000-03-10 22:53:23 +00001143PyObject *PyUnicode_FromObject(register PyObject *obj)
1144{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001145 /* XXX Perhaps we should make this API an alias of
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001146 PyObject_Unicode() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001147 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001148 Py_INCREF(obj);
1149 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001150 }
1151 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001152 /* For a Unicode subtype that's not a Unicode object,
1153 return a true Unicode object with the same data. */
1154 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1155 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001156 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001157 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1158}
1159
1160PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001161 const char *encoding,
1162 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001163{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001164 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001165 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001166 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001167
Guido van Rossumd57fd912000-03-10 22:53:23 +00001168 if (obj == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001169 PyErr_BadInternalCall();
1170 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001171 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001172
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001173#if 0
1174 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001175 that no encodings is given and then redirect to
1176 PyObject_Unicode() which then applies the additional logic for
1177 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001178
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001179 NOTE: This API should really only be used for object which
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001180 represent *encoded* Unicode !
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001181
1182 */
Benjamin Peterson857ce152009-01-31 16:29:18 +00001183 if (PyUnicode_Check(obj)) {
1184 if (encoding) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001185 PyErr_SetString(PyExc_TypeError,
1186 "decoding Unicode is not supported");
1187 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001188 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001189 return PyObject_Unicode(obj);
1190 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001191#else
1192 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001193 PyErr_SetString(PyExc_TypeError,
1194 "decoding Unicode is not supported");
1195 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001196 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001197#endif
1198
1199 /* Coerce object */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001200 if (PyString_Check(obj)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001201 s = PyString_AS_STRING(obj);
1202 len = PyString_GET_SIZE(obj);
Christian Heimes1a6387e2008-03-26 12:49:49 +00001203 }
Christian Heimes3497f942008-05-26 12:29:14 +00001204 else if (PyByteArray_Check(obj)) {
Christian Heimes1a6387e2008-03-26 12:49:49 +00001205 /* Python 2.x specific */
1206 PyErr_Format(PyExc_TypeError,
1207 "decoding bytearray is not supported");
1208 return NULL;
1209 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001210 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001211 /* Overwrite the error message with something more useful in
1212 case of a TypeError. */
1213 if (PyErr_ExceptionMatches(PyExc_TypeError))
1214 PyErr_Format(PyExc_TypeError,
1215 "coercing to Unicode: need string or buffer, "
1216 "%.80s found",
1217 Py_TYPE(obj)->tp_name);
1218 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001219 }
Tim Petersced69f82003-09-16 20:30:58 +00001220
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001221 /* Convert to Unicode */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02001222 if (len == 0)
1223 _Py_RETURN_UNICODE_EMPTY();
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001224
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02001225 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001226 return v;
1227
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001228 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001229 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001230}
1231
1232PyObject *PyUnicode_Decode(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001233 Py_ssize_t size,
1234 const char *encoding,
1235 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001236{
1237 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001238
1239 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001240 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001241
1242 /* Shortcuts for common default encodings */
1243 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001244 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001245 else if (strcmp(encoding, "latin-1") == 0)
1246 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001247#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1248 else if (strcmp(encoding, "mbcs") == 0)
1249 return PyUnicode_DecodeMBCS(s, size, errors);
1250#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001251 else if (strcmp(encoding, "ascii") == 0)
1252 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001253
1254 /* Decode via the codec registry */
1255 buffer = PyBuffer_FromMemory((void *)s, size);
1256 if (buffer == NULL)
1257 goto onError;
1258 unicode = PyCodec_Decode(buffer, encoding, errors);
1259 if (unicode == NULL)
1260 goto onError;
1261 if (!PyUnicode_Check(unicode)) {
1262 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001263 "decoder did not return an unicode object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001264 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001265 Py_DECREF(unicode);
1266 goto onError;
1267 }
1268 Py_DECREF(buffer);
1269 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001270
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001271 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001272 Py_XDECREF(buffer);
1273 return NULL;
1274}
1275
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001276PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1277 const char *encoding,
1278 const char *errors)
1279{
1280 PyObject *v;
1281
1282 if (!PyUnicode_Check(unicode)) {
1283 PyErr_BadArgument();
1284 goto onError;
1285 }
1286
1287 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001288 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001289
1290 /* Decode via the codec registry */
1291 v = PyCodec_Decode(unicode, encoding, errors);
1292 if (v == NULL)
1293 goto onError;
1294 return v;
1295
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001296 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001297 return NULL;
1298}
1299
Guido van Rossumd57fd912000-03-10 22:53:23 +00001300PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001301 Py_ssize_t size,
1302 const char *encoding,
1303 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001304{
1305 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001306
Guido van Rossumd57fd912000-03-10 22:53:23 +00001307 unicode = PyUnicode_FromUnicode(s, size);
1308 if (unicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001309 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001310 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1311 Py_DECREF(unicode);
1312 return v;
1313}
1314
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001315PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1316 const char *encoding,
1317 const char *errors)
1318{
1319 PyObject *v;
1320
1321 if (!PyUnicode_Check(unicode)) {
1322 PyErr_BadArgument();
1323 goto onError;
1324 }
1325
1326 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001327 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001328
1329 /* Encode via the codec registry */
1330 v = PyCodec_Encode(unicode, encoding, errors);
1331 if (v == NULL)
1332 goto onError;
1333 return v;
1334
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001335 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001336 return NULL;
1337}
1338
Guido van Rossumd57fd912000-03-10 22:53:23 +00001339PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1340 const char *encoding,
1341 const char *errors)
1342{
1343 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001344
Guido van Rossumd57fd912000-03-10 22:53:23 +00001345 if (!PyUnicode_Check(unicode)) {
1346 PyErr_BadArgument();
1347 goto onError;
1348 }
Fred Drakee4315f52000-05-09 19:53:39 +00001349
Tim Petersced69f82003-09-16 20:30:58 +00001350 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001351 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001352
1353 /* Shortcuts for common default encodings */
1354 if (errors == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001355 if (strcmp(encoding, "utf-8") == 0)
1356 return PyUnicode_AsUTF8String(unicode);
1357 else if (strcmp(encoding, "latin-1") == 0)
1358 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001359#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001360 else if (strcmp(encoding, "mbcs") == 0)
1361 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001362#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001363 else if (strcmp(encoding, "ascii") == 0)
1364 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001365 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001366
1367 /* Encode via the codec registry */
1368 v = PyCodec_Encode(unicode, encoding, errors);
1369 if (v == NULL)
1370 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001371 if (!PyString_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001372 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001373 "encoder did not return a string object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001374 Py_TYPE(v)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001375 Py_DECREF(v);
1376 goto onError;
1377 }
1378 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001379
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001380 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001381 return NULL;
1382}
1383
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001384PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001385 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001386{
1387 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1388
1389 if (v)
1390 return v;
1391 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1392 if (v && errors == NULL)
1393 ((PyUnicodeObject *)unicode)->defenc = v;
1394 return v;
1395}
1396
Guido van Rossumd57fd912000-03-10 22:53:23 +00001397Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1398{
1399 if (!PyUnicode_Check(unicode)) {
1400 PyErr_BadArgument();
1401 goto onError;
1402 }
1403 return PyUnicode_AS_UNICODE(unicode);
1404
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001405 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001406 return NULL;
1407}
1408
Martin v. Löwis18e16552006-02-15 17:27:45 +00001409Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001410{
1411 if (!PyUnicode_Check(unicode)) {
1412 PyErr_BadArgument();
1413 goto onError;
1414 }
1415 return PyUnicode_GET_SIZE(unicode);
1416
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001417 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001418 return -1;
1419}
1420
Thomas Wouters78890102000-07-22 19:25:51 +00001421const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001422{
1423 return unicode_default_encoding;
1424}
1425
1426int PyUnicode_SetDefaultEncoding(const char *encoding)
1427{
1428 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001429
Fred Drakee4315f52000-05-09 19:53:39 +00001430 /* Make sure the encoding is valid. As side effect, this also
1431 loads the encoding into the codec registry cache. */
1432 v = _PyCodec_Lookup(encoding);
1433 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001434 goto onError;
Fred Drakee4315f52000-05-09 19:53:39 +00001435 Py_DECREF(v);
1436 strncpy(unicode_default_encoding,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001437 encoding,
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02001438 sizeof(unicode_default_encoding) - 1);
Fred Drakee4315f52000-05-09 19:53:39 +00001439 return 0;
1440
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001441 onError:
Fred Drakee4315f52000-05-09 19:53:39 +00001442 return -1;
1443}
1444
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001445/* error handling callback helper:
1446 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001447 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001448 and adjust various state variables.
1449 return 0 on success, -1 on error
1450*/
1451
1452static
1453int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001454 const char *encoding, const char *reason,
1455 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1456 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1457 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001458{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001459 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001460
1461 PyObject *restuple = NULL;
1462 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001463 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1464 Py_ssize_t requiredsize;
1465 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001466 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001467 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001468 int res = -1;
1469
1470 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001471 *errorHandler = PyCodec_LookupError(errors);
1472 if (*errorHandler == NULL)
1473 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001474 }
1475
1476 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001477 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001478 encoding, input, insize, *startinpos, *endinpos, reason);
1479 if (*exceptionObject == NULL)
1480 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001481 }
1482 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001483 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1484 goto onError;
1485 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1486 goto onError;
1487 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1488 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001489 }
1490
1491 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1492 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001493 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001494 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00001495 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001496 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001497 }
1498 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001499 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001500 if (newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001501 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001502 if (newpos<0 || newpos>insize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001503 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1504 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001505 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001506
1507 /* need more space? (at least enough for what we
1508 have+the replacement+the rest of the string (starting
1509 at the new input position), so we won't have to check space
1510 when there are no errors in the rest of the string) */
1511 repptr = PyUnicode_AS_UNICODE(repunicode);
1512 repsize = PyUnicode_GET_SIZE(repunicode);
1513 requiredsize = *outpos + repsize + insize-newpos;
1514 if (requiredsize > outsize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001515 if (requiredsize<2*outsize)
1516 requiredsize = 2*outsize;
1517 if (_PyUnicode_Resize(output, requiredsize) < 0)
1518 goto onError;
1519 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001520 }
1521 *endinpos = newpos;
1522 *inptr = input + newpos;
1523 Py_UNICODE_COPY(*outptr, repptr, repsize);
1524 *outptr += repsize;
1525 *outpos += repsize;
1526 /* we made it! */
1527 res = 0;
1528
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001529 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001530 Py_XDECREF(restuple);
1531 return res;
1532}
1533
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001534/* --- UTF-7 Codec -------------------------------------------------------- */
1535
Antoine Pitrou653dece2009-05-04 18:32:32 +00001536/* See RFC2152 for details. We encode conservatively and decode liberally. */
1537
1538/* Three simple macros defining base-64. */
1539
/* Is c a base-64 character?
 *
 * The test is spelled out as explicit ASCII ranges instead of calling
 * isalnum(): isalnum() is locale-dependent, so under some locales it
 * accepts bytes >= 0x80, which FROM_BASE64 would then silently map to 63.
 * The UTF-7 base-64 alphabet is pure ASCII (A-Z, a-z, 0-9, '+', '/').
 * Note that c is evaluated more than once. */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')
1544
/* Base-64 value (0..63) of the base-64 character c.
 *
 * The caller is expected to have checked c with IS_BASE64 first: any
 * character that is not a letter, a digit or '+' falls through to 63,
 * which is the value of '/'.  c is evaluated more than once. */

#define FROM_BASE64(c) \
    ((c) == '+' ? 62 : \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 : \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 : \
     ('A' <= (c) && (c) <= 'Z') ? (c) - 'A' : \
     63)
1552
/* Base-64 alphabet character for the low 6 bits of n; any higher bits
 * of n are masked off before indexing. */

#define TO_BASE64(n) \
    (("ABCDEFGHIJKLMNOPQRSTUVWXYZ" \
      "abcdefghijklmnopqrstuvwxyz" \
      "0123456789+/")[(n) & 0x3f])
1557
/* DECODE_DIRECT: true when a byte found in a UTF-7 string outside a
 * base-64 section stands for itself.  Decoding is permissive: every
 * value up to 127 qualifies except '+', which opens a base-64
 * section. */

#define DECODE_DIRECT(c) \
    (!((c) == '+' || (c) > 127))
1565
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is only ever read,
 * hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        - character code; anything outside 1..127 is never direct.
 * directO  - nonzero if Set O characters may be written as themselves.
 * directWS - nonzero if whitespace may be written as itself.
 *
 * The flag arguments are parenthesized in the expansion so that compound
 * expressions (e.g. `a || b`) keep their meaning next to the `&&`.
 * c is evaluated more than once. */

#define ENCODE_DIRECT(c, directO, directWS) \
    ((c) < 128 && (c) > 0 && \
     ((utf7_category[(c)] == 0) || \
      ((directWS) && (utf7_category[(c)] == 2)) || \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001611
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001612PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001613 Py_ssize_t size,
1614 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001615{
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001616 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1617}
1618
Antoine Pitrou653dece2009-05-04 18:32:32 +00001619/* The decoder. The only state we preserve is our read position,
1620 * i.e. how many characters we have consumed. So if we end in the
1621 * middle of a shift sequence we have to back off the read position
1622 * and the output to the beginning of the sequence, otherwise we lose
1623 * all the shift state (seen bits, number of bits seen, high
1624 * surrogate). */
1625
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001626PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001627 Py_ssize_t size,
1628 const char *errors,
1629 Py_ssize_t *consumed)
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001630{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001631 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001632 Py_ssize_t startinpos;
1633 Py_ssize_t endinpos;
1634 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001635 const char *e;
1636 PyUnicodeObject *unicode;
1637 Py_UNICODE *p;
1638 const char *errmsg = "";
1639 int inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001640 Py_UNICODE *shiftOutStart;
1641 unsigned int base64bits = 0;
1642 unsigned long base64buffer = 0;
1643 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001644 PyObject *errorHandler = NULL;
1645 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001646
1647 unicode = _PyUnicode_New(size);
1648 if (!unicode)
1649 return NULL;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001650 if (size == 0) {
1651 if (consumed)
1652 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001653 return (PyObject *)unicode;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001654 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001655
1656 p = unicode->str;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001657 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001658 e = s + size;
1659
1660 while (s < e) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001661 Py_UNICODE ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001662
Antoine Pitrou653dece2009-05-04 18:32:32 +00001663 if (inShift) { /* in a base-64 section */
1664 if (IS_BASE64(ch)) { /* consume a base-64 character */
1665 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1666 base64bits += 6;
1667 s++;
1668 if (base64bits >= 16) {
1669 /* we have enough bits for a UTF-16 value */
1670 Py_UNICODE outCh = (Py_UNICODE)
1671 (base64buffer >> (base64bits-16));
1672 base64bits -= 16;
1673 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchakaf1056722013-10-19 20:37:49 +03001674 assert(outCh <= 0xffff);
Antoine Pitrou653dece2009-05-04 18:32:32 +00001675 if (surrogate) {
1676 /* expecting a second surrogate */
1677 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1678#ifdef Py_UNICODE_WIDE
1679 *p++ = (((surrogate & 0x3FF)<<10)
1680 | (outCh & 0x3FF)) + 0x10000;
1681#else
1682 *p++ = surrogate;
1683 *p++ = outCh;
1684#endif
1685 surrogate = 0;
Antoine Pitrou30402542011-11-15 01:49:40 +01001686 continue;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001687 }
1688 else {
Antoine Pitrou30402542011-11-15 01:49:40 +01001689 *p++ = surrogate;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001690 surrogate = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001691 }
1692 }
Antoine Pitrou30402542011-11-15 01:49:40 +01001693 if (outCh >= 0xD800 && outCh <= 0xDBFF) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001694 /* first surrogate */
1695 surrogate = outCh;
1696 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001697 else {
1698 *p++ = outCh;
1699 }
1700 }
1701 }
1702 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001703 inShift = 0;
1704 s++;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001705 if (surrogate) {
Antoine Pitrou30402542011-11-15 01:49:40 +01001706 *p++ = surrogate;
1707 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001708 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001709 if (base64bits > 0) { /* left-over bits */
1710 if (base64bits >= 6) {
1711 /* We've seen at least one base-64 character */
1712 errmsg = "partial character in shift sequence";
1713 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001714 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001715 else {
1716 /* Some bits remain; they should be zero */
1717 if (base64buffer != 0) {
1718 errmsg = "non-zero padding bits in shift sequence";
1719 goto utf7Error;
1720 }
1721 }
1722 }
1723 if (ch != '-') {
1724 /* '-' is absorbed; other terminating
1725 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001726 *p++ = ch;
1727 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001728 }
1729 }
1730 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001731 startinpos = s-starts;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001732 s++; /* consume '+' */
1733 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001734 s++;
1735 *p++ = '+';
Antoine Pitrou653dece2009-05-04 18:32:32 +00001736 }
1737 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001738 inShift = 1;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001739 shiftOutStart = p;
1740 base64bits = 0;
Serhiy Storchakaf1056722013-10-19 20:37:49 +03001741 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001742 }
1743 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001744 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001745 *p++ = ch;
1746 s++;
1747 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001748 else {
1749 startinpos = s-starts;
1750 s++;
1751 errmsg = "unexpected special character";
1752 goto utf7Error;
1753 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001754 continue;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001755utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001756 outpos = p-PyUnicode_AS_UNICODE(unicode);
1757 endinpos = s-starts;
1758 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001759 errors, &errorHandler,
1760 "utf7", errmsg,
1761 starts, size, &startinpos, &endinpos, &exc, &s,
1762 &unicode, &outpos, &p))
1763 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001764 }
1765
Antoine Pitrou653dece2009-05-04 18:32:32 +00001766 /* end of string */
1767
1768 if (inShift && !consumed) { /* in shift sequence, no more to follow */
1769 /* if we're in an inconsistent state, that's an error */
1770 if (surrogate ||
1771 (base64bits >= 6) ||
1772 (base64bits > 0 && base64buffer != 0)) {
1773 outpos = p-PyUnicode_AS_UNICODE(unicode);
1774 endinpos = size;
1775 if (unicode_decode_call_errorhandler(
1776 errors, &errorHandler,
1777 "utf7", "unterminated shift sequence",
1778 starts, size, &startinpos, &endinpos, &exc, &s,
1779 &unicode, &outpos, &p))
1780 goto onError;
1781 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001782 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001783
1784 /* return state */
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001785 if (consumed) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001786 if (inShift) {
1787 p = shiftOutStart; /* back off output */
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001788 *consumed = startinpos;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001789 }
1790 else {
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001791 *consumed = s-starts;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001792 }
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001793 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001794
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001795 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001796 goto onError;
1797
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001798 Py_XDECREF(errorHandler);
1799 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001800 return (PyObject *)unicode;
1801
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001802 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001803 Py_XDECREF(errorHandler);
1804 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001805 Py_DECREF(unicode);
1806 return NULL;
1807}
1808
1809
1810PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001811 Py_ssize_t size,
Antoine Pitrou653dece2009-05-04 18:32:32 +00001812 int base64SetO,
1813 int base64WhiteSpace,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001814 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001815{
1816 PyObject *v;
1817 /* It might be possible to tighten this worst case */
Alexandre Vassalottifd009162009-07-07 02:17:30 +00001818 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001819 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001820 Py_ssize_t i = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001821 unsigned int base64bits = 0;
1822 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001823 char * out;
1824 char * start;
1825
Alexandre Vassalottifd009162009-07-07 02:17:30 +00001826 if (allocated / 8 != size)
Neal Norwitze7d8be82008-07-31 17:17:14 +00001827 return PyErr_NoMemory();
1828
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001829 if (size == 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00001830 return PyString_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001831
Antoine Pitrou653dece2009-05-04 18:32:32 +00001832 v = PyString_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001833 if (v == NULL)
1834 return NULL;
1835
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001836 start = out = PyString_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001837 for (;i < size; ++i) {
1838 Py_UNICODE ch = s[i];
1839
Antoine Pitrou653dece2009-05-04 18:32:32 +00001840 if (inShift) {
1841 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1842 /* shifting out */
1843 if (base64bits) { /* output remaining bits */
1844 *out++ = TO_BASE64(base64buffer << (6-base64bits));
1845 base64buffer = 0;
1846 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001847 }
1848 inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001849 /* Characters not in the BASE64 set implicitly unshift the sequence
1850 so no '-' is required, except if the character is itself a '-' */
1851 if (IS_BASE64(ch) || ch == '-') {
1852 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001853 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001854 *out++ = (char) ch;
1855 }
1856 else {
1857 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00001858 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001859 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001860 else { /* not in a shift sequence */
1861 if (ch == '+') {
1862 *out++ = '+';
1863 *out++ = '-';
1864 }
1865 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1866 *out++ = (char) ch;
1867 }
1868 else {
1869 *out++ = '+';
1870 inShift = 1;
1871 goto encode_char;
1872 }
1873 }
1874 continue;
1875encode_char:
1876#ifdef Py_UNICODE_WIDE
1877 if (ch >= 0x10000) {
1878 /* code first surrogate */
1879 base64bits += 16;
1880 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
1881 while (base64bits >= 6) {
1882 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1883 base64bits -= 6;
1884 }
1885 /* prepare second surrogate */
1886 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
1887 }
1888#endif
1889 base64bits += 16;
1890 base64buffer = (base64buffer << 16) | ch;
1891 while (base64bits >= 6) {
1892 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1893 base64bits -= 6;
1894 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001895 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001896 if (base64bits)
1897 *out++= TO_BASE64(base64buffer << (6-base64bits) );
1898 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001899 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001900
Benjamin Petersonbea424a2010-04-03 00:57:33 +00001901 if (_PyString_Resize(&v, out - start))
1902 return NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001903 return v;
1904}
1905
Antoine Pitrou653dece2009-05-04 18:32:32 +00001906#undef IS_BASE64
1907#undef FROM_BASE64
1908#undef TO_BASE64
1909#undef DECODE_DIRECT
1910#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001911
Guido van Rossumd57fd912000-03-10 22:53:23 +00001912/* --- UTF-8 Codec -------------------------------------------------------- */
1913
/* Read-only lookup table, indexed by the first byte of a UTF-8 sequence. */
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
1935
Guido van Rossumd57fd912000-03-10 22:53:23 +00001936PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001937 Py_ssize_t size,
1938 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001939{
Walter Dörwald69652032004-09-07 20:24:22 +00001940 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1941}
1942
1943PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001944 Py_ssize_t size,
1945 const char *errors,
1946 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001947{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001948 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001949 int n;
Ezio Melottie57e50c2010-06-05 17:51:07 +00001950 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001951 Py_ssize_t startinpos;
1952 Py_ssize_t endinpos;
1953 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001954 const char *e;
1955 PyUnicodeObject *unicode;
1956 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001957 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001958 PyObject *errorHandler = NULL;
1959 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001960
1961 /* Note: size will always be longer than the resulting Unicode
1962 character count */
1963 unicode = _PyUnicode_New(size);
1964 if (!unicode)
1965 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001966 if (size == 0) {
1967 if (consumed)
1968 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001969 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001970 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001971
1972 /* Unpack UTF-8 encoded data */
1973 p = unicode->str;
1974 e = s + size;
1975
1976 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001977 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001978
1979 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001980 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001981 s++;
1982 continue;
1983 }
1984
1985 n = utf8_code_length[ch];
1986
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001987 if (s + n > e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001988 if (consumed)
1989 break;
1990 else {
1991 errmsg = "unexpected end of data";
1992 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00001993 endinpos = startinpos+1;
1994 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
1995 endinpos++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001996 goto utf8Error;
1997 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00001998 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001999
2000 switch (n) {
2001
2002 case 0:
Ezio Melottie57e50c2010-06-05 17:51:07 +00002003 errmsg = "invalid start byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002004 startinpos = s-starts;
2005 endinpos = startinpos+1;
2006 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002007
2008 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002009 errmsg = "internal error";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002010 startinpos = s-starts;
2011 endinpos = startinpos+1;
2012 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002013
2014 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002015 if ((s[1] & 0xc0) != 0x80) {
Ezio Melottie57e50c2010-06-05 17:51:07 +00002016 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002017 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00002018 endinpos = startinpos + 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002019 goto utf8Error;
2020 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002021 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melottie57e50c2010-06-05 17:51:07 +00002022 assert ((ch > 0x007F) && (ch <= 0x07FF));
2023 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002024 break;
2025
2026 case 3:
Ezio Melottie57e50c2010-06-05 17:51:07 +00002027 /* XXX: surrogates shouldn't be valid UTF-8!
2028 see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2029 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
2030 Uncomment the 2 lines below to make them invalid,
2031 codepoints: d800-dfff; UTF-8: \xed\xa0\x80-\xed\xbf\xbf. */
Tim Petersced69f82003-09-16 20:30:58 +00002032 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melottie57e50c2010-06-05 17:51:07 +00002033 (s[2] & 0xc0) != 0x80 ||
2034 ((unsigned char)s[0] == 0xE0 &&
2035 (unsigned char)s[1] < 0xA0)/* ||
2036 ((unsigned char)s[0] == 0xED &&
2037 (unsigned char)s[1] > 0x9F)*/) {
2038 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002039 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00002040 endinpos = startinpos + 1;
2041
2042 /* if s[1] first two bits are 1 and 0, then the invalid
2043 continuation byte is s[2], so increment endinpos by 1,
2044 if not, s[1] is invalid and endinpos doesn't need to
2045 be incremented. */
2046 if ((s[1] & 0xC0) == 0x80)
2047 endinpos++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002048 goto utf8Error;
2049 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002050 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melottie57e50c2010-06-05 17:51:07 +00002051 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2052 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002053 break;
2054
2055 case 4:
2056 if ((s[1] & 0xc0) != 0x80 ||
2057 (s[2] & 0xc0) != 0x80 ||
Ezio Melottie57e50c2010-06-05 17:51:07 +00002058 (s[3] & 0xc0) != 0x80 ||
2059 ((unsigned char)s[0] == 0xF0 &&
2060 (unsigned char)s[1] < 0x90) ||
2061 ((unsigned char)s[0] == 0xF4 &&
2062 (unsigned char)s[1] > 0x8F)) {
2063 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002064 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00002065 endinpos = startinpos + 1;
2066 if ((s[1] & 0xC0) == 0x80) {
2067 endinpos++;
2068 if ((s[2] & 0xC0) == 0x80)
2069 endinpos++;
2070 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002071 goto utf8Error;
2072 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002073 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melottie57e50c2010-06-05 17:51:07 +00002074 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2075 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2076
Fredrik Lundh8f455852001-06-27 18:59:43 +00002077#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002078 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002079#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002080 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002081
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002082 /* translate from 10000..10FFFF to 0..FFFF */
2083 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002084
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002085 /* high surrogate = top 10 bits added to D800 */
2086 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002087
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002088 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002089 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002090#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002091 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002092 }
2093 s += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002094 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002095
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002096 utf8Error:
2097 outpos = p-PyUnicode_AS_UNICODE(unicode);
2098 if (unicode_decode_call_errorhandler(
2099 errors, &errorHandler,
2100 "utf8", errmsg,
2101 starts, size, &startinpos, &endinpos, &exc, &s,
2102 &unicode, &outpos, &p))
2103 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002104 }
Walter Dörwald69652032004-09-07 20:24:22 +00002105 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002106 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002107
2108 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002109 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002110 goto onError;
2111
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002112 Py_XDECREF(errorHandler);
2113 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002114 return (PyObject *)unicode;
2115
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002116 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002117 Py_XDECREF(errorHandler);
2118 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002119 Py_DECREF(unicode);
2120 return NULL;
2121}
2122
Tim Peters602f7402002-04-27 18:03:26 +00002123/* Allocation strategy: if the string is short, convert into a stack buffer
2124 and allocate exactly as much space needed at the end. Else allocate the
2125 maximum possible needed (4 result bytes per Unicode character), and return
2126 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002127*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002128PyObject *
2129PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002130 Py_ssize_t size,
2131 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002132{
Tim Peters602f7402002-04-27 18:03:26 +00002133#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002134
Martin v. Löwis18e16552006-02-15 17:27:45 +00002135 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00002136 PyObject *v; /* result string object */
2137 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002138 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002139 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002140 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002141
Tim Peters602f7402002-04-27 18:03:26 +00002142 assert(s != NULL);
2143 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002144
Tim Peters602f7402002-04-27 18:03:26 +00002145 if (size <= MAX_SHORT_UNICHARS) {
2146 /* Write into the stack buffer; nallocated can't overflow.
2147 * At the end, we'll allocate exactly as much heap space as it
2148 * turns out we need.
2149 */
2150 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2151 v = NULL; /* will allocate after we're done */
2152 p = stackbuf;
2153 }
2154 else {
2155 /* Overallocate on the heap, and give the excess back at the end. */
2156 nallocated = size * 4;
2157 if (nallocated / 4 != size) /* overflow! */
2158 return PyErr_NoMemory();
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002159 v = PyString_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00002160 if (v == NULL)
2161 return NULL;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002162 p = PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002163 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002164
Tim Peters602f7402002-04-27 18:03:26 +00002165 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002166 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002167
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002168 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002169 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002170 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002171
Guido van Rossumd57fd912000-03-10 22:53:23 +00002172 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002173 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002174 *p++ = (char)(0xc0 | (ch >> 6));
2175 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002176 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002177 else {
Tim Peters602f7402002-04-27 18:03:26 +00002178 /* Encode UCS2 Unicode ordinals */
2179 if (ch < 0x10000) {
2180 /* Special case: check for high surrogate */
2181 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2182 Py_UCS4 ch2 = s[i];
2183 /* Check for low surrogate and combine the two to
2184 form a UCS4 value */
2185 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002186 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002187 i++;
2188 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002189 }
Tim Peters602f7402002-04-27 18:03:26 +00002190 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002191 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002192 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002193 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2194 *p++ = (char)(0x80 | (ch & 0x3f));
2195 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00002196 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002197 encodeUCS4:
Tim Peters602f7402002-04-27 18:03:26 +00002198 /* Encode UCS4 Unicode ordinals */
2199 *p++ = (char)(0xf0 | (ch >> 18));
2200 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2201 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2202 *p++ = (char)(0x80 | (ch & 0x3f));
2203 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002204 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002205
Tim Peters602f7402002-04-27 18:03:26 +00002206 if (v == NULL) {
2207 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002208 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002209 assert(nneeded <= nallocated);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002210 v = PyString_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002211 }
2212 else {
Benjamin Peterson857ce152009-01-31 16:29:18 +00002213 /* Cut back to size actually needed. */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002214 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002215 assert(nneeded <= nallocated);
Benjamin Petersonbea424a2010-04-03 00:57:33 +00002216 if (_PyString_Resize(&v, nneeded))
2217 return NULL;
Tim Peters602f7402002-04-27 18:03:26 +00002218 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002219 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002220
Tim Peters602f7402002-04-27 18:03:26 +00002221#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002222}
2223
Guido van Rossumd57fd912000-03-10 22:53:23 +00002224PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2225{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002226 if (!PyUnicode_Check(unicode)) {
2227 PyErr_BadArgument();
2228 return NULL;
2229 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002230 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002231 PyUnicode_GET_SIZE(unicode),
2232 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002233}
2234
Walter Dörwald6e390802007-08-17 16:41:28 +00002235/* --- UTF-32 Codec ------------------------------------------------------- */
2236
2237PyObject *
2238PyUnicode_DecodeUTF32(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002239 Py_ssize_t size,
2240 const char *errors,
2241 int *byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002242{
2243 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2244}
2245
2246PyObject *
2247PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002248 Py_ssize_t size,
2249 const char *errors,
2250 int *byteorder,
2251 Py_ssize_t *consumed)
Walter Dörwald6e390802007-08-17 16:41:28 +00002252{
2253 const char *starts = s;
2254 Py_ssize_t startinpos;
2255 Py_ssize_t endinpos;
2256 Py_ssize_t outpos;
2257 PyUnicodeObject *unicode;
2258 Py_UNICODE *p;
2259#ifndef Py_UNICODE_WIDE
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002260 int pairs = 0;
Benjamin Peterson8e5effa2010-06-12 17:47:06 +00002261 const unsigned char *qq;
Walter Dörwald6e390802007-08-17 16:41:28 +00002262#else
2263 const int pairs = 0;
2264#endif
Benjamin Peterson8e5effa2010-06-12 17:47:06 +00002265 const unsigned char *q, *e;
Walter Dörwald6e390802007-08-17 16:41:28 +00002266 int bo = 0; /* assume native ordering by default */
2267 const char *errmsg = "";
Walter Dörwald20b40d32007-08-17 16:52:50 +00002268 /* Offsets from q for retrieving bytes in the right order. */
2269#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2270 int iorder[] = {0, 1, 2, 3};
2271#else
2272 int iorder[] = {3, 2, 1, 0};
2273#endif
Walter Dörwald9ab80a92007-08-17 16:58:43 +00002274 PyObject *errorHandler = NULL;
2275 PyObject *exc = NULL;
Chris Jerdonekad4b0002012-10-07 20:37:54 -07002276
Walter Dörwald6e390802007-08-17 16:41:28 +00002277 q = (unsigned char *)s;
2278 e = q + size;
2279
2280 if (byteorder)
2281 bo = *byteorder;
2282
2283 /* Check for BOM marks (U+FEFF) in the input and adjust current
2284 byte order setting accordingly. In native mode, the leading BOM
2285 mark is skipped, in all other modes, it is copied to the output
2286 stream as-is (giving a ZWNBSP character). */
2287 if (bo == 0) {
2288 if (size >= 4) {
2289 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002290 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002291#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002292 if (bom == 0x0000FEFF) {
2293 q += 4;
2294 bo = -1;
2295 }
2296 else if (bom == 0xFFFE0000) {
2297 q += 4;
2298 bo = 1;
2299 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002300#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002301 if (bom == 0x0000FEFF) {
2302 q += 4;
2303 bo = 1;
2304 }
2305 else if (bom == 0xFFFE0000) {
2306 q += 4;
2307 bo = -1;
2308 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002309#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002310 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002311 }
2312
2313 if (bo == -1) {
2314 /* force LE */
2315 iorder[0] = 0;
2316 iorder[1] = 1;
2317 iorder[2] = 2;
2318 iorder[3] = 3;
2319 }
2320 else if (bo == 1) {
2321 /* force BE */
2322 iorder[0] = 3;
2323 iorder[1] = 2;
2324 iorder[2] = 1;
2325 iorder[3] = 0;
2326 }
2327
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002328 /* On narrow builds we split characters outside the BMP into two
2329 codepoints => count how much extra space we need. */
2330#ifndef Py_UNICODE_WIDE
Serhiy Storchakac9631a12013-01-08 22:43:18 +02002331 for (qq = q; e - qq >= 4; qq += 4)
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002332 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2333 pairs++;
2334#endif
2335
2336 /* This might be one to much, because of a BOM */
2337 unicode = _PyUnicode_New((size+3)/4+pairs);
2338 if (!unicode)
2339 return NULL;
2340 if (size == 0)
2341 return (PyObject *)unicode;
2342
2343 /* Unpack UTF-32 encoded data */
2344 p = unicode->str;
2345
Walter Dörwald6e390802007-08-17 16:41:28 +00002346 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002347 Py_UCS4 ch;
2348 /* remaining bytes at the end? (size should be divisible by 4) */
2349 if (e-q<4) {
2350 if (consumed)
2351 break;
2352 errmsg = "truncated data";
2353 startinpos = ((const char *)q)-starts;
2354 endinpos = ((const char *)e)-starts;
2355 goto utf32Error;
2356 /* The remaining input chars are ignored if the callback
2357 chooses to skip the input */
2358 }
2359 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2360 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002361
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002362 if (ch >= 0x110000)
2363 {
2364 errmsg = "codepoint not in range(0x110000)";
2365 startinpos = ((const char *)q)-starts;
2366 endinpos = startinpos+4;
2367 goto utf32Error;
2368 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002369#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002370 if (ch >= 0x10000)
2371 {
2372 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2373 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2374 }
2375 else
Walter Dörwald6e390802007-08-17 16:41:28 +00002376#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002377 *p++ = ch;
2378 q += 4;
2379 continue;
2380 utf32Error:
2381 outpos = p-PyUnicode_AS_UNICODE(unicode);
2382 if (unicode_decode_call_errorhandler(
2383 errors, &errorHandler,
2384 "utf32", errmsg,
Georg Brandle9741f32009-09-17 11:28:09 +00002385 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002386 &unicode, &outpos, &p))
2387 goto onError;
Walter Dörwald6e390802007-08-17 16:41:28 +00002388 }
2389
2390 if (byteorder)
2391 *byteorder = bo;
2392
2393 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002394 *consumed = (const char *)q-starts;
Walter Dörwald6e390802007-08-17 16:41:28 +00002395
2396 /* Adjust length */
2397 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2398 goto onError;
2399
2400 Py_XDECREF(errorHandler);
2401 Py_XDECREF(exc);
2402 return (PyObject *)unicode;
2403
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002404 onError:
Walter Dörwald6e390802007-08-17 16:41:28 +00002405 Py_DECREF(unicode);
2406 Py_XDECREF(errorHandler);
2407 Py_XDECREF(exc);
2408 return NULL;
2409}
2410
2411PyObject *
2412PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002413 Py_ssize_t size,
2414 const char *errors,
2415 int byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002416{
2417 PyObject *v;
2418 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002419 Py_ssize_t nsize, bytesize;
Walter Dörwald6e390802007-08-17 16:41:28 +00002420#ifndef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002421 Py_ssize_t i, pairs;
Walter Dörwald6e390802007-08-17 16:41:28 +00002422#else
2423 const int pairs = 0;
2424#endif
2425 /* Offsets from p for storing byte pairs in the right order. */
2426#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2427 int iorder[] = {0, 1, 2, 3};
2428#else
2429 int iorder[] = {3, 2, 1, 0};
2430#endif
2431
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002432#define STORECHAR(CH) \
2433 do { \
2434 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2435 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2436 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2437 p[iorder[0]] = (CH) & 0xff; \
2438 p += 4; \
Walter Dörwald6e390802007-08-17 16:41:28 +00002439 } while(0)
2440
2441 /* In narrow builds we can output surrogate pairs as one codepoint,
2442 so we need less space. */
2443#ifndef Py_UNICODE_WIDE
2444 for (i = pairs = 0; i < size-1; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002445 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2446 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2447 pairs++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002448#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002449 nsize = (size - pairs + (byteorder == 0));
2450 bytesize = nsize * 4;
2451 if (bytesize / 4 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002452 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002453 v = PyString_FromStringAndSize(NULL, bytesize);
Walter Dörwald6e390802007-08-17 16:41:28 +00002454 if (v == NULL)
2455 return NULL;
2456
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002457 p = (unsigned char *)PyString_AS_STRING(v);
Walter Dörwald6e390802007-08-17 16:41:28 +00002458 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002459 STORECHAR(0xFEFF);
Walter Dörwald6e390802007-08-17 16:41:28 +00002460 if (size == 0)
2461 return v;
2462
2463 if (byteorder == -1) {
2464 /* force LE */
2465 iorder[0] = 0;
2466 iorder[1] = 1;
2467 iorder[2] = 2;
2468 iorder[3] = 3;
2469 }
2470 else if (byteorder == 1) {
2471 /* force BE */
2472 iorder[0] = 3;
2473 iorder[1] = 2;
2474 iorder[2] = 1;
2475 iorder[3] = 0;
2476 }
2477
2478 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002479 Py_UCS4 ch = *s++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002480#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002481 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2482 Py_UCS4 ch2 = *s;
2483 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2484 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2485 s++;
2486 size--;
2487 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00002488 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002489#endif
2490 STORECHAR(ch);
2491 }
2492 return v;
2493#undef STORECHAR
2494}
2495
2496PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2497{
2498 if (!PyUnicode_Check(unicode)) {
2499 PyErr_BadArgument();
2500 return NULL;
2501 }
2502 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002503 PyUnicode_GET_SIZE(unicode),
2504 NULL,
2505 0);
Walter Dörwald6e390802007-08-17 16:41:28 +00002506}
2507
Guido van Rossumd57fd912000-03-10 22:53:23 +00002508/* --- UTF-16 Codec ------------------------------------------------------- */
2509
Tim Peters772747b2001-08-09 22:21:55 +00002510PyObject *
2511PyUnicode_DecodeUTF16(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002512 Py_ssize_t size,
2513 const char *errors,
2514 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002515{
Walter Dörwald69652032004-09-07 20:24:22 +00002516 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2517}
2518
2519PyObject *
2520PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002521 Py_ssize_t size,
2522 const char *errors,
2523 int *byteorder,
2524 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002525{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002526 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002527 Py_ssize_t startinpos;
2528 Py_ssize_t endinpos;
2529 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002530 PyUnicodeObject *unicode;
2531 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002532 const unsigned char *q, *e;
2533 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002534 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002535 /* Offsets from q for retrieving byte pairs in the right order. */
2536#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2537 int ihi = 1, ilo = 0;
2538#else
2539 int ihi = 0, ilo = 1;
2540#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002541 PyObject *errorHandler = NULL;
2542 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002543
2544 /* Note: size will always be longer than the resulting Unicode
2545 character count */
2546 unicode = _PyUnicode_New(size);
2547 if (!unicode)
2548 return NULL;
2549 if (size == 0)
2550 return (PyObject *)unicode;
2551
2552 /* Unpack UTF-16 encoded data */
2553 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002554 q = (unsigned char *)s;
2555 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002556
2557 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002558 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002559
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002560 /* Check for BOM marks (U+FEFF) in the input and adjust current
2561 byte order setting accordingly. In native mode, the leading BOM
2562 mark is skipped, in all other modes, it is copied to the output
2563 stream as-is (giving a ZWNBSP character). */
2564 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002565 if (size >= 2) {
2566 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002567#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002568 if (bom == 0xFEFF) {
2569 q += 2;
2570 bo = -1;
2571 }
2572 else if (bom == 0xFFFE) {
2573 q += 2;
2574 bo = 1;
2575 }
Tim Petersced69f82003-09-16 20:30:58 +00002576#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002577 if (bom == 0xFEFF) {
2578 q += 2;
2579 bo = 1;
2580 }
2581 else if (bom == 0xFFFE) {
2582 q += 2;
2583 bo = -1;
2584 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002585#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002586 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002587 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002588
Tim Peters772747b2001-08-09 22:21:55 +00002589 if (bo == -1) {
2590 /* force LE */
2591 ihi = 1;
2592 ilo = 0;
2593 }
2594 else if (bo == 1) {
2595 /* force BE */
2596 ihi = 0;
2597 ilo = 1;
2598 }
2599
2600 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002601 Py_UNICODE ch;
2602 /* remaining bytes at the end? (size should be even) */
2603 if (e-q<2) {
2604 if (consumed)
2605 break;
2606 errmsg = "truncated data";
2607 startinpos = ((const char *)q)-starts;
2608 endinpos = ((const char *)e)-starts;
2609 goto utf16Error;
2610 /* The remaining input chars are ignored if the callback
2611 chooses to skip the input */
2612 }
2613 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002614
Benjamin Peterson857ce152009-01-31 16:29:18 +00002615 q += 2;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002616
2617 if (ch < 0xD800 || ch > 0xDFFF) {
2618 *p++ = ch;
2619 continue;
2620 }
2621
2622 /* UTF-16 code pair: */
Antoine Pitrou715a63b2012-07-21 00:52:06 +02002623 if (e - q < 2) {
Serhiy Storchakac4b82c02013-01-08 23:12:00 +02002624 q -= 2;
2625 if (consumed)
2626 break;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002627 errmsg = "unexpected end of data";
Serhiy Storchakac4b82c02013-01-08 23:12:00 +02002628 startinpos = ((const char *)q)-starts;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002629 endinpos = ((const char *)e)-starts;
2630 goto utf16Error;
2631 }
2632 if (0xD800 <= ch && ch <= 0xDBFF) {
2633 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2634 q += 2;
2635 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002636#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002637 *p++ = ch;
2638 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002639#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002640 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002641#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002642 continue;
2643 }
2644 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002645 errmsg = "illegal UTF-16 surrogate";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002646 startinpos = (((const char *)q)-4)-starts;
2647 endinpos = startinpos+2;
2648 goto utf16Error;
2649 }
2650
Benjamin Peterson857ce152009-01-31 16:29:18 +00002651 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002652 errmsg = "illegal encoding";
2653 startinpos = (((const char *)q)-2)-starts;
2654 endinpos = startinpos+2;
2655 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002656
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002657 utf16Error:
2658 outpos = p-PyUnicode_AS_UNICODE(unicode);
2659 if (unicode_decode_call_errorhandler(
2660 errors, &errorHandler,
2661 "utf16", errmsg,
2662 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2663 &unicode, &outpos, &p))
2664 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002665 }
2666
2667 if (byteorder)
2668 *byteorder = bo;
2669
Walter Dörwald69652032004-09-07 20:24:22 +00002670 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002671 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00002672
Guido van Rossumd57fd912000-03-10 22:53:23 +00002673 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002674 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002675 goto onError;
2676
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002677 Py_XDECREF(errorHandler);
2678 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002679 return (PyObject *)unicode;
2680
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002681 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002682 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002683 Py_XDECREF(errorHandler);
2684 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002685 return NULL;
2686}
2687
Tim Peters772747b2001-08-09 22:21:55 +00002688PyObject *
2689PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002690 Py_ssize_t size,
2691 const char *errors,
2692 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002693{
2694 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002695 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002696 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002697#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002698 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002699#else
2700 const int pairs = 0;
2701#endif
Tim Peters772747b2001-08-09 22:21:55 +00002702 /* Offsets from p for storing byte pairs in the right order. */
2703#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2704 int ihi = 1, ilo = 0;
2705#else
2706 int ihi = 0, ilo = 1;
2707#endif
2708
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002709#define STORECHAR(CH) \
2710 do { \
2711 p[ihi] = ((CH) >> 8) & 0xff; \
2712 p[ilo] = (CH) & 0xff; \
2713 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00002714 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002715
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002716#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002717 for (i = pairs = 0; i < size; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002718 if (s[i] >= 0x10000)
2719 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002720#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002721 /* 2 * (size + pairs + (byteorder == 0)) */
2722 if (size > PY_SSIZE_T_MAX ||
2723 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002724 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002725 nsize = size + pairs + (byteorder == 0);
2726 bytesize = nsize * 2;
2727 if (bytesize / 2 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002728 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002729 v = PyString_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002730 if (v == NULL)
2731 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002732
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002733 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002734 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002735 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002736 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002737 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002738
2739 if (byteorder == -1) {
2740 /* force LE */
2741 ihi = 1;
2742 ilo = 0;
2743 }
2744 else if (byteorder == 1) {
2745 /* force BE */
2746 ihi = 0;
2747 ilo = 1;
2748 }
2749
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002750 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002751 Py_UNICODE ch = *s++;
2752 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002753#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002754 if (ch >= 0x10000) {
2755 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2756 ch = 0xD800 | ((ch-0x10000) >> 10);
2757 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002758#endif
Tim Peters772747b2001-08-09 22:21:55 +00002759 STORECHAR(ch);
2760 if (ch2)
2761 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002762 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002763 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002764#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002765}
2766
2767PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2768{
2769 if (!PyUnicode_Check(unicode)) {
2770 PyErr_BadArgument();
2771 return NULL;
2772 }
2773 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002774 PyUnicode_GET_SIZE(unicode),
2775 NULL,
2776 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002777}
2778
2779/* --- Unicode Escape Codec ----------------------------------------------- */
2780
/* Cached pointer to the unicodedata name-lookup C API.  It starts out NULL
   and is filled in on demand: PyUnicode_DecodeUnicodeEscape() imports the
   capsule named by PyUnicodeData_CAPSULE_NAME the first time it needs to
   resolve a \N{name} escape, then calls ->getcode() through this pointer. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00002781static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002782
Guido van Rossumd57fd912000-03-10 22:53:23 +00002783PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002784 Py_ssize_t size,
2785 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002786{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002787 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002788 Py_ssize_t startinpos;
2789 Py_ssize_t endinpos;
2790 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002791 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002792 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002793 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002794 char* message;
2795 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002796 PyObject *errorHandler = NULL;
2797 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002798
Guido van Rossumd57fd912000-03-10 22:53:23 +00002799 /* Escaped strings will always be longer than the resulting
2800 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002801 length after conversion to the true value.
2802 (but if the error callback returns a long replacement string
2803 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002804 v = _PyUnicode_New(size);
2805 if (v == NULL)
2806 goto onError;
2807 if (size == 0)
2808 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002809
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002810 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002811 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002812
Guido van Rossumd57fd912000-03-10 22:53:23 +00002813 while (s < end) {
2814 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002815 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002816 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002817
2818 /* Non-escape characters are interpreted as Unicode ordinals */
2819 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002820 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002821 continue;
2822 }
2823
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002824 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002825 /* \ - Escapes */
2826 s++;
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002827 c = *s++;
2828 if (s > end)
2829 c = '\0'; /* Invalid after \ */
2830 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002831
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002832 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002833 case '\n': break;
2834 case '\\': *p++ = '\\'; break;
2835 case '\'': *p++ = '\''; break;
2836 case '\"': *p++ = '\"'; break;
2837 case 'b': *p++ = '\b'; break;
2838 case 'f': *p++ = '\014'; break; /* FF */
2839 case 't': *p++ = '\t'; break;
2840 case 'n': *p++ = '\n'; break;
2841 case 'r': *p++ = '\r'; break;
2842 case 'v': *p++ = '\013'; break; /* VT */
2843 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2844
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002845 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002846 case '0': case '1': case '2': case '3':
2847 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002848 x = s[-1] - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002849 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002850 x = (x<<3) + *s++ - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002851 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002852 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002853 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002854 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002855 break;
2856
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002857 /* hex escapes */
2858 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002859 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002860 digits = 2;
2861 message = "truncated \\xXX escape";
2862 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002863
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002864 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002865 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002866 digits = 4;
2867 message = "truncated \\uXXXX escape";
2868 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002869
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002870 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002871 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002872 digits = 8;
2873 message = "truncated \\UXXXXXXXX escape";
2874 hexescape:
2875 chr = 0;
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002876 if (end - s < digits) {
2877 /* count only hex digits */
2878 for (; s < end; ++s) {
2879 c = (unsigned char)*s;
2880 if (!Py_ISXDIGIT(c))
2881 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002882 }
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002883 goto error;
2884 }
2885 for (; digits--; ++s) {
2886 c = (unsigned char)*s;
2887 if (!Py_ISXDIGIT(c))
2888 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002889 chr = (chr<<4) & ~0xF;
2890 if (c >= '0' && c <= '9')
2891 chr += c - '0';
2892 else if (c >= 'a' && c <= 'f')
2893 chr += 10 + c - 'a';
2894 else
2895 chr += 10 + c - 'A';
2896 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002897 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002898 /* _decoding_error will have already written into the
2899 target buffer. */
2900 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002901 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002902 /* when we get here, chr is a 32-bit unicode character */
2903 if (chr <= 0xffff)
2904 /* UCS-2 character */
2905 *p++ = (Py_UNICODE) chr;
2906 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002907 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002908 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002909#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002910 *p++ = chr;
2911#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002912 chr -= 0x10000L;
2913 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002914 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002915#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002916 } else {
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002917 message = "illegal Unicode character";
2918 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002919 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002920 break;
2921
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002922 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00002923 case 'N':
2924 message = "malformed \\N character escape";
2925 if (ucnhash_CAPI == NULL) {
2926 /* load the unicode data module */
Larry Hastings402b73f2010-03-25 00:54:54 +00002927 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002928 if (ucnhash_CAPI == NULL)
2929 goto ucnhashError;
2930 }
2931 if (*s == '{') {
2932 const char *start = s+1;
2933 /* look for the closing brace */
2934 while (*s != '}' && s < end)
2935 s++;
2936 if (s > start && s < end && *s == '}') {
2937 /* found a name. look it up in the unicode database */
2938 message = "unknown Unicode character name";
2939 s++;
Serhiy Storchaka1d3acd42013-01-21 11:48:24 +02002940 if (s - start - 1 <= INT_MAX &&
2941 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002942 goto store;
2943 }
2944 }
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002945 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002946
2947 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002948 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002949 message = "\\ at end of string";
2950 s--;
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002951 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00002952 }
2953 else {
2954 *p++ = '\\';
2955 *p++ = (unsigned char)s[-1];
2956 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002957 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002958 }
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002959 continue;
2960
2961 error:
2962 endinpos = s-starts;
2963 outpos = p-PyUnicode_AS_UNICODE(v);
2964 if (unicode_decode_call_errorhandler(
2965 errors, &errorHandler,
2966 "unicodeescape", message,
2967 starts, size, &startinpos, &endinpos, &exc, &s,
2968 &v, &outpos, &p))
2969 goto onError;
2970 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002971 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002972 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002973 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002974 Py_XDECREF(errorHandler);
2975 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002976 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002977
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002978 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002979 PyErr_SetString(
2980 PyExc_UnicodeError,
2981 "\\N escapes not supported (can't load unicodedata module)"
2982 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002983 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002984 Py_XDECREF(errorHandler);
2985 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002986 return NULL;
2987
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002988 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002989 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002990 Py_XDECREF(errorHandler);
2991 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002992 return NULL;
2993}
2994
/* unicodeescape_string(), defined after the findchar() helper below,
   returns a Unicode-Escape string version of the Unicode object.

   If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate.

*/
3001
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00003002Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003003 Py_ssize_t size,
3004 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00003005{
3006 /* like wcschr, but doesn't stop at NULL characters */
3007
3008 while (size-- > 0) {
3009 if (*s == ch)
3010 return s;
3011 s++;
3012 }
3013
3014 return NULL;
3015}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003016
Guido van Rossumd57fd912000-03-10 22:53:23 +00003017static
3018PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003019 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003020 int quotes)
3021{
3022 PyObject *repr;
3023 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003024
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003025 static const char *hexdigit = "0123456789abcdef";
Neal Norwitze7d8be82008-07-31 17:17:14 +00003026#ifdef Py_UNICODE_WIDE
3027 const Py_ssize_t expandsize = 10;
3028#else
3029 const Py_ssize_t expandsize = 6;
3030#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003031
Neal Norwitz17753ec2006-08-21 22:21:19 +00003032 /* XXX(nnorwitz): rather than over-allocating, it would be
3033 better to choose a different scheme. Perhaps scan the
3034 first N-chars of the string and allocate based on that size.
3035 */
3036 /* Initial allocation is based on the longest-possible unichr
3037 escape.
3038
3039 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3040 unichr, so in this case it's the longest unichr escape. In
3041 narrow (UTF-16) builds this is five chars per source unichr
3042 since there are two unichrs in the surrogate pair, so in narrow
3043 (UTF-16) builds it's not the longest unichr escape.
3044
3045 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3046 so in the narrow (UTF-16) build case it's the longest unichr
3047 escape.
3048 */
3049
Neal Norwitze7d8be82008-07-31 17:17:14 +00003050 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003051 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00003052
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003053 repr = PyString_FromStringAndSize(NULL,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003054 2
3055 + expandsize*size
3056 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003057 if (repr == NULL)
3058 return NULL;
3059
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003060 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003061
3062 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003063 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00003064 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00003065 !findchar(s, size, '"')) ? '"' : '\'';
3066 }
3067 while (size-- > 0) {
3068 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003069
Hye-Shik Chang835b2432005-12-17 04:38:31 +00003070 /* Escape quotes and backslashes */
3071 if ((quotes &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003072 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003073 *p++ = '\\';
3074 *p++ = (char) ch;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003075 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003076 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003077
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003078#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003079 /* Map 21-bit characters to '\U00xxxxxx' */
3080 else if (ch >= 0x10000) {
3081 *p++ = '\\';
3082 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003083 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
3084 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
3085 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
3086 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
3087 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
3088 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
3089 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003090 *p++ = hexdigit[ch & 0x0000000F];
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003091 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003092 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003093#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003094 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3095 else if (ch >= 0xD800 && ch < 0xDC00) {
3096 Py_UNICODE ch2;
3097 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003098
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003099 ch2 = *s++;
3100 size--;
Georg Brandle27d0442010-08-01 20:54:30 +00003101 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003102 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3103 *p++ = '\\';
3104 *p++ = 'U';
3105 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
3106 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
3107 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
3108 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
3109 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
3110 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
3111 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
3112 *p++ = hexdigit[ucs & 0x0000000F];
3113 continue;
3114 }
3115 /* Fall through: isolated surrogates are copied as-is */
3116 s--;
3117 size++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003118 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003119#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003120
Guido van Rossumd57fd912000-03-10 22:53:23 +00003121 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003122 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003123 *p++ = '\\';
3124 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003125 *p++ = hexdigit[(ch >> 12) & 0x000F];
3126 *p++ = hexdigit[(ch >> 8) & 0x000F];
3127 *p++ = hexdigit[(ch >> 4) & 0x000F];
3128 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003129 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003130
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003131 /* Map special whitespace to '\t', \n', '\r' */
3132 else if (ch == '\t') {
3133 *p++ = '\\';
3134 *p++ = 't';
3135 }
3136 else if (ch == '\n') {
3137 *p++ = '\\';
3138 *p++ = 'n';
3139 }
3140 else if (ch == '\r') {
3141 *p++ = '\\';
3142 *p++ = 'r';
3143 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003144
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003145 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003146 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003147 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003148 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003149 *p++ = hexdigit[(ch >> 4) & 0x000F];
3150 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003151 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003152
Guido van Rossumd57fd912000-03-10 22:53:23 +00003153 /* Copy everything else as-is */
3154 else
3155 *p++ = (char) ch;
3156 }
3157 if (quotes)
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003158 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003159
3160 *p = '\0';
Benjamin Petersonbea424a2010-04-03 00:57:33 +00003161 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
3162 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003163 return repr;
3164}
3165
3166PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003167 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003168{
3169 return unicodeescape_string(s, size, 0);
3170}
3171
3172PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3173{
3174 if (!PyUnicode_Check(unicode)) {
3175 PyErr_BadArgument();
3176 return NULL;
3177 }
3178 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003179 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003180}
3181
3182/* --- Raw Unicode Escape Codec ------------------------------------------- */
3183
3184PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003185 Py_ssize_t size,
3186 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003187{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003188 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003189 Py_ssize_t startinpos;
3190 Py_ssize_t endinpos;
3191 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003192 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003193 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003194 const char *end;
3195 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003196 PyObject *errorHandler = NULL;
3197 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003198
Guido van Rossumd57fd912000-03-10 22:53:23 +00003199 /* Escaped strings will always be longer than the resulting
3200 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003201 length after conversion to the true value. (But decoding error
3202 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003203 v = _PyUnicode_New(size);
3204 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003205 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003206 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003207 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003208 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003209 end = s + size;
3210 while (s < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003211 unsigned char c;
3212 Py_UCS4 x;
3213 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003214 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003215
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003216 /* Non-escape characters are interpreted as Unicode ordinals */
3217 if (*s != '\\') {
3218 *p++ = (unsigned char)*s++;
3219 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003220 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003221 startinpos = s-starts;
3222
3223 /* \u-escapes are only interpreted iff the number of leading
3224 backslashes if odd */
3225 bs = s;
3226 for (;s < end;) {
3227 if (*s != '\\')
3228 break;
3229 *p++ = (unsigned char)*s++;
3230 }
3231 if (((s - bs) & 1) == 0 ||
3232 s >= end ||
3233 (*s != 'u' && *s != 'U')) {
3234 continue;
3235 }
3236 p--;
3237 count = *s=='u' ? 4 : 8;
3238 s++;
3239
3240 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3241 outpos = p-PyUnicode_AS_UNICODE(v);
3242 for (x = 0, i = 0; i < count; ++i, ++s) {
3243 c = (unsigned char)*s;
3244 if (!isxdigit(c)) {
3245 endinpos = s-starts;
3246 if (unicode_decode_call_errorhandler(
3247 errors, &errorHandler,
3248 "rawunicodeescape", "truncated \\uXXXX",
3249 starts, size, &startinpos, &endinpos, &exc, &s,
3250 &v, &outpos, &p))
3251 goto onError;
3252 goto nextByte;
3253 }
3254 x = (x<<4) & ~0xF;
3255 if (c >= '0' && c <= '9')
3256 x += c - '0';
3257 else if (c >= 'a' && c <= 'f')
3258 x += 10 + c - 'a';
3259 else
3260 x += 10 + c - 'A';
3261 }
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003262 if (x <= 0xffff)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003263 /* UCS-2 character */
3264 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003265 else if (x <= 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003266 /* UCS-4 character. Either store directly, or as
3267 surrogate pair. */
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003268#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003269 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003270#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003271 x -= 0x10000L;
3272 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3273 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003274#endif
3275 } else {
3276 endinpos = s-starts;
3277 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003278 if (unicode_decode_call_errorhandler(
3279 errors, &errorHandler,
3280 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003281 starts, size, &startinpos, &endinpos, &exc, &s,
3282 &v, &outpos, &p))
3283 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003284 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003285 nextByte:
3286 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003287 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00003288 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003289 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003290 Py_XDECREF(errorHandler);
3291 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003292 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003293
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003294 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003295 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003296 Py_XDECREF(errorHandler);
3297 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003298 return NULL;
3299}
3300
3301PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003302 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003303{
3304 PyObject *repr;
3305 char *p;
3306 char *q;
3307
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003308 static const char *hexdigit = "0123456789abcdef";
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003309#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00003310 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003311#else
Neal Norwitze7d8be82008-07-31 17:17:14 +00003312 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003313#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00003314
Neal Norwitze7d8be82008-07-31 17:17:14 +00003315 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003316 return PyErr_NoMemory();
Benjamin Peterson857ce152009-01-31 16:29:18 +00003317
Neal Norwitze7d8be82008-07-31 17:17:14 +00003318 repr = PyString_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003319 if (repr == NULL)
3320 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003321 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003322 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003323
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003324 p = q = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003325 while (size-- > 0) {
3326 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003327#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003328 /* Map 32-bit characters to '\Uxxxxxxxx' */
3329 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003330 *p++ = '\\';
3331 *p++ = 'U';
3332 *p++ = hexdigit[(ch >> 28) & 0xf];
3333 *p++ = hexdigit[(ch >> 24) & 0xf];
3334 *p++ = hexdigit[(ch >> 20) & 0xf];
3335 *p++ = hexdigit[(ch >> 16) & 0xf];
3336 *p++ = hexdigit[(ch >> 12) & 0xf];
3337 *p++ = hexdigit[(ch >> 8) & 0xf];
3338 *p++ = hexdigit[(ch >> 4) & 0xf];
3339 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003340 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003341 else
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003342#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003343 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3344 if (ch >= 0xD800 && ch < 0xDC00) {
3345 Py_UNICODE ch2;
3346 Py_UCS4 ucs;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003347
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003348 ch2 = *s++;
3349 size--;
Georg Brandle27d0442010-08-01 20:54:30 +00003350 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003351 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3352 *p++ = '\\';
3353 *p++ = 'U';
3354 *p++ = hexdigit[(ucs >> 28) & 0xf];
3355 *p++ = hexdigit[(ucs >> 24) & 0xf];
3356 *p++ = hexdigit[(ucs >> 20) & 0xf];
3357 *p++ = hexdigit[(ucs >> 16) & 0xf];
3358 *p++ = hexdigit[(ucs >> 12) & 0xf];
3359 *p++ = hexdigit[(ucs >> 8) & 0xf];
3360 *p++ = hexdigit[(ucs >> 4) & 0xf];
3361 *p++ = hexdigit[ucs & 0xf];
3362 continue;
3363 }
3364 /* Fall through: isolated surrogates are copied as-is */
3365 s--;
3366 size++;
3367 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003368#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003369 /* Map 16-bit characters to '\uxxxx' */
3370 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003371 *p++ = '\\';
3372 *p++ = 'u';
3373 *p++ = hexdigit[(ch >> 12) & 0xf];
3374 *p++ = hexdigit[(ch >> 8) & 0xf];
3375 *p++ = hexdigit[(ch >> 4) & 0xf];
3376 *p++ = hexdigit[ch & 15];
3377 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003378 /* Copy everything else as-is */
3379 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003380 *p++ = (char) ch;
3381 }
3382 *p = '\0';
Benjamin Petersonbea424a2010-04-03 00:57:33 +00003383 if (_PyString_Resize(&repr, p - q))
3384 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003385 return repr;
3386}
3387
3388PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3389{
3390 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003391 PyErr_BadArgument();
3392 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003393 }
3394 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003395 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003396}
3397
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003398/* --- Unicode Internal Codec ------------------------------------------- */
3399
3400PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003401 Py_ssize_t size,
3402 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003403{
3404 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003405 Py_ssize_t startinpos;
3406 Py_ssize_t endinpos;
3407 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003408 PyUnicodeObject *v;
3409 Py_UNICODE *p;
3410 const char *end;
3411 const char *reason;
3412 PyObject *errorHandler = NULL;
3413 PyObject *exc = NULL;
3414
Neal Norwitzd43069c2006-01-08 01:12:10 +00003415#ifdef Py_UNICODE_WIDE
3416 Py_UNICODE unimax = PyUnicode_GetMax();
3417#endif
3418
Armin Rigo7ccbca92006-10-04 12:17:45 +00003419 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003420 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3421 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003422 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003423 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003424 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003425 p = PyUnicode_AS_UNICODE(v);
3426 end = s + size;
3427
3428 while (s < end) {
Serhiy Storchakad5327d92013-02-07 16:23:11 +02003429 if (end-s < Py_UNICODE_SIZE) {
3430 endinpos = end-starts;
3431 reason = "truncated input";
3432 goto error;
3433 }
Neal Norwitz1004a532006-05-15 07:17:23 +00003434 memcpy(p, s, sizeof(Py_UNICODE));
Serhiy Storchakad5327d92013-02-07 16:23:11 +02003435#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003436 /* We have to sanity check the raw data, otherwise doom looms for
3437 some malformed UCS-4 data. */
Serhiy Storchakad5327d92013-02-07 16:23:11 +02003438 if (*p > unimax || *p < 0) {
3439 endinpos = s - starts + Py_UNICODE_SIZE;
3440 reason = "illegal code point (> 0x10FFFF)";
3441 goto error;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003442 }
Serhiy Storchakad5327d92013-02-07 16:23:11 +02003443#endif
3444 p++;
3445 s += Py_UNICODE_SIZE;
3446 continue;
3447
3448 error:
3449 startinpos = s - starts;
3450 outpos = p - PyUnicode_AS_UNICODE(v);
3451 if (unicode_decode_call_errorhandler(
3452 errors, &errorHandler,
3453 "unicode_internal", reason,
3454 starts, size, &startinpos, &endinpos, &exc, &s,
3455 &v, &outpos, &p)) {
3456 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003457 }
3458 }
3459
Martin v. Löwis412fb672006-04-13 06:34:32 +00003460 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003461 goto onError;
3462 Py_XDECREF(errorHandler);
3463 Py_XDECREF(exc);
3464 return (PyObject *)v;
3465
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003466 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003467 Py_XDECREF(v);
3468 Py_XDECREF(errorHandler);
3469 Py_XDECREF(exc);
3470 return NULL;
3471}
3472
Guido van Rossumd57fd912000-03-10 22:53:23 +00003473/* --- Latin-1 Codec ------------------------------------------------------ */
3474
3475PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003476 Py_ssize_t size,
3477 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003478{
3479 PyUnicodeObject *v;
3480 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003481
Guido van Rossumd57fd912000-03-10 22:53:23 +00003482 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003483 if (size == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003484 Py_UNICODE r = *(unsigned char*)s;
3485 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003486 }
3487
Guido van Rossumd57fd912000-03-10 22:53:23 +00003488 v = _PyUnicode_New(size);
3489 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003490 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003491 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003492 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003493 p = PyUnicode_AS_UNICODE(v);
3494 while (size-- > 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003495 *p++ = (unsigned char)*s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003496 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003497
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003498 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003499 Py_XDECREF(v);
3500 return NULL;
3501}
3502
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003503/* create or adjust a UnicodeEncodeError */
3504static void make_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003505 const char *encoding,
3506 const Py_UNICODE *unicode, Py_ssize_t size,
3507 Py_ssize_t startpos, Py_ssize_t endpos,
3508 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003509{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003510 if (*exceptionObject == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003511 *exceptionObject = PyUnicodeEncodeError_Create(
3512 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003513 }
3514 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003515 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3516 goto onError;
3517 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3518 goto onError;
3519 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3520 goto onError;
3521 return;
3522 onError:
3523 Py_DECREF(*exceptionObject);
3524 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003525 }
3526}
3527
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003528/* raises a UnicodeEncodeError */
3529static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003530 const char *encoding,
3531 const Py_UNICODE *unicode, Py_ssize_t size,
3532 Py_ssize_t startpos, Py_ssize_t endpos,
3533 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003534{
3535 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003536 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003537 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003538 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003539}
3540
3541/* error handling callback helper:
3542 build arguments, call the callback and check the arguments,
3543 put the result into newpos and return the replacement string, which
3544 has to be freed by the caller */
3545static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003546 PyObject **errorHandler,
3547 const char *encoding, const char *reason,
3548 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3549 Py_ssize_t startpos, Py_ssize_t endpos,
3550 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003551{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003552 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003553
3554 PyObject *restuple;
3555 PyObject *resunicode;
3556
3557 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003558 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003559 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003560 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003561 }
3562
3563 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003564 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003565 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003566 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003567
3568 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003569 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003570 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003571 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003572 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00003573 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003574 Py_DECREF(restuple);
3575 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003576 }
3577 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003578 &resunicode, newpos)) {
3579 Py_DECREF(restuple);
3580 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003581 }
3582 if (*newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003583 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003584 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003585 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3586 Py_DECREF(restuple);
3587 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003588 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003589 Py_INCREF(resunicode);
3590 Py_DECREF(restuple);
3591 return resunicode;
3592}
3593
3594static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003595 Py_ssize_t size,
3596 const char *errors,
3597 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003598{
3599 /* output object */
3600 PyObject *res;
3601 /* pointers to the beginning and end+1 of input */
3602 const Py_UNICODE *startp = p;
3603 const Py_UNICODE *endp = p + size;
3604 /* pointer to the beginning of the unencodable characters */
3605 /* const Py_UNICODE *badp = NULL; */
3606 /* pointer into the output */
3607 char *str;
3608 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003609 Py_ssize_t respos = 0;
3610 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00003611 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3612 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003613 PyObject *errorHandler = NULL;
3614 PyObject *exc = NULL;
3615 /* the following variable is used for caching string comparisons
3616 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3617 int known_errorHandler = -1;
3618
3619 /* allocate enough for a simple encoding without
3620 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003621 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003622 if (res == NULL)
3623 goto onError;
3624 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003625 return res;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003626 str = PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003627 ressize = size;
3628
3629 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003630 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003631
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003632 /* can we encode this? */
3633 if (c<limit) {
3634 /* no overflow check, because we know that the space is enough */
3635 *str++ = (char)c;
3636 ++p;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003637 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003638 else {
3639 Py_ssize_t unicodepos = p-startp;
3640 Py_ssize_t requiredsize;
3641 PyObject *repunicode;
3642 Py_ssize_t repsize;
3643 Py_ssize_t newpos;
3644 Py_ssize_t respos;
3645 Py_UNICODE *uni2;
3646 /* startpos for collecting unencodable chars */
3647 const Py_UNICODE *collstart = p;
3648 const Py_UNICODE *collend = p;
3649 /* find all unecodable characters */
3650 while ((collend < endp) && ((*collend)>=limit))
3651 ++collend;
3652 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3653 if (known_errorHandler==-1) {
3654 if ((errors==NULL) || (!strcmp(errors, "strict")))
3655 known_errorHandler = 1;
3656 else if (!strcmp(errors, "replace"))
3657 known_errorHandler = 2;
3658 else if (!strcmp(errors, "ignore"))
3659 known_errorHandler = 3;
3660 else if (!strcmp(errors, "xmlcharrefreplace"))
3661 known_errorHandler = 4;
3662 else
3663 known_errorHandler = 0;
3664 }
3665 switch (known_errorHandler) {
3666 case 1: /* strict */
3667 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3668 goto onError;
3669 case 2: /* replace */
3670 while (collstart++<collend)
3671 *str++ = '?'; /* fall through */
3672 case 3: /* ignore */
3673 p = collend;
3674 break;
3675 case 4: /* xmlcharrefreplace */
3676 respos = str-PyString_AS_STRING(res);
3677 /* determine replacement size (temporarily (mis)uses p) */
Serhiy Storchakae822b032013-08-06 16:56:26 +03003678 for (p = collstart, repsize = 0; p < collend;) {
3679 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
3680 if (ch < 10)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003681 repsize += 2+1+1;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003682 else if (ch < 100)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003683 repsize += 2+2+1;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003684 else if (ch < 1000)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003685 repsize += 2+3+1;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003686 else if (ch < 10000)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003687 repsize += 2+4+1;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003688 else if (ch < 100000)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003689 repsize += 2+5+1;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003690 else if (ch < 1000000)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003691 repsize += 2+6+1;
3692 else
3693 repsize += 2+7+1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003694 }
3695 requiredsize = respos+repsize+(endp-collend);
3696 if (requiredsize > ressize) {
3697 if (requiredsize<2*ressize)
3698 requiredsize = 2*ressize;
3699 if (_PyString_Resize(&res, requiredsize))
3700 goto onError;
3701 str = PyString_AS_STRING(res) + respos;
3702 ressize = requiredsize;
3703 }
3704 /* generate replacement (temporarily (mis)uses p) */
Serhiy Storchakae822b032013-08-06 16:56:26 +03003705 for (p = collstart; p < collend;) {
3706 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
3707 str += sprintf(str, "&#%d;", (int)ch);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003708 }
3709 p = collend;
3710 break;
3711 default:
3712 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3713 encoding, reason, startp, size, &exc,
3714 collstart-startp, collend-startp, &newpos);
3715 if (repunicode == NULL)
3716 goto onError;
3717 /* need more space? (at least enough for what we have+the
3718 replacement+the rest of the string, so we won't have to
3719 check space for encodable characters) */
3720 respos = str-PyString_AS_STRING(res);
3721 repsize = PyUnicode_GET_SIZE(repunicode);
3722 requiredsize = respos+repsize+(endp-collend);
3723 if (requiredsize > ressize) {
3724 if (requiredsize<2*ressize)
3725 requiredsize = 2*ressize;
3726 if (_PyString_Resize(&res, requiredsize)) {
3727 Py_DECREF(repunicode);
3728 goto onError;
3729 }
3730 str = PyString_AS_STRING(res) + respos;
3731 ressize = requiredsize;
3732 }
3733 /* check if there is anything unencodable in the replacement
3734 and copy it to the output */
3735 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3736 c = *uni2;
3737 if (c >= limit) {
3738 raise_encode_exception(&exc, encoding, startp, size,
3739 unicodepos, unicodepos+1, reason);
3740 Py_DECREF(repunicode);
3741 goto onError;
3742 }
3743 *str = (char)c;
3744 }
3745 p = startp + newpos;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003746 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00003747 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00003748 }
3749 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003750 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003751 respos = str-PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003752 if (respos<ressize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003753 /* If this falls res will be NULL */
3754 _PyString_Resize(&res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003755 Py_XDECREF(errorHandler);
3756 Py_XDECREF(exc);
3757 return res;
3758
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003759 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003760 Py_XDECREF(res);
3761 Py_XDECREF(errorHandler);
3762 Py_XDECREF(exc);
3763 return NULL;
3764}
3765
Guido van Rossumd57fd912000-03-10 22:53:23 +00003766PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003767 Py_ssize_t size,
3768 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003769{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003770 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003771}
3772
3773PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3774{
3775 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003776 PyErr_BadArgument();
3777 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003778 }
3779 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003780 PyUnicode_GET_SIZE(unicode),
3781 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003782}
3783
3784/* --- 7-bit ASCII Codec -------------------------------------------------- */
3785
Guido van Rossumd57fd912000-03-10 22:53:23 +00003786PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003787 Py_ssize_t size,
3788 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003789{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003790 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003791 PyUnicodeObject *v;
3792 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003793 Py_ssize_t startinpos;
3794 Py_ssize_t endinpos;
3795 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003796 const char *e;
3797 PyObject *errorHandler = NULL;
3798 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003799
Guido van Rossumd57fd912000-03-10 22:53:23 +00003800 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003801 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003802 Py_UNICODE r = *(unsigned char*)s;
3803 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003804 }
Tim Petersced69f82003-09-16 20:30:58 +00003805
Guido van Rossumd57fd912000-03-10 22:53:23 +00003806 v = _PyUnicode_New(size);
3807 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003808 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003809 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003810 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003811 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003812 e = s + size;
3813 while (s < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003814 register unsigned char c = (unsigned char)*s;
3815 if (c < 128) {
3816 *p++ = c;
3817 ++s;
3818 }
3819 else {
3820 startinpos = s-starts;
3821 endinpos = startinpos + 1;
3822 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3823 if (unicode_decode_call_errorhandler(
3824 errors, &errorHandler,
3825 "ascii", "ordinal not in range(128)",
3826 starts, size, &startinpos, &endinpos, &exc, &s,
3827 &v, &outpos, &p))
3828 goto onError;
3829 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003830 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003831 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003832 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3833 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003834 Py_XDECREF(errorHandler);
3835 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003836 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003837
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003838 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003839 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003840 Py_XDECREF(errorHandler);
3841 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003842 return NULL;
3843}
3844
Guido van Rossumd57fd912000-03-10 22:53:23 +00003845PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003846 Py_ssize_t size,
3847 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003848{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003849 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003850}
3851
3852PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3853{
3854 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003855 PyErr_BadArgument();
3856 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003857 }
3858 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003859 PyUnicode_GET_SIZE(unicode),
3860 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003861}
3862
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003863#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003864
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003865/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003866
Hirokazu Yamamoto52a34922009-03-21 10:32:52 +00003867#if SIZEOF_INT < SIZEOF_SIZE_T
Martin v. Löwisd8251432006-06-14 05:21:04 +00003868#define NEED_RETRY
3869#endif
3870
3871/* XXX This code is limited to "true" double-byte encodings, as
3872 a) it assumes an incomplete character consists of a single byte, and
3873 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003874 encodings, see IsDBCSLeadByteEx documentation. */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003875
/* Return non-zero when the byte at s[offset] is treated as a DBCS lead
   byte.  The byte itself must satisfy IsDBCSLeadByte; it is then accepted
   if CharPrev cannot step back, if the previous character does not start
   with a lead byte, or if the previous character is two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3886
3887/*
3888 * Decode MBCS string into unicode object. If 'final' is set, converts
3889 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3890 */
3891static int decode_mbcs(PyUnicodeObject **v,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003892 const char *s, /* MBCS string */
3893 int size, /* sizeof MBCS string */
3894 int final)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003895{
3896 Py_UNICODE *p;
3897 Py_ssize_t n = 0;
3898 int usize = 0;
3899
3900 assert(size >= 0);
3901
3902 /* Skip trailing lead-byte unless 'final' is set */
3903 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003904 --size;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003905
3906 /* First get the size of the result */
3907 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003908 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3909 if (usize == 0) {
3910 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3911 return -1;
3912 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003913 }
3914
3915 if (*v == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003916 /* Create unicode object */
3917 *v = _PyUnicode_New(usize);
3918 if (*v == NULL)
3919 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003920 }
3921 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003922 /* Extend unicode object */
3923 n = PyUnicode_GET_SIZE(*v);
3924 if (_PyUnicode_Resize(v, n + usize) < 0)
3925 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003926 }
3927
3928 /* Do the conversion */
3929 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003930 p = PyUnicode_AS_UNICODE(*v) + n;
3931 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3932 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3933 return -1;
3934 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003935 }
3936
3937 return size;
3938}
3939
3940PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003941 Py_ssize_t size,
3942 const char *errors,
3943 Py_ssize_t *consumed)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003944{
3945 PyUnicodeObject *v = NULL;
3946 int done;
3947
3948 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003949 *consumed = 0;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003950
3951#ifdef NEED_RETRY
3952 retry:
3953 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003954 done = decode_mbcs(&v, s, INT_MAX, 0);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003955 else
3956#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003957 done = decode_mbcs(&v, s, (int)size, !consumed);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003958
3959 if (done < 0) {
3960 Py_XDECREF(v);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003961 return NULL;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003962 }
3963
3964 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003965 *consumed += done;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003966
3967#ifdef NEED_RETRY
3968 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003969 s += done;
3970 size -= done;
3971 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003972 }
3973#endif
3974
3975 return (PyObject *)v;
3976}
3977
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003978PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003979 Py_ssize_t size,
3980 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003981{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003982 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3983}
3984
3985/*
3986 * Convert unicode into string object (MBCS).
3987 * Returns 0 if succeed, -1 otherwise.
3988 */
3989static int encode_mbcs(PyObject **repr,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003990 const Py_UNICODE *p, /* unicode */
3991 int size) /* size of unicode */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003992{
3993 int mbcssize = 0;
3994 Py_ssize_t n = 0;
3995
3996 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003997
3998 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003999 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004000 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4001 if (mbcssize == 0) {
4002 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4003 return -1;
4004 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004005 }
4006
Martin v. Löwisd8251432006-06-14 05:21:04 +00004007 if (*repr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004008 /* Create string object */
4009 *repr = PyString_FromStringAndSize(NULL, mbcssize);
4010 if (*repr == NULL)
4011 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004012 }
4013 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004014 /* Extend string object */
4015 n = PyString_Size(*repr);
4016 if (_PyString_Resize(repr, n + mbcssize) < 0)
4017 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004018 }
4019
4020 /* Do the conversion */
4021 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004022 char *s = PyString_AS_STRING(*repr) + n;
4023 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4024 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4025 return -1;
4026 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00004027 }
4028
4029 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004030}
4031
4032PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004033 Py_ssize_t size,
4034 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004035{
Martin v. Löwisd8251432006-06-14 05:21:04 +00004036 PyObject *repr = NULL;
4037 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004038
Martin v. Löwisd8251432006-06-14 05:21:04 +00004039#ifdef NEED_RETRY
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004040 retry:
Martin v. Löwisd8251432006-06-14 05:21:04 +00004041 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004042 ret = encode_mbcs(&repr, p, INT_MAX);
Martin v. Löwisd8251432006-06-14 05:21:04 +00004043 else
4044#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004045 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004046
Martin v. Löwisd8251432006-06-14 05:21:04 +00004047 if (ret < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004048 Py_XDECREF(repr);
4049 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004050 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00004051
4052#ifdef NEED_RETRY
4053 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004054 p += INT_MAX;
4055 size -= INT_MAX;
4056 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004057 }
4058#endif
4059
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004060 return repr;
4061}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004062
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004063PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4064{
4065 if (!PyUnicode_Check(unicode)) {
4066 PyErr_BadArgument();
4067 return NULL;
4068 }
4069 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004070 PyUnicode_GET_SIZE(unicode),
4071 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004072}
4073
Martin v. Löwisd8251432006-06-14 05:21:04 +00004074#undef NEED_RETRY
4075
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004076#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004077
Guido van Rossumd57fd912000-03-10 22:53:23 +00004078/* --- Character Mapping Codec -------------------------------------------- */
4079
Guido van Rossumd57fd912000-03-10 22:53:23 +00004080PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004081 Py_ssize_t size,
4082 PyObject *mapping,
4083 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004084{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004085 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004086 Py_ssize_t startinpos;
4087 Py_ssize_t endinpos;
4088 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004089 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004090 PyUnicodeObject *v;
4091 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004092 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004093 PyObject *errorHandler = NULL;
4094 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004095 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004096 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004097
Guido van Rossumd57fd912000-03-10 22:53:23 +00004098 /* Default to Latin-1 */
4099 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004100 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004101
4102 v = _PyUnicode_New(size);
4103 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004104 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004105 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004106 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004107 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004108 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004109 if (PyUnicode_CheckExact(mapping)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004110 mapstring = PyUnicode_AS_UNICODE(mapping);
4111 maplen = PyUnicode_GET_SIZE(mapping);
4112 while (s < e) {
4113 unsigned char ch = *s;
4114 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004115
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004116 if (ch < maplen)
4117 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004118
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004119 if (x == 0xfffe) {
4120 /* undefined mapping */
4121 outpos = p-PyUnicode_AS_UNICODE(v);
4122 startinpos = s-starts;
4123 endinpos = startinpos+1;
4124 if (unicode_decode_call_errorhandler(
4125 errors, &errorHandler,
4126 "charmap", "character maps to <undefined>",
4127 starts, size, &startinpos, &endinpos, &exc, &s,
4128 &v, &outpos, &p)) {
4129 goto onError;
4130 }
4131 continue;
4132 }
4133 *p++ = x;
4134 ++s;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004135 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004136 }
4137 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004138 while (s < e) {
4139 unsigned char ch = *s;
4140 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004141
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004142 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4143 w = PyInt_FromLong((long)ch);
4144 if (w == NULL)
4145 goto onError;
4146 x = PyObject_GetItem(mapping, w);
4147 Py_DECREF(w);
4148 if (x == NULL) {
4149 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4150 /* No mapping found means: mapping is undefined. */
4151 PyErr_Clear();
Serhiy Storchaka95997452013-01-15 14:42:59 +02004152 goto Undefined;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004153 } else
4154 goto onError;
4155 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004156
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004157 /* Apply mapping */
Serhiy Storchaka95997452013-01-15 14:42:59 +02004158 if (x == Py_None)
4159 goto Undefined;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004160 if (PyInt_Check(x)) {
4161 long value = PyInt_AS_LONG(x);
Serhiy Storchaka95997452013-01-15 14:42:59 +02004162 if (value == 0xFFFE)
4163 goto Undefined;
Antoine Pitroue3ae3212012-11-17 21:14:58 +01004164 if (value < 0 || value > 0x10FFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004165 PyErr_SetString(PyExc_TypeError,
Antoine Pitroue3ae3212012-11-17 21:14:58 +01004166 "character mapping must be in range(0x110000)");
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004167 Py_DECREF(x);
4168 goto onError;
4169 }
Antoine Pitroue3ae3212012-11-17 21:14:58 +01004170
4171#ifndef Py_UNICODE_WIDE
4172 if (value > 0xFFFF) {
4173 /* see the code for 1-n mapping below */
4174 if (extrachars < 2) {
4175 /* resize first */
4176 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4177 Py_ssize_t needed = 10 - extrachars;
4178 extrachars += needed;
4179 /* XXX overflow detection missing */
4180 if (_PyUnicode_Resize(&v,
4181 PyUnicode_GET_SIZE(v) + needed) < 0) {
4182 Py_DECREF(x);
4183 goto onError;
4184 }
4185 p = PyUnicode_AS_UNICODE(v) + oldpos;
4186 }
4187 value -= 0x10000;
4188 *p++ = 0xD800 | (value >> 10);
4189 *p++ = 0xDC00 | (value & 0x3FF);
4190 extrachars -= 2;
4191 }
4192 else
4193#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004194 *p++ = (Py_UNICODE)value;
4195 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004196 else if (PyUnicode_Check(x)) {
4197 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004198
Serhiy Storchaka95997452013-01-15 14:42:59 +02004199 if (targetsize == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004200 /* 1-1 mapping */
Serhiy Storchaka95997452013-01-15 14:42:59 +02004201 Py_UNICODE value = *PyUnicode_AS_UNICODE(x);
4202 if (value == 0xFFFE)
4203 goto Undefined;
4204 *p++ = value;
4205 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004206 else if (targetsize > 1) {
4207 /* 1-n mapping */
4208 if (targetsize > extrachars) {
4209 /* resize first */
4210 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4211 Py_ssize_t needed = (targetsize - extrachars) + \
4212 (targetsize << 2);
4213 extrachars += needed;
4214 /* XXX overflow detection missing */
4215 if (_PyUnicode_Resize(&v,
4216 PyUnicode_GET_SIZE(v) + needed) < 0) {
4217 Py_DECREF(x);
4218 goto onError;
4219 }
4220 p = PyUnicode_AS_UNICODE(v) + oldpos;
4221 }
4222 Py_UNICODE_COPY(p,
4223 PyUnicode_AS_UNICODE(x),
4224 targetsize);
4225 p += targetsize;
4226 extrachars -= targetsize;
4227 }
4228 /* 1-0 mapping: skip the character */
4229 }
4230 else {
4231 /* wrong return value */
4232 PyErr_SetString(PyExc_TypeError,
4233 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004234 Py_DECREF(x);
4235 goto onError;
4236 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004237 Py_DECREF(x);
4238 ++s;
Serhiy Storchaka95997452013-01-15 14:42:59 +02004239 continue;
4240Undefined:
4241 /* undefined mapping */
4242 Py_XDECREF(x);
4243 outpos = p-PyUnicode_AS_UNICODE(v);
4244 startinpos = s-starts;
4245 endinpos = startinpos+1;
4246 if (unicode_decode_call_errorhandler(
4247 errors, &errorHandler,
4248 "charmap", "character maps to <undefined>",
4249 starts, size, &startinpos, &endinpos, &exc, &s,
4250 &v, &outpos, &p)) {
4251 goto onError;
4252 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004253 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004254 }
4255 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004256 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4257 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004258 Py_XDECREF(errorHandler);
4259 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004260 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004261
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004262 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004263 Py_XDECREF(errorHandler);
4264 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004265 Py_XDECREF(v);
4266 return NULL;
4267}
4268
Martin v. Löwis3f767792006-06-04 19:36:28 +00004269/* Charmap encoding: the lookup table */
4270
4271struct encoding_map{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004272 PyObject_HEAD
4273 unsigned char level1[32];
4274 int count2, count3;
4275 unsigned char level23[1];
Martin v. Löwis3f767792006-06-04 19:36:28 +00004276};
4277
4278static PyObject*
4279encoding_map_size(PyObject *obj, PyObject* args)
4280{
4281 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004282 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Martin v. Löwis3f767792006-06-04 19:36:28 +00004283 128*map->count3);
4284}
4285
4286static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004287 {"size", encoding_map_size, METH_NOARGS,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004288 PyDoc_STR("Return the size (in bytes) of this object") },
4289 { 0 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004290};
4291
4292static void
4293encoding_map_dealloc(PyObject* o)
4294{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004295 PyObject_FREE(o);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004296}
4297
4298static PyTypeObject EncodingMapType = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004299 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004300 "EncodingMap", /*tp_name*/
4301 sizeof(struct encoding_map), /*tp_basicsize*/
4302 0, /*tp_itemsize*/
4303 /* methods */
4304 encoding_map_dealloc, /*tp_dealloc*/
4305 0, /*tp_print*/
4306 0, /*tp_getattr*/
4307 0, /*tp_setattr*/
4308 0, /*tp_compare*/
4309 0, /*tp_repr*/
4310 0, /*tp_as_number*/
4311 0, /*tp_as_sequence*/
4312 0, /*tp_as_mapping*/
4313 0, /*tp_hash*/
4314 0, /*tp_call*/
4315 0, /*tp_str*/
4316 0, /*tp_getattro*/
4317 0, /*tp_setattro*/
4318 0, /*tp_as_buffer*/
4319 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4320 0, /*tp_doc*/
4321 0, /*tp_traverse*/
4322 0, /*tp_clear*/
4323 0, /*tp_richcompare*/
4324 0, /*tp_weaklistoffset*/
4325 0, /*tp_iter*/
4326 0, /*tp_iternext*/
4327 encoding_map_methods, /*tp_methods*/
4328 0, /*tp_members*/
4329 0, /*tp_getset*/
4330 0, /*tp_base*/
4331 0, /*tp_dict*/
4332 0, /*tp_descr_get*/
4333 0, /*tp_descr_set*/
4334 0, /*tp_dictoffset*/
4335 0, /*tp_init*/
4336 0, /*tp_alloc*/
4337 0, /*tp_new*/
4338 0, /*tp_free*/
4339 0, /*tp_is_gc*/
Martin v. Löwis3f767792006-06-04 19:36:28 +00004340};
4341
4342PyObject*
4343PyUnicode_BuildEncodingMap(PyObject* string)
4344{
4345 Py_UNICODE *decode;
4346 PyObject *result;
4347 struct encoding_map *mresult;
4348 int i;
4349 int need_dict = 0;
4350 unsigned char level1[32];
4351 unsigned char level2[512];
4352 unsigned char *mlevel1, *mlevel2, *mlevel3;
4353 int count2 = 0, count3 = 0;
4354
4355 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4356 PyErr_BadArgument();
4357 return NULL;
4358 }
4359 decode = PyUnicode_AS_UNICODE(string);
4360 memset(level1, 0xFF, sizeof level1);
4361 memset(level2, 0xFF, sizeof level2);
4362
4363 /* If there isn't a one-to-one mapping of NULL to \0,
4364 or if there are non-BMP characters, we need to use
4365 a mapping dictionary. */
4366 if (decode[0] != 0)
4367 need_dict = 1;
4368 for (i = 1; i < 256; i++) {
4369 int l1, l2;
4370 if (decode[i] == 0
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004371#ifdef Py_UNICODE_WIDE
Martin v. Löwis3f767792006-06-04 19:36:28 +00004372 || decode[i] > 0xFFFF
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004373#endif
4374 ) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004375 need_dict = 1;
4376 break;
4377 }
4378 if (decode[i] == 0xFFFE)
4379 /* unmapped character */
4380 continue;
4381 l1 = decode[i] >> 11;
4382 l2 = decode[i] >> 7;
4383 if (level1[l1] == 0xFF)
4384 level1[l1] = count2++;
4385 if (level2[l2] == 0xFF)
Benjamin Peterson857ce152009-01-31 16:29:18 +00004386 level2[l2] = count3++;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004387 }
4388
4389 if (count2 >= 0xFF || count3 >= 0xFF)
4390 need_dict = 1;
4391
4392 if (need_dict) {
4393 PyObject *result = PyDict_New();
4394 PyObject *key, *value;
4395 if (!result)
4396 return NULL;
4397 for (i = 0; i < 256; i++) {
Brett Cannona7f13ee2010-05-04 01:16:51 +00004398 value = NULL;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004399 key = PyInt_FromLong(decode[i]);
4400 value = PyInt_FromLong(i);
4401 if (!key || !value)
4402 goto failed1;
4403 if (PyDict_SetItem(result, key, value) == -1)
4404 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00004405 Py_DECREF(key);
4406 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004407 }
4408 return result;
4409 failed1:
4410 Py_XDECREF(key);
4411 Py_XDECREF(value);
4412 Py_DECREF(result);
4413 return NULL;
4414 }
4415
4416 /* Create a three-level trie */
4417 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4418 16*count2 + 128*count3 - 1);
4419 if (!result)
4420 return PyErr_NoMemory();
4421 PyObject_Init(result, &EncodingMapType);
4422 mresult = (struct encoding_map*)result;
4423 mresult->count2 = count2;
4424 mresult->count3 = count3;
4425 mlevel1 = mresult->level1;
4426 mlevel2 = mresult->level23;
4427 mlevel3 = mresult->level23 + 16*count2;
4428 memcpy(mlevel1, level1, 32);
4429 memset(mlevel2, 0xFF, 16*count2);
4430 memset(mlevel3, 0, 128*count3);
4431 count3 = 0;
4432 for (i = 1; i < 256; i++) {
4433 int o1, o2, o3, i2, i3;
4434 if (decode[i] == 0xFFFE)
4435 /* unmapped character */
4436 continue;
4437 o1 = decode[i]>>11;
4438 o2 = (decode[i]>>7) & 0xF;
4439 i2 = 16*mlevel1[o1] + o2;
4440 if (mlevel2[i2] == 0xFF)
4441 mlevel2[i2] = count3++;
4442 o3 = decode[i] & 0x7F;
4443 i3 = 128*mlevel2[i2] + o3;
4444 mlevel3[i3] = i;
4445 }
4446 return result;
4447}
4448
4449static int
4450encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4451{
4452 struct encoding_map *map = (struct encoding_map*)mapping;
4453 int l1 = c>>11;
4454 int l2 = (c>>7) & 0xF;
4455 int l3 = c & 0x7F;
4456 int i;
4457
4458#ifdef Py_UNICODE_WIDE
4459 if (c > 0xFFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004460 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004461 }
4462#endif
4463 if (c == 0)
4464 return 0;
4465 /* level 1*/
4466 i = map->level1[l1];
4467 if (i == 0xFF) {
4468 return -1;
4469 }
4470 /* level 2*/
4471 i = map->level23[16*i+l2];
4472 if (i == 0xFF) {
4473 return -1;
4474 }
4475 /* level 3 */
4476 i = map->level23[16*map->count2 + 128*i + l3];
4477 if (i == 0) {
4478 return -1;
4479 }
4480 return i;
4481}
4482
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004483/* Lookup the character ch in the mapping. If the character
4484 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004485 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004486static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004487{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004488 PyObject *w = PyInt_FromLong((long)c);
4489 PyObject *x;
4490
4491 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004492 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004493 x = PyObject_GetItem(mapping, w);
4494 Py_DECREF(w);
4495 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004496 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4497 /* No mapping found means: mapping is undefined. */
4498 PyErr_Clear();
4499 x = Py_None;
4500 Py_INCREF(x);
4501 return x;
4502 } else
4503 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004504 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004505 else if (x == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004506 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004507 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004508 long value = PyInt_AS_LONG(x);
4509 if (value < 0 || value > 255) {
4510 PyErr_SetString(PyExc_TypeError,
4511 "character mapping must be in range(256)");
4512 Py_DECREF(x);
4513 return NULL;
4514 }
4515 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004516 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004517 else if (PyString_Check(x))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004518 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004519 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004520 /* wrong return value */
4521 PyErr_SetString(PyExc_TypeError,
4522 "character mapping must return integer, None or str");
4523 Py_DECREF(x);
4524 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004525 }
4526}
4527
Martin v. Löwis3f767792006-06-04 19:36:28 +00004528static int
4529charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4530{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004531 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4532 /* exponentially overallocate to minimize reallocations */
4533 if (requiredsize < 2*outsize)
4534 requiredsize = 2*outsize;
4535 if (_PyString_Resize(outobj, requiredsize)) {
4536 return 0;
4537 }
4538 return 1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004539}
4540
/* Outcome of trying to encode one character through a charmap
   (see charmapencode_output). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* the lookup or the output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004544/* lookup the character, put the result in the output string and adjust
4545 various state variables. Reallocate the output string if not enough
4546 space is available. Return a new reference to the object that
4547 was put in the output buffer, or Py_None, if the mapping was undefined
4548 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004549 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004550static
Martin v. Löwis3f767792006-06-04 19:36:28 +00004551charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004552 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004553{
Martin v. Löwis3f767792006-06-04 19:36:28 +00004554 PyObject *rep;
4555 char *outstart;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004556 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004557
Christian Heimese93237d2007-12-19 02:37:44 +00004558 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004559 int res = encoding_map_lookup(c, mapping);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004560 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004561 if (res == -1)
4562 return enc_FAILED;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004563 if (outsize<requiredsize)
4564 if (!charmapencode_resize(outobj, outpos, requiredsize))
4565 return enc_EXCEPTION;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004566 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004567 outstart[(*outpos)++] = (char)res;
4568 return enc_SUCCESS;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004569 }
4570
4571 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004572 if (rep==NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004573 return enc_EXCEPTION;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004574 else if (rep==Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004575 Py_DECREF(rep);
4576 return enc_FAILED;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004577 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004578 if (PyInt_Check(rep)) {
4579 Py_ssize_t requiredsize = *outpos+1;
4580 if (outsize<requiredsize)
4581 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4582 Py_DECREF(rep);
4583 return enc_EXCEPTION;
4584 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004585 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004586 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004587 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004588 else {
4589 const char *repchars = PyString_AS_STRING(rep);
4590 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4591 Py_ssize_t requiredsize = *outpos+repsize;
4592 if (outsize<requiredsize)
4593 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4594 Py_DECREF(rep);
4595 return enc_EXCEPTION;
4596 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004597 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004598 memcpy(outstart + *outpos, repchars, repsize);
4599 *outpos += repsize;
4600 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004601 }
Georg Brandl9f167602006-06-04 21:46:16 +00004602 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004603 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004604}
4605
4606/* handle an error in PyUnicode_EncodeCharmap
4607 Return 0 on success, -1 on error */
4608static
4609int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004610 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004611 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004612 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004613 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004614{
4615 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004616 Py_ssize_t repsize;
4617 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004618 Py_UNICODE *uni2;
4619 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004620 Py_ssize_t collstartpos = *inpos;
4621 Py_ssize_t collendpos = *inpos+1;
4622 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004623 char *encoding = "charmap";
4624 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00004625 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004626
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004627 /* find all unencodable characters */
4628 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004629 PyObject *rep;
Christian Heimese93237d2007-12-19 02:37:44 +00004630 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004631 int res = encoding_map_lookup(p[collendpos], mapping);
4632 if (res != -1)
4633 break;
4634 ++collendpos;
4635 continue;
4636 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004637
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004638 rep = charmapencode_lookup(p[collendpos], mapping);
4639 if (rep==NULL)
4640 return -1;
4641 else if (rep!=Py_None) {
4642 Py_DECREF(rep);
4643 break;
4644 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004645 Py_DECREF(rep);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004646 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004647 }
4648 /* cache callback name lookup
4649 * (if not done yet, i.e. it's the first error) */
4650 if (*known_errorHandler==-1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004651 if ((errors==NULL) || (!strcmp(errors, "strict")))
4652 *known_errorHandler = 1;
4653 else if (!strcmp(errors, "replace"))
4654 *known_errorHandler = 2;
4655 else if (!strcmp(errors, "ignore"))
4656 *known_errorHandler = 3;
4657 else if (!strcmp(errors, "xmlcharrefreplace"))
4658 *known_errorHandler = 4;
4659 else
4660 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004661 }
4662 switch (*known_errorHandler) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004663 case 1: /* strict */
4664 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4665 return -1;
4666 case 2: /* replace */
4667 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004668 x = charmapencode_output('?', mapping, res, respos);
4669 if (x==enc_EXCEPTION) {
4670 return -1;
4671 }
4672 else if (x==enc_FAILED) {
4673 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4674 return -1;
4675 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004676 }
4677 /* fall through */
4678 case 3: /* ignore */
4679 *inpos = collendpos;
4680 break;
4681 case 4: /* xmlcharrefreplace */
Serhiy Storchakae822b032013-08-06 16:56:26 +03004682 /* generate replacement */
4683 for (collpos = collstartpos; collpos < collendpos;) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004684 char buffer[2+29+1+1];
4685 char *cp;
Serhiy Storchakae822b032013-08-06 16:56:26 +03004686 Py_UCS4 ch = p[collpos++];
4687#ifndef Py_UNICODE_WIDE
4688 if ((0xD800 <= ch && ch <= 0xDBFF) &&
4689 (collpos < collendpos) &&
4690 (0xDC00 <= p[collpos] && p[collpos] <= 0xDFFF)) {
4691 ch = ((((ch & 0x03FF) << 10) |
4692 ((Py_UCS4)p[collpos++] & 0x03FF)) + 0x10000);
4693 }
4694#endif
4695 sprintf(buffer, "&#%d;", (int)ch);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004696 for (cp = buffer; *cp; ++cp) {
4697 x = charmapencode_output(*cp, mapping, res, respos);
4698 if (x==enc_EXCEPTION)
4699 return -1;
4700 else if (x==enc_FAILED) {
4701 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4702 return -1;
4703 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004704 }
4705 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004706 *inpos = collendpos;
4707 break;
4708 default:
4709 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004710 encoding, reason, p, size, exceptionObject,
4711 collstartpos, collendpos, &newpos);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004712 if (repunicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004713 return -1;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004714 /* generate replacement */
4715 repsize = PyUnicode_GET_SIZE(repunicode);
4716 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004717 x = charmapencode_output(*uni2, mapping, res, respos);
4718 if (x==enc_EXCEPTION) {
4719 return -1;
4720 }
4721 else if (x==enc_FAILED) {
4722 Py_DECREF(repunicode);
4723 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4724 return -1;
4725 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004726 }
4727 *inpos = newpos;
4728 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004729 }
4730 return 0;
4731}
4732
Guido van Rossumd57fd912000-03-10 22:53:23 +00004733PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004734 Py_ssize_t size,
4735 PyObject *mapping,
4736 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004737{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004738 /* output object */
4739 PyObject *res = NULL;
4740 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004741 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004742 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004743 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004744 PyObject *errorHandler = NULL;
4745 PyObject *exc = NULL;
4746 /* the following variable is used for caching string comparisons
4747 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4748 * 3=ignore, 4=xmlcharrefreplace */
4749 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004750
4751 /* Default to Latin-1 */
4752 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004753 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004754
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004755 /* allocate enough for a simple encoding without
4756 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004757 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004758 if (res == NULL)
4759 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004760 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004761 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004762
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004763 while (inpos<size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004764 /* try to encode it */
4765 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4766 if (x==enc_EXCEPTION) /* error */
4767 goto onError;
4768 if (x==enc_FAILED) { /* unencodable character */
4769 if (charmap_encoding_error(p, size, &inpos, mapping,
4770 &exc,
4771 &known_errorHandler, &errorHandler, errors,
4772 &res, &respos)) {
4773 goto onError;
4774 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004775 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004776 else
4777 /* done with this character => adjust input position */
4778 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004779 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004780
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004781 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004782 if (respos<PyString_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004783 if (_PyString_Resize(&res, respos))
4784 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004785 }
4786 Py_XDECREF(exc);
4787 Py_XDECREF(errorHandler);
4788 return res;
4789
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004790 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004791 Py_XDECREF(res);
4792 Py_XDECREF(exc);
4793 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004794 return NULL;
4795}
4796
4797PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004798 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004799{
4800 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004801 PyErr_BadArgument();
4802 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004803 }
4804 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004805 PyUnicode_GET_SIZE(unicode),
4806 mapping,
4807 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004808}
4809
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004810/* create or adjust a UnicodeTranslateError */
4811static void make_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004812 const Py_UNICODE *unicode, Py_ssize_t size,
4813 Py_ssize_t startpos, Py_ssize_t endpos,
4814 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004815{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004816 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004817 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004818 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004819 }
4820 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004821 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4822 goto onError;
4823 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4824 goto onError;
4825 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4826 goto onError;
4827 return;
4828 onError:
4829 Py_DECREF(*exceptionObject);
4830 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004831 }
4832}
4833
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004834/* raises a UnicodeTranslateError */
4835static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004836 const Py_UNICODE *unicode, Py_ssize_t size,
4837 Py_ssize_t startpos, Py_ssize_t endpos,
4838 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004839{
4840 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004841 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004842 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004843 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004844}
4845
4846/* error handling callback helper:
4847 build arguments, call the callback and check the arguments,
4848 put the result into newpos and return the replacement string, which
4849 has to be freed by the caller */
4850static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004851 PyObject **errorHandler,
4852 const char *reason,
4853 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4854 Py_ssize_t startpos, Py_ssize_t endpos,
4855 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004856{
Martin v. Löwis412fb672006-04-13 06:34:32 +00004857 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004858
Martin v. Löwis412fb672006-04-13 06:34:32 +00004859 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004860 PyObject *restuple;
4861 PyObject *resunicode;
4862
4863 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004864 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004865 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004866 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004867 }
4868
4869 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004870 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004871 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004872 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004873
4874 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004875 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004876 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004877 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004878 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00004879 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004880 Py_DECREF(restuple);
4881 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004882 }
4883 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004884 &resunicode, &i_newpos)) {
4885 Py_DECREF(restuple);
4886 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004887 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004888 if (i_newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004889 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004890 else
4891 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004892 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004893 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4894 Py_DECREF(restuple);
4895 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004896 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004897 Py_INCREF(resunicode);
4898 Py_DECREF(restuple);
4899 return resunicode;
4900}
4901
4902/* Lookup the character ch in the mapping and put the result in result,
4903 which must be decrefed by the caller.
4904 Return 0 on success, -1 on error */
4905static
4906int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4907{
4908 PyObject *w = PyInt_FromLong((long)c);
4909 PyObject *x;
4910
4911 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004912 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004913 x = PyObject_GetItem(mapping, w);
4914 Py_DECREF(w);
4915 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004916 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4917 /* No mapping found means: use 1:1 mapping. */
4918 PyErr_Clear();
4919 *result = NULL;
4920 return 0;
4921 } else
4922 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004923 }
4924 else if (x == Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004925 *result = x;
4926 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004927 }
4928 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004929 long value = PyInt_AS_LONG(x);
4930 long max = PyUnicode_GetMax();
4931 if (value < 0 || value > max) {
4932 PyErr_Format(PyExc_TypeError,
4933 "character mapping must be in range(0x%lx)", max+1);
4934 Py_DECREF(x);
4935 return -1;
4936 }
4937 *result = x;
4938 return 0;
4939 }
4940 else if (PyUnicode_Check(x)) {
4941 *result = x;
4942 return 0;
4943 }
4944 else {
4945 /* wrong return value */
4946 PyErr_SetString(PyExc_TypeError,
4947 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004948 Py_DECREF(x);
4949 return -1;
4950 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004951}
4952/* ensure that *outobj is at least requiredsize characters long,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004953 if not reallocate and adjust various state variables.
4954 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004955static
Walter Dörwald4894c302003-10-24 14:25:28 +00004956int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004957 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004958{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004959 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004960 if (requiredsize > oldsize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004961 /* remember old output position */
4962 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4963 /* exponentially overallocate to minimize reallocations */
4964 if (requiredsize < 2 * oldsize)
4965 requiredsize = 2 * oldsize;
4966 if (PyUnicode_Resize(outobj, requiredsize) < 0)
4967 return -1;
4968 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004969 }
4970 return 0;
4971}
4972/* lookup the character, put the result in the output string and adjust
4973 various state variables. Return a new reference to the object that
4974 was put in the output buffer in *result, or Py_None, if the mapping was
4975 undefined (in which case no character was written).
4976 The called must decref result.
4977 Return 0 on success, -1 on error. */
4978static
Walter Dörwald4894c302003-10-24 14:25:28 +00004979int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004980 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4981 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004982{
Walter Dörwald4894c302003-10-24 14:25:28 +00004983 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004984 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004985 if (*res==NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004986 /* not found => default to 1:1 mapping */
4987 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004988 }
4989 else if (*res==Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004990 ;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004991 else if (PyInt_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004992 /* no overflow check, because we know that the space is enough */
4993 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004994 }
4995 else if (PyUnicode_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004996 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4997 if (repsize==1) {
4998 /* no overflow check, because we know that the space is enough */
4999 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5000 }
5001 else if (repsize!=0) {
5002 /* more than one character */
5003 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5004 (insize - (curinp-startinp)) +
5005 repsize - 1;
5006 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5007 return -1;
5008 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5009 *outp += repsize;
5010 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005011 }
5012 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005013 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005014 return 0;
5015}
5016
5017PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005018 Py_ssize_t size,
5019 PyObject *mapping,
5020 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005021{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005022 /* output object */
5023 PyObject *res = NULL;
5024 /* pointers to the beginning and end+1 of input */
5025 const Py_UNICODE *startp = p;
5026 const Py_UNICODE *endp = p + size;
5027 /* pointer into the output */
5028 Py_UNICODE *str;
5029 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005030 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005031 char *reason = "character maps to <undefined>";
5032 PyObject *errorHandler = NULL;
5033 PyObject *exc = NULL;
5034 /* the following variable is used for caching string comparisons
5035 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5036 * 3=ignore, 4=xmlcharrefreplace */
5037 int known_errorHandler = -1;
5038
Guido van Rossumd57fd912000-03-10 22:53:23 +00005039 if (mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005040 PyErr_BadArgument();
5041 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005042 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005043
5044 /* allocate enough for a simple 1:1 translation without
5045 replacements, if we need more, we'll resize */
5046 res = PyUnicode_FromUnicode(NULL, size);
5047 if (res == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005048 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005049 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005050 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005051 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005052
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005053 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005054 /* try to encode it */
5055 PyObject *x = NULL;
5056 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5057 Py_XDECREF(x);
5058 goto onError;
5059 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005060 Py_XDECREF(x);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005061 if (x!=Py_None) /* it worked => adjust input pointer */
5062 ++p;
5063 else { /* untranslatable character */
5064 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5065 Py_ssize_t repsize;
5066 Py_ssize_t newpos;
5067 Py_UNICODE *uni2;
5068 /* startpos for collecting untranslatable chars */
5069 const Py_UNICODE *collstart = p;
5070 const Py_UNICODE *collend = p+1;
5071 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005072
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005073 /* find all untranslatable characters */
5074 while (collend < endp) {
5075 if (charmaptranslate_lookup(*collend, mapping, &x))
5076 goto onError;
5077 Py_XDECREF(x);
5078 if (x!=Py_None)
5079 break;
5080 ++collend;
5081 }
5082 /* cache callback name lookup
5083 * (if not done yet, i.e. it's the first error) */
5084 if (known_errorHandler==-1) {
5085 if ((errors==NULL) || (!strcmp(errors, "strict")))
5086 known_errorHandler = 1;
5087 else if (!strcmp(errors, "replace"))
5088 known_errorHandler = 2;
5089 else if (!strcmp(errors, "ignore"))
5090 known_errorHandler = 3;
5091 else if (!strcmp(errors, "xmlcharrefreplace"))
5092 known_errorHandler = 4;
5093 else
5094 known_errorHandler = 0;
5095 }
5096 switch (known_errorHandler) {
5097 case 1: /* strict */
5098 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson857ce152009-01-31 16:29:18 +00005099 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005100 case 2: /* replace */
5101 /* No need to check for space, this is a 1:1 replacement */
5102 for (coll = collstart; coll<collend; ++coll)
5103 *str++ = '?';
5104 /* fall through */
5105 case 3: /* ignore */
5106 p = collend;
5107 break;
5108 case 4: /* xmlcharrefreplace */
5109 /* generate replacement (temporarily (mis)uses p) */
Serhiy Storchakae822b032013-08-06 16:56:26 +03005110 for (p = collstart; p < collend;) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005111 char buffer[2+29+1+1];
5112 char *cp;
Serhiy Storchakae822b032013-08-06 16:56:26 +03005113 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
5114 sprintf(buffer, "&#%d;", (int)ch);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005115 if (charmaptranslate_makespace(&res, &str,
5116 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5117 goto onError;
5118 for (cp = buffer; *cp; ++cp)
5119 *str++ = *cp;
5120 }
5121 p = collend;
5122 break;
5123 default:
5124 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5125 reason, startp, size, &exc,
5126 collstart-startp, collend-startp, &newpos);
5127 if (repunicode == NULL)
5128 goto onError;
5129 /* generate replacement */
5130 repsize = PyUnicode_GET_SIZE(repunicode);
5131 if (charmaptranslate_makespace(&res, &str,
5132 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5133 Py_DECREF(repunicode);
5134 goto onError;
5135 }
5136 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5137 *str++ = *uni2;
5138 p = startp + newpos;
5139 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00005140 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005141 }
5142 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005143 /* Resize if we allocated to much */
5144 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005145 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005146 if (PyUnicode_Resize(&res, respos) < 0)
5147 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005148 }
5149 Py_XDECREF(exc);
5150 Py_XDECREF(errorHandler);
5151 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005152
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005153 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005154 Py_XDECREF(res);
5155 Py_XDECREF(exc);
5156 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005157 return NULL;
5158}
5159
5160PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005161 PyObject *mapping,
5162 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005163{
5164 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005165
Guido van Rossumd57fd912000-03-10 22:53:23 +00005166 str = PyUnicode_FromObject(str);
5167 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005168 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005169 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005170 PyUnicode_GET_SIZE(str),
5171 mapping,
5172 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005173 Py_DECREF(str);
5174 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005175
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005176 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005177 Py_XDECREF(str);
5178 return NULL;
5179}
Tim Petersced69f82003-09-16 20:30:58 +00005180
Guido van Rossum9e896b32000-04-05 20:11:21 +00005181/* --- Decimal Encoder ---------------------------------------------------- */
5182
5183int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005184 Py_ssize_t length,
5185 char *output,
5186 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005187{
5188 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005189 PyObject *errorHandler = NULL;
5190 PyObject *exc = NULL;
5191 const char *encoding = "decimal";
5192 const char *reason = "invalid decimal Unicode string";
5193 /* the following variable is used for caching string comparisons
5194 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5195 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005196
5197 if (output == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005198 PyErr_BadArgument();
5199 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005200 }
5201
5202 p = s;
5203 end = s + length;
5204 while (p < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005205 register Py_UNICODE ch = *p;
5206 int decimal;
5207 PyObject *repunicode;
5208 Py_ssize_t repsize;
5209 Py_ssize_t newpos;
5210 Py_UNICODE *uni2;
5211 Py_UNICODE *collstart;
5212 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005213
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005214 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005215 *output++ = ' ';
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005216 ++p;
5217 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005218 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005219 decimal = Py_UNICODE_TODECIMAL(ch);
5220 if (decimal >= 0) {
5221 *output++ = '0' + decimal;
5222 ++p;
5223 continue;
5224 }
5225 if (0 < ch && ch < 256) {
5226 *output++ = (char)ch;
5227 ++p;
5228 continue;
5229 }
5230 /* All other characters are considered unencodable */
5231 collstart = p;
Victor Stinner975134e2011-11-22 01:54:19 +01005232 for (collend = p+1; collend < end; collend++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005233 if ((0 < *collend && *collend < 256) ||
Victor Stinner975134e2011-11-22 01:54:19 +01005234 Py_UNICODE_ISSPACE(*collend) ||
5235 0 <= Py_UNICODE_TODECIMAL(*collend))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005236 break;
5237 }
5238 /* cache callback name lookup
5239 * (if not done yet, i.e. it's the first error) */
5240 if (known_errorHandler==-1) {
5241 if ((errors==NULL) || (!strcmp(errors, "strict")))
5242 known_errorHandler = 1;
5243 else if (!strcmp(errors, "replace"))
5244 known_errorHandler = 2;
5245 else if (!strcmp(errors, "ignore"))
5246 known_errorHandler = 3;
5247 else if (!strcmp(errors, "xmlcharrefreplace"))
5248 known_errorHandler = 4;
5249 else
5250 known_errorHandler = 0;
5251 }
5252 switch (known_errorHandler) {
5253 case 1: /* strict */
5254 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5255 goto onError;
5256 case 2: /* replace */
5257 for (p = collstart; p < collend; ++p)
5258 *output++ = '?';
5259 /* fall through */
5260 case 3: /* ignore */
5261 p = collend;
5262 break;
5263 case 4: /* xmlcharrefreplace */
5264 /* generate replacement (temporarily (mis)uses p) */
Serhiy Storchakae822b032013-08-06 16:56:26 +03005265 for (p = collstart; p < collend;) {
5266 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
5267 output += sprintf(output, "&#%d;", ch);
5268 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005269 p = collend;
5270 break;
5271 default:
5272 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5273 encoding, reason, s, length, &exc,
5274 collstart-s, collend-s, &newpos);
5275 if (repunicode == NULL)
5276 goto onError;
5277 /* generate replacement */
5278 repsize = PyUnicode_GET_SIZE(repunicode);
5279 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5280 Py_UNICODE ch = *uni2;
5281 if (Py_UNICODE_ISSPACE(ch))
5282 *output++ = ' ';
5283 else {
5284 decimal = Py_UNICODE_TODECIMAL(ch);
5285 if (decimal >= 0)
5286 *output++ = '0' + decimal;
5287 else if (0 < ch && ch < 256)
5288 *output++ = (char)ch;
5289 else {
5290 Py_DECREF(repunicode);
5291 raise_encode_exception(&exc, encoding,
5292 s, length, collstart-s, collend-s, reason);
5293 goto onError;
5294 }
5295 }
5296 }
5297 p = s + newpos;
5298 Py_DECREF(repunicode);
5299 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005300 }
5301 /* 0-terminate the output string */
5302 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005303 Py_XDECREF(exc);
5304 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005305 return 0;
5306
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005307 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005308 Py_XDECREF(exc);
5309 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005310 return -1;
5311}
5312
Guido van Rossumd57fd912000-03-10 22:53:23 +00005313/* --- Helpers ------------------------------------------------------------ */
5314
Eric Smitha9f7d622008-02-17 19:46:49 +00005315#include "stringlib/unicodedefs.h"
Fredrik Lundha50d2012006-05-26 17:04:58 +00005316#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005317
5318#include "stringlib/count.h"
5319#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005320#include "stringlib/partition.h"
Antoine Pitrou64672132010-01-13 07:55:48 +00005321#include "stringlib/split.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005322
/* Helper macro to fix up start/end slice values in place.

   end is clipped to len when too large; a negative end or start is taken
   relative to len and clipped to 0 if it is still negative afterwards.
   start and end must be modifiable lvalues.  Every argument may be
   evaluated more than once, so none of them may have side effects.

   The body is wrapped in do { ... } while (0) so that the macro expands
   to exactly one statement and is safe under an unbraced if/else; it must
   be followed by a semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Fredrik Lundhc8162812006-05-26 19:33:03 +00005337
Martin v. Löwis18e16552006-02-15 17:27:45 +00005338Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005339 PyObject *substr,
5340 Py_ssize_t start,
5341 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005342{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005343 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005344 PyUnicodeObject* str_obj;
5345 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005346
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005347 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5348 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005349 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005350 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5351 if (!sub_obj) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005352 Py_DECREF(str_obj);
5353 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005354 }
Tim Petersced69f82003-09-16 20:30:58 +00005355
Antoine Pitrou64672132010-01-13 07:55:48 +00005356 ADJUST_INDICES(start, end, str_obj->length);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005357 result = stringlib_count(
Antoine Pitrou64672132010-01-13 07:55:48 +00005358 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
5359 PY_SSIZE_T_MAX
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005360 );
5361
5362 Py_DECREF(sub_obj);
5363 Py_DECREF(str_obj);
5364
Guido van Rossumd57fd912000-03-10 22:53:23 +00005365 return result;
5366}
5367
Martin v. Löwis18e16552006-02-15 17:27:45 +00005368Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005369 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005370 Py_ssize_t start,
5371 Py_ssize_t end,
5372 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005373{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005374 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005375
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005376 str = PyUnicode_FromObject(str);
5377 if (!str)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005378 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005379 sub = PyUnicode_FromObject(sub);
5380 if (!sub) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005381 Py_DECREF(str);
5382 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005383 }
Tim Petersced69f82003-09-16 20:30:58 +00005384
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005385 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005386 result = stringlib_find_slice(
5387 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5388 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5389 start, end
5390 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005391 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005392 result = stringlib_rfind_slice(
5393 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5394 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5395 start, end
5396 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005397
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005398 Py_DECREF(str);
5399 Py_DECREF(sub);
5400
Guido van Rossumd57fd912000-03-10 22:53:23 +00005401 return result;
5402}
5403
Tim Petersced69f82003-09-16 20:30:58 +00005404static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005405int tailmatch(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005406 PyUnicodeObject *substring,
5407 Py_ssize_t start,
5408 Py_ssize_t end,
5409 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005410{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005411 if (substring->length == 0)
5412 return 1;
5413
Antoine Pitrou64672132010-01-13 07:55:48 +00005414 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005415 end -= substring->length;
5416 if (end < start)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005417 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005418
5419 if (direction > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005420 if (Py_UNICODE_MATCH(self, end, substring))
5421 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005422 } else {
5423 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005424 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005425 }
5426
5427 return 0;
5428}
5429
Martin v. Löwis18e16552006-02-15 17:27:45 +00005430Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005431 PyObject *substr,
5432 Py_ssize_t start,
5433 Py_ssize_t end,
5434 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005435{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005436 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005437
Guido van Rossumd57fd912000-03-10 22:53:23 +00005438 str = PyUnicode_FromObject(str);
5439 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005440 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005441 substr = PyUnicode_FromObject(substr);
5442 if (substr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005443 Py_DECREF(str);
5444 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005445 }
Tim Petersced69f82003-09-16 20:30:58 +00005446
Guido van Rossumd57fd912000-03-10 22:53:23 +00005447 result = tailmatch((PyUnicodeObject *)str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005448 (PyUnicodeObject *)substr,
5449 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005450 Py_DECREF(str);
5451 Py_DECREF(substr);
5452 return result;
5453}
5454
Guido van Rossumd57fd912000-03-10 22:53:23 +00005455/* Apply fixfct filter to the Unicode object self and return a
5456 reference to the modified object */
5457
Tim Petersced69f82003-09-16 20:30:58 +00005458static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005459PyObject *fixup(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005460 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005461{
5462
5463 PyUnicodeObject *u;
5464
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005465 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005466 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005467 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005468
5469 Py_UNICODE_COPY(u->str, self->str, self->length);
5470
Tim Peters7a29bd52001-09-12 03:03:31 +00005471 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005472 /* fixfct should return TRUE if it modified the buffer. If
5473 FALSE, return a reference to the original buffer instead
5474 (to save space, not time) */
5475 Py_INCREF(self);
5476 Py_DECREF(u);
5477 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005478 }
5479 return (PyObject*) u;
5480}
5481
Tim Petersced69f82003-09-16 20:30:58 +00005482static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005483int fixupper(PyUnicodeObject *self)
5484{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005485 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005486 Py_UNICODE *s = self->str;
5487 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005488
Guido van Rossumd57fd912000-03-10 22:53:23 +00005489 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005490 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005491
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005492 ch = Py_UNICODE_TOUPPER(*s);
5493 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005494 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005495 *s = ch;
5496 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005497 s++;
5498 }
5499
5500 return status;
5501}
5502
Tim Petersced69f82003-09-16 20:30:58 +00005503static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005504int fixlower(PyUnicodeObject *self)
5505{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005506 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005507 Py_UNICODE *s = self->str;
5508 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005509
Guido van Rossumd57fd912000-03-10 22:53:23 +00005510 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005511 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005512
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005513 ch = Py_UNICODE_TOLOWER(*s);
5514 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005515 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005516 *s = ch;
5517 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005518 s++;
5519 }
5520
5521 return status;
5522}
5523
Tim Petersced69f82003-09-16 20:30:58 +00005524static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005525int fixswapcase(PyUnicodeObject *self)
5526{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005527 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005528 Py_UNICODE *s = self->str;
5529 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005530
Guido van Rossumd57fd912000-03-10 22:53:23 +00005531 while (len-- > 0) {
5532 if (Py_UNICODE_ISUPPER(*s)) {
5533 *s = Py_UNICODE_TOLOWER(*s);
5534 status = 1;
5535 } else if (Py_UNICODE_ISLOWER(*s)) {
5536 *s = Py_UNICODE_TOUPPER(*s);
5537 status = 1;
5538 }
5539 s++;
5540 }
5541
5542 return status;
5543}
5544
Tim Petersced69f82003-09-16 20:30:58 +00005545static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005546int fixcapitalize(PyUnicodeObject *self)
5547{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005548 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005549 Py_UNICODE *s = self->str;
5550 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005551
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005552 if (len == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005553 return 0;
Ezio Melotti15d6b652011-08-15 09:22:24 +03005554 if (!Py_UNICODE_ISUPPER(*s)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005555 *s = Py_UNICODE_TOUPPER(*s);
5556 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005557 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005558 s++;
5559 while (--len > 0) {
Ezio Melotti15d6b652011-08-15 09:22:24 +03005560 if (!Py_UNICODE_ISLOWER(*s)) {
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005561 *s = Py_UNICODE_TOLOWER(*s);
5562 status = 1;
5563 }
5564 s++;
5565 }
5566 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005567}
5568
5569static
5570int fixtitle(PyUnicodeObject *self)
5571{
5572 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5573 register Py_UNICODE *e;
5574 int previous_is_cased;
5575
5576 /* Shortcut for single character strings */
5577 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005578 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5579 if (*p != ch) {
5580 *p = ch;
5581 return 1;
5582 }
5583 else
5584 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005585 }
Tim Petersced69f82003-09-16 20:30:58 +00005586
Guido van Rossumd57fd912000-03-10 22:53:23 +00005587 e = p + PyUnicode_GET_SIZE(self);
5588 previous_is_cased = 0;
5589 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005590 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005591
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005592 if (previous_is_cased)
5593 *p = Py_UNICODE_TOLOWER(ch);
5594 else
5595 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005596
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005597 if (Py_UNICODE_ISLOWER(ch) ||
5598 Py_UNICODE_ISUPPER(ch) ||
5599 Py_UNICODE_ISTITLE(ch))
5600 previous_is_cased = 1;
5601 else
5602 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005603 }
5604 return 1;
5605}
5606
Tim Peters8ce9f162004-08-27 01:49:32 +00005607PyObject *
5608PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005609{
Tim Peters8ce9f162004-08-27 01:49:32 +00005610 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005611 const Py_UNICODE blank = ' ';
5612 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00005613 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005614 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00005615 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5616 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005617 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5618 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005619 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005620 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005621 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005622
Tim Peters05eba1f2004-08-27 21:32:02 +00005623 fseq = PySequence_Fast(seq, "");
5624 if (fseq == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005625 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005626 }
5627
Tim Peters91879ab2004-08-27 22:35:44 +00005628 /* Grrrr. A codec may be invoked to convert str objects to
5629 * Unicode, and so it's possible to call back into Python code
5630 * during PyUnicode_FromObject(), and so it's possible for a sick
5631 * codec to change the size of fseq (if seq is a list). Therefore
5632 * we have to keep refetching the size -- can't assume seqlen
5633 * is invariant.
5634 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005635 seqlen = PySequence_Fast_GET_SIZE(fseq);
5636 /* If empty sequence, return u"". */
5637 if (seqlen == 0) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005638 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5639 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00005640 }
5641 /* If singleton sequence with an exact Unicode, return that. */
5642 if (seqlen == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005643 item = PySequence_Fast_GET_ITEM(fseq, 0);
5644 if (PyUnicode_CheckExact(item)) {
5645 Py_INCREF(item);
5646 res = (PyUnicodeObject *)item;
5647 goto Done;
5648 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005649 }
5650
Tim Peters05eba1f2004-08-27 21:32:02 +00005651 /* At least two items to join, or one that isn't exact Unicode. */
5652 if (seqlen > 1) {
5653 /* Set up sep and seplen -- they're needed. */
Benjamin Peterson857ce152009-01-31 16:29:18 +00005654 if (separator == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005655 sep = &blank;
5656 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005657 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005658 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005659 internal_separator = PyUnicode_FromObject(separator);
5660 if (internal_separator == NULL)
5661 goto onError;
5662 sep = PyUnicode_AS_UNICODE(internal_separator);
5663 seplen = PyUnicode_GET_SIZE(internal_separator);
5664 /* In case PyUnicode_FromObject() mutated seq. */
5665 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005666 }
5667 }
5668
5669 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005670 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005671 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005672 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005673 res_p = PyUnicode_AS_UNICODE(res);
5674 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005675
Tim Peters05eba1f2004-08-27 21:32:02 +00005676 for (i = 0; i < seqlen; ++i) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005677 Py_ssize_t itemlen;
5678 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005679
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005680 item = PySequence_Fast_GET_ITEM(fseq, i);
5681 /* Convert item to Unicode. */
5682 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5683 PyErr_Format(PyExc_TypeError,
5684 "sequence item %zd: expected string or Unicode,"
5685 " %.80s found",
5686 i, Py_TYPE(item)->tp_name);
5687 goto onError;
5688 }
5689 item = PyUnicode_FromObject(item);
5690 if (item == NULL)
5691 goto onError;
5692 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005693
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005694 /* In case PyUnicode_FromObject() mutated seq. */
5695 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters91879ab2004-08-27 22:35:44 +00005696
Tim Peters8ce9f162004-08-27 01:49:32 +00005697 /* Make sure we have enough space for the separator and the item. */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005698 itemlen = PyUnicode_GET_SIZE(item);
5699 new_res_used = res_used + itemlen;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005700 if (new_res_used < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005701 goto Overflow;
5702 if (i < seqlen - 1) {
5703 new_res_used += seplen;
5704 if (new_res_used < 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00005705 goto Overflow;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005706 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005707 if (new_res_used > res_alloc) {
5708 /* double allocated size until it's big enough */
5709 do {
5710 res_alloc += res_alloc;
5711 if (res_alloc <= 0)
5712 goto Overflow;
5713 } while (new_res_used > res_alloc);
5714 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5715 Py_DECREF(item);
5716 goto onError;
5717 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005718 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005719 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005720
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005721 /* Copy item, and maybe the separator. */
5722 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5723 res_p += itemlen;
5724 if (i < seqlen - 1) {
5725 Py_UNICODE_COPY(res_p, sep, seplen);
5726 res_p += seplen;
5727 }
5728 Py_DECREF(item);
5729 res_used = new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005730 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005731
Tim Peters05eba1f2004-08-27 21:32:02 +00005732 /* Shrink res to match the used area; this probably can't fail,
5733 * but it's cheap to check.
5734 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005735 if (_PyUnicode_Resize(&res, res_used) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005736 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005737
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005738 Done:
Tim Peters8ce9f162004-08-27 01:49:32 +00005739 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005740 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005741 return (PyObject *)res;
5742
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005743 Overflow:
Tim Peters8ce9f162004-08-27 01:49:32 +00005744 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00005745 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005746 Py_DECREF(item);
5747 /* fall through */
5748
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005749 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005750 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005751 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005752 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005753 return NULL;
5754}
5755
Tim Petersced69f82003-09-16 20:30:58 +00005756static
5757PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005758 Py_ssize_t left,
5759 Py_ssize_t right,
5760 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005761{
5762 PyUnicodeObject *u;
5763
5764 if (left < 0)
5765 left = 0;
5766 if (right < 0)
5767 right = 0;
5768
Tim Peters7a29bd52001-09-12 03:03:31 +00005769 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005770 Py_INCREF(self);
5771 return self;
5772 }
5773
Neal Norwitze7d8be82008-07-31 17:17:14 +00005774 if (left > PY_SSIZE_T_MAX - self->length ||
5775 right > PY_SSIZE_T_MAX - (left + self->length)) {
5776 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5777 return NULL;
5778 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005779 u = _PyUnicode_New(left + self->length + right);
5780 if (u) {
5781 if (left)
5782 Py_UNICODE_FILL(u->str, fill, left);
5783 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5784 if (right)
5785 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5786 }
5787
5788 return u;
5789}
5790
Antoine Pitrou64672132010-01-13 07:55:48 +00005791PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005792{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005793 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005794
5795 string = PyUnicode_FromObject(string);
5796 if (string == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005797 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005798
Antoine Pitrou64672132010-01-13 07:55:48 +00005799 list = stringlib_splitlines(
5800 (PyObject*) string, PyUnicode_AS_UNICODE(string),
5801 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005802
5803 Py_DECREF(string);
5804 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005805}
5806
Tim Petersced69f82003-09-16 20:30:58 +00005807static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005808PyObject *split(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005809 PyUnicodeObject *substring,
5810 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005811{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005812 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005813 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005814
Guido van Rossumd57fd912000-03-10 22:53:23 +00005815 if (substring == NULL)
Antoine Pitrou64672132010-01-13 07:55:48 +00005816 return stringlib_split_whitespace(
5817 (PyObject*) self, self->str, self->length, maxcount
5818 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005819
Antoine Pitrou64672132010-01-13 07:55:48 +00005820 return stringlib_split(
5821 (PyObject*) self, self->str, self->length,
5822 substring->str, substring->length,
5823 maxcount
5824 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005825}
5826
Tim Petersced69f82003-09-16 20:30:58 +00005827static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005828PyObject *rsplit(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005829 PyUnicodeObject *substring,
5830 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005831{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005832 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005833 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005834
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005835 if (substring == NULL)
Antoine Pitrou64672132010-01-13 07:55:48 +00005836 return stringlib_rsplit_whitespace(
5837 (PyObject*) self, self->str, self->length, maxcount
5838 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005839
Antoine Pitrou64672132010-01-13 07:55:48 +00005840 return stringlib_rsplit(
5841 (PyObject*) self, self->str, self->length,
5842 substring->str, substring->length,
5843 maxcount
5844 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005845}
5846
5847static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005848PyObject *replace(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005849 PyUnicodeObject *str1,
5850 PyUnicodeObject *str2,
5851 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005852{
5853 PyUnicodeObject *u;
5854
5855 if (maxcount < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005856 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrou64672132010-01-13 07:55:48 +00005857 else if (maxcount == 0 || self->length == 0)
5858 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005859
Fredrik Lundh347ee272006-05-24 16:35:18 +00005860 if (str1->length == str2->length) {
Antoine Pitrou5c767c22010-01-13 08:55:20 +00005861 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005862 /* same length */
Antoine Pitrou64672132010-01-13 07:55:48 +00005863 if (str1->length == 0)
5864 goto nothing;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005865 if (str1->length == 1) {
5866 /* replace characters */
5867 Py_UNICODE u1, u2;
5868 if (!findchar(self->str, self->length, str1->str[0]))
5869 goto nothing;
5870 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5871 if (!u)
5872 return NULL;
5873 Py_UNICODE_COPY(u->str, self->str, self->length);
5874 u1 = str1->str[0];
5875 u2 = str2->str[0];
5876 for (i = 0; i < u->length; i++)
5877 if (u->str[i] == u1) {
5878 if (--maxcount < 0)
5879 break;
5880 u->str[i] = u2;
5881 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005882 } else {
Antoine Pitrou64672132010-01-13 07:55:48 +00005883 i = stringlib_find(
5884 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00005885 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005886 if (i < 0)
5887 goto nothing;
5888 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5889 if (!u)
5890 return NULL;
5891 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrou64672132010-01-13 07:55:48 +00005892
5893 /* change everything in-place, starting with this one */
5894 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5895 i += str1->length;
5896
5897 while ( --maxcount > 0) {
5898 i = stringlib_find(self->str+i, self->length-i,
5899 str1->str, str1->length,
5900 i);
5901 if (i == -1)
5902 break;
5903 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5904 i += str1->length;
5905 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005906 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005907 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005908
Brett Cannona7f13ee2010-05-04 01:16:51 +00005909 Py_ssize_t n, i, j;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005910 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005911 Py_UNICODE *p;
5912
5913 /* replace strings */
Antoine Pitrou64672132010-01-13 07:55:48 +00005914 n = stringlib_count(self->str, self->length, str1->str, str1->length,
5915 maxcount);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005916 if (n == 0)
5917 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005918 /* new_size = self->length + n * (str2->length - str1->length)); */
5919 delta = (str2->length - str1->length);
5920 if (delta == 0) {
5921 new_size = self->length;
5922 } else {
5923 product = n * (str2->length - str1->length);
5924 if ((product / (str2->length - str1->length)) != n) {
5925 PyErr_SetString(PyExc_OverflowError,
5926 "replace string is too long");
5927 return NULL;
5928 }
5929 new_size = self->length + product;
5930 if (new_size < 0) {
5931 PyErr_SetString(PyExc_OverflowError,
5932 "replace string is too long");
5933 return NULL;
5934 }
5935 }
5936 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005937 if (!u)
5938 return NULL;
5939 i = 0;
5940 p = u->str;
5941 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005942 while (n-- > 0) {
5943 /* look for next match */
Antoine Pitrou64672132010-01-13 07:55:48 +00005944 j = stringlib_find(self->str+i, self->length-i,
5945 str1->str, str1->length,
5946 i);
5947 if (j == -1)
5948 break;
5949 else if (j > i) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005950 /* copy unchanged part [i:j] */
5951 Py_UNICODE_COPY(p, self->str+i, j-i);
5952 p += j - i;
5953 }
5954 /* copy substitution string */
5955 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005956 Py_UNICODE_COPY(p, str2->str, str2->length);
5957 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005958 }
5959 i = j + str1->length;
5960 }
5961 if (i < self->length)
5962 /* copy tail [i:] */
5963 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005964 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005965 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00005966 while (n > 0) {
5967 Py_UNICODE_COPY(p, str2->str, str2->length);
5968 p += str2->length;
5969 if (--n <= 0)
5970 break;
5971 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005972 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00005973 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005974 }
5975 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005976 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005977
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005978 nothing:
Fredrik Lundh347ee272006-05-24 16:35:18 +00005979 /* nothing to replace; return original string (when possible) */
5980 if (PyUnicode_CheckExact(self)) {
5981 Py_INCREF(self);
5982 return (PyObject *) self;
5983 }
5984 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005985}
5986
5987/* --- Unicode Object Methods --------------------------------------------- */
5988
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005989PyDoc_STRVAR(title__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005990 "S.title() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005991\n\
5992Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005993characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005994
5995static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005996unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005997{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005998 return fixup(self, fixtitle);
5999}
6000
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006001PyDoc_STRVAR(capitalize__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006002 "S.capitalize() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006003\n\
6004Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumaran5261b102010-07-05 12:04:07 +00006005have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006006
6007static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006008unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006009{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006010 return fixup(self, fixcapitalize);
6011}
6012
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out).  Splits self on whitespace, capitalizes each
   word via fixup(), and joins the words again with PyUnicode_Join(). */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list slot */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                            fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
6050
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006051/* Argument converter. Coerces to a single unicode character */
6052
6053static int
6054convert_uc(PyObject *obj, void *addr)
6055{
Benjamin Peterson857ce152009-01-31 16:29:18 +00006056 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6057 PyObject *uniobj;
6058 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006059
Benjamin Peterson857ce152009-01-31 16:29:18 +00006060 uniobj = PyUnicode_FromObject(obj);
6061 if (uniobj == NULL) {
6062 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006063 "The fill character cannot be converted to Unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00006064 return 0;
6065 }
6066 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6067 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006068 "The fill character must be exactly one character long");
Benjamin Peterson857ce152009-01-31 16:29:18 +00006069 Py_DECREF(uniobj);
6070 return 0;
6071 }
6072 unistr = PyUnicode_AS_UNICODE(uniobj);
6073 *fillcharloc = unistr[0];
6074 Py_DECREF(uniobj);
6075 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006076}
6077
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006078PyDoc_STRVAR(center__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006079 "S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006081Return S centered in a Unicode string of length width. Padding is\n\
6082done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006083
6084static PyObject *
6085unicode_center(PyUnicodeObject *self, PyObject *args)
6086{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006087 Py_ssize_t marg, left;
6088 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006089 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006090
Thomas Woutersde017742006-02-16 19:34:37 +00006091 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006092 return NULL;
6093
Tim Peters7a29bd52001-09-12 03:03:31 +00006094 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006095 Py_INCREF(self);
6096 return (PyObject*) self;
6097 }
6098
6099 marg = width - self->length;
6100 left = marg / 2 + (marg & width & 1);
6101
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006102 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006103}
6104
Marc-André Lemburge5034372000-08-08 08:04:29 +00006105#if 0
6106
6107/* This code should go into some future Unicode collation support
6108 module. The basic comparison should compare ordinals on a naive
Georg Brandl18187e22009-06-06 18:21:58 +00006109 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006110
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006111/* speedy UTF-16 code point order comparison */
6112/* gleaned from: */
6113/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6114
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006115static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006116{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006117 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006118 0, 0, 0, 0, 0, 0, 0, 0,
6119 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006120 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006121};
6122
Guido van Rossumd57fd912000-03-10 22:53:23 +00006123static int
6124unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6125{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006126 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006127
Guido van Rossumd57fd912000-03-10 22:53:23 +00006128 Py_UNICODE *s1 = str1->str;
6129 Py_UNICODE *s2 = str2->str;
6130
6131 len1 = str1->length;
6132 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006133
Guido van Rossumd57fd912000-03-10 22:53:23 +00006134 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006135 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006136
6137 c1 = *s1++;
6138 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006139
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006140 if (c1 > (1<<11) * 26)
6141 c1 += utf16Fixup[c1>>11];
6142 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006143 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006144 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006145
6146 if (c1 != c2)
6147 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006148
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006149 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006150 }
6151
6152 return (len1 < len2) ? -1 : (len1 != len2);
6153}
6154
Marc-André Lemburge5034372000-08-08 08:04:29 +00006155#else
6156
6157static int
6158unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6159{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006160 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006161
6162 Py_UNICODE *s1 = str1->str;
6163 Py_UNICODE *s2 = str2->str;
6164
6165 len1 = str1->length;
6166 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006167
Marc-André Lemburge5034372000-08-08 08:04:29 +00006168 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006169 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006170
Fredrik Lundh45714e92001-06-26 16:39:36 +00006171 c1 = *s1++;
6172 c2 = *s2++;
6173
6174 if (c1 != c2)
6175 return (c1 < c2) ? -1 : 1;
6176
Marc-André Lemburge5034372000-08-08 08:04:29 +00006177 len1--; len2--;
6178 }
6179
6180 return (len1 < len2) ? -1 : (len1 != len2);
6181}
6182
6183#endif
6184
Guido van Rossumd57fd912000-03-10 22:53:23 +00006185int PyUnicode_Compare(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006186 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006187{
6188 PyUnicodeObject *u = NULL, *v = NULL;
6189 int result;
6190
6191 /* Coerce the two arguments */
6192 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6193 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006194 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006195 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6196 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006197 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006198
Thomas Wouters7e474022000-07-16 12:04:32 +00006199 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006200 if (v == u) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006201 Py_DECREF(u);
6202 Py_DECREF(v);
6203 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006204 }
6205
6206 result = unicode_compare(u, v);
6207
6208 Py_DECREF(u);
6209 Py_DECREF(v);
6210 return result;
6211
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006212 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213 Py_XDECREF(u);
6214 Py_XDECREF(v);
6215 return -1;
6216}
6217
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006218PyObject *PyUnicode_RichCompare(PyObject *left,
6219 PyObject *right,
6220 int op)
6221{
6222 int result;
6223
6224 result = PyUnicode_Compare(left, right);
6225 if (result == -1 && PyErr_Occurred())
6226 goto onError;
6227
6228 /* Convert the return value to a Boolean */
6229 switch (op) {
6230 case Py_EQ:
6231 result = (result == 0);
6232 break;
6233 case Py_NE:
6234 result = (result != 0);
6235 break;
6236 case Py_LE:
6237 result = (result <= 0);
6238 break;
6239 case Py_GE:
6240 result = (result >= 0);
6241 break;
6242 case Py_LT:
6243 result = (result == -1);
6244 break;
6245 case Py_GT:
6246 result = (result == 1);
6247 break;
6248 }
6249 return PyBool_FromLong(result);
6250
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006251 onError:
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006252
6253 /* Standard case
6254
6255 Type errors mean that PyUnicode_FromObject() could not convert
6256 one of the arguments (usually the right hand side) to Unicode,
6257 ie. we can't handle the comparison request. However, it is
6258 possible that the other object knows a comparison method, which
6259 is why we return Py_NotImplemented to give the other object a
6260 chance.
6261
6262 */
6263 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6264 PyErr_Clear();
6265 Py_INCREF(Py_NotImplemented);
6266 return Py_NotImplemented;
6267 }
6268 if (op != Py_EQ && op != Py_NE)
6269 return NULL;
6270
6271 /* Equality comparison.
6272
6273 This is a special case: we silence any PyExc_UnicodeDecodeError
6274 and instead turn it into a PyErr_UnicodeWarning.
6275
6276 */
6277 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6278 return NULL;
6279 PyErr_Clear();
Benjamin Peterson857ce152009-01-31 16:29:18 +00006280 if (PyErr_Warn(PyExc_UnicodeWarning,
6281 (op == Py_EQ) ?
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006282 "Unicode equal comparison "
6283 "failed to convert both arguments to Unicode - "
6284 "interpreting them as being unequal" :
6285 "Unicode unequal comparison "
6286 "failed to convert both arguments to Unicode - "
6287 "interpreting them as being unequal"
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006288 ) < 0)
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006289 return NULL;
6290 result = (op == Py_NE);
6291 return PyBool_FromLong(result);
6292}
6293
Guido van Rossum403d68b2000-03-13 15:55:09 +00006294int PyUnicode_Contains(PyObject *container,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006295 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006296{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006297 PyObject *str, *sub;
6298 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006299
6300 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006301 sub = PyUnicode_FromObject(element);
6302 if (!sub) {
Fredrik Lundh833bf942006-05-23 10:12:21 +00006303 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006304 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00006305
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006306 str = PyUnicode_FromObject(container);
6307 if (!str) {
6308 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00006309 return -1;
6310 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00006311
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006312 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00006313
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006314 Py_DECREF(str);
6315 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00006316
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006317 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006318}
6319
Guido van Rossumd57fd912000-03-10 22:53:23 +00006320/* Concat to string or Unicode object giving a new Unicode object. */
6321
6322PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006323 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006324{
6325 PyUnicodeObject *u = NULL, *v = NULL, *w;
6326
6327 /* Coerce the two arguments */
6328 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6329 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006330 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006331 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6332 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006333 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006334
6335 /* Shortcuts */
6336 if (v == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006337 Py_DECREF(v);
6338 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006339 }
6340 if (u == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006341 Py_DECREF(u);
6342 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006343 }
6344
6345 /* Concat the two Unicode strings */
6346 w = _PyUnicode_New(u->length + v->length);
6347 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006348 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006349 Py_UNICODE_COPY(w->str, u->str, u->length);
6350 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6351
6352 Py_DECREF(u);
6353 Py_DECREF(v);
6354 return (PyObject *)w;
6355
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006356 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006357 Py_XDECREF(u);
6358 Py_XDECREF(v);
6359 return NULL;
6360}
6361
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006362PyDoc_STRVAR(count__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006363 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006364\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00006365Return the number of non-overlapping occurrences of substring sub in\n\
6366Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006367interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006368
6369static PyObject *
6370unicode_count(PyUnicodeObject *self, PyObject *args)
6371{
6372 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006373 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006374 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006375 PyObject *result;
6376
Jesus Cea44e81682011-04-20 16:39:15 +02006377 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
6378 &start, &end))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006379 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006380
Antoine Pitrou64672132010-01-13 07:55:48 +00006381 ADJUST_INDICES(start, end, self->length);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006382 result = PyInt_FromSsize_t(
6383 stringlib_count(self->str + start, end - start,
Antoine Pitrou64672132010-01-13 07:55:48 +00006384 substring->str, substring->length,
6385 PY_SSIZE_T_MAX)
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006386 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006387
6388 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006389
Guido van Rossumd57fd912000-03-10 22:53:23 +00006390 return result;
6391}
6392
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006393PyDoc_STRVAR(encode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006394 "S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006395\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006396Encodes S using the codec registered for encoding. encoding defaults\n\
6397to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006398handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006399a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6400'xmlcharrefreplace' as well as any other name registered with\n\
6401codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006402
6403static PyObject *
Benjamin Peterson332d7212009-09-18 21:14:55 +00006404unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006405{
Benjamin Peterson332d7212009-09-18 21:14:55 +00006406 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00006407 char *encoding = NULL;
6408 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006409 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006410
Benjamin Peterson332d7212009-09-18 21:14:55 +00006411 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
6412 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006413 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006414 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006415 if (v == NULL)
6416 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006417 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006418 PyErr_Format(PyExc_TypeError,
6419 "encoder did not return a string/unicode object "
6420 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006421 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006422 Py_DECREF(v);
6423 return NULL;
6424 }
6425 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006426
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006427 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006428 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006429}
6430
6431PyDoc_STRVAR(decode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006432 "S.decode([encoding[,errors]]) -> string or unicode\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006433\n\
6434Decodes S using the codec registered for encoding. encoding defaults\n\
6435to the default encoding. errors may be given to set a different error\n\
6436handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6437a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
Éric Araujoa4c81b02012-02-20 02:07:31 +01006438as well as any other name registered with codecs.register_error that is\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006439able to handle UnicodeDecodeErrors.");
6440
6441static PyObject *
Benjamin Peterson332d7212009-09-18 21:14:55 +00006442unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006443{
Benjamin Peterson332d7212009-09-18 21:14:55 +00006444 static char *kwlist[] = {"encoding", "errors", 0};
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006445 char *encoding = NULL;
6446 char *errors = NULL;
6447 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006448
Benjamin Peterson332d7212009-09-18 21:14:55 +00006449 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",
6450 kwlist, &encoding, &errors))
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006451 return NULL;
6452 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006453 if (v == NULL)
6454 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006455 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006456 PyErr_Format(PyExc_TypeError,
6457 "decoder did not return a string/unicode object "
6458 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006459 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006460 Py_DECREF(v);
6461 return NULL;
6462 }
6463 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006464
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006465 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006466 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006467}
6468
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006469PyDoc_STRVAR(expandtabs__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006470 "S.expandtabs([tabsize]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006471\n\
6472Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006473If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006474
6475static PyObject*
6476unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6477{
6478 Py_UNICODE *e;
6479 Py_UNICODE *p;
6480 Py_UNICODE *q;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006481 Py_UNICODE *qe;
6482 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006483 PyUnicodeObject *u;
6484 int tabsize = 8;
6485
6486 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006487 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006488
Thomas Wouters7e474022000-07-16 12:04:32 +00006489 /* First pass: determine size of output string */
Guido van Rossum5bdff602008-03-11 21:18:06 +00006490 i = 0; /* chars up to and including most recent \n or \r */
6491 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6492 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006493 for (p = self->str; p < e; p++)
6494 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006495 if (tabsize > 0) {
6496 incr = tabsize - (j % tabsize); /* cannot overflow */
6497 if (j > PY_SSIZE_T_MAX - incr)
6498 goto overflow1;
6499 j += incr;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006500 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006501 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006502 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006503 if (j > PY_SSIZE_T_MAX - 1)
6504 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006505 j++;
6506 if (*p == '\n' || *p == '\r') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006507 if (i > PY_SSIZE_T_MAX - j)
6508 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006509 i += j;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006510 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006511 }
6512 }
6513
Guido van Rossum5bdff602008-03-11 21:18:06 +00006514 if (i > PY_SSIZE_T_MAX - j)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006515 goto overflow1;
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006516
Guido van Rossumd57fd912000-03-10 22:53:23 +00006517 /* Second pass: create output string and fill it */
6518 u = _PyUnicode_New(i + j);
6519 if (!u)
6520 return NULL;
6521
Guido van Rossum5bdff602008-03-11 21:18:06 +00006522 j = 0; /* same as in first pass */
6523 q = u->str; /* next output char */
6524 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006525
6526 for (p = self->str; p < e; p++)
6527 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006528 if (tabsize > 0) {
6529 i = tabsize - (j % tabsize);
6530 j += i;
6531 while (i--) {
6532 if (q >= qe)
6533 goto overflow2;
6534 *q++ = ' ';
Guido van Rossum5bdff602008-03-11 21:18:06 +00006535 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006536 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00006537 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006538 else {
6539 if (q >= qe)
6540 goto overflow2;
6541 *q++ = *p;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006542 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006543 if (*p == '\n' || *p == '\r')
6544 j = 0;
6545 }
6546
6547 return (PyObject*) u;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006548
6549 overflow2:
6550 Py_DECREF(u);
6551 overflow1:
6552 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6553 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006554}
6555
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006556PyDoc_STRVAR(find__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006557 "S.find(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006558\n\
6559Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran5e3a19d2011-07-27 23:36:51 +08006560such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006561arguments start and end are interpreted as in slice notation.\n\
6562\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006563Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006564
6565static PyObject *
6566unicode_find(PyUnicodeObject *self, PyObject *args)
6567{
Jesus Cea44e81682011-04-20 16:39:15 +02006568 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006569 Py_ssize_t start;
6570 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006571 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006572
Jesus Cea44e81682011-04-20 16:39:15 +02006573 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
6574 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006575 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006576
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006577 result = stringlib_find_slice(
6578 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6579 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6580 start, end
6581 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006582
6583 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006584
6585 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006586}
6587
6588static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006589unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006590{
6591 if (index < 0 || index >= self->length) {
6592 PyErr_SetString(PyExc_IndexError, "string index out of range");
6593 return NULL;
6594 }
6595
6596 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6597}
6598
6599static long
6600unicode_hash(PyUnicodeObject *self)
6601{
Fredrik Lundhdde61642000-07-10 18:27:47 +00006602 /* Since Unicode objects compare equal to their ASCII string
6603 counterparts, they should use the individual character values
6604 as basis for their hash value. This is needed to assure that
6605 strings and Unicode objects behave in the same way as
6606 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006607
Martin v. Löwis18e16552006-02-15 17:27:45 +00006608 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006609 register Py_UNICODE *p;
6610 register long x;
6611
Benjamin Petersonf51c3842012-04-09 14:53:07 -04006612#ifdef Py_DEBUG
Benjamin Peterson26da9202012-02-21 11:08:50 -05006613 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf51c3842012-04-09 14:53:07 -04006614#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006615 if (self->hash != -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006616 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006617 len = PyUnicode_GET_SIZE(self);
Barry Warsaw1e13eb02012-02-20 20:42:21 -05006618 /*
6619 We make the hash of the empty string be 0, rather than using
6620 (prefix ^ suffix), since this slightly obfuscates the hash secret
6621 */
6622 if (len == 0) {
6623 self->hash = 0;
6624 return 0;
6625 }
Fredrik Lundhdde61642000-07-10 18:27:47 +00006626 p = PyUnicode_AS_UNICODE(self);
Barry Warsaw1e13eb02012-02-20 20:42:21 -05006627 x = _Py_HashSecret.prefix;
6628 x ^= *p << 7;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006629 while (--len >= 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006630 x = (1000003*x) ^ *p++;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006631 x ^= PyUnicode_GET_SIZE(self);
Barry Warsaw1e13eb02012-02-20 20:42:21 -05006632 x ^= _Py_HashSecret.suffix;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006633 if (x == -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006634 x = -2;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006635 self->hash = x;
6636 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006637}
6638
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006639PyDoc_STRVAR(index__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006640 "S.index(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006641\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006642Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006643
6644static PyObject *
6645unicode_index(PyUnicodeObject *self, PyObject *args)
6646{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006647 Py_ssize_t result;
Jesus Cea44e81682011-04-20 16:39:15 +02006648 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006649 Py_ssize_t start;
6650 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006651
Jesus Cea44e81682011-04-20 16:39:15 +02006652 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
6653 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006654 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006655
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006656 result = stringlib_find_slice(
6657 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6658 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6659 start, end
6660 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006661
6662 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006663
Guido van Rossumd57fd912000-03-10 22:53:23 +00006664 if (result < 0) {
6665 PyErr_SetString(PyExc_ValueError, "substring not found");
6666 return NULL;
6667 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006668
Martin v. Löwis18e16552006-02-15 17:27:45 +00006669 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006670}
6671
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006672PyDoc_STRVAR(islower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006673 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006674\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006675Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006676at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006677
6678static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006679unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006680{
6681 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6682 register const Py_UNICODE *e;
6683 int cased;
6684
Guido van Rossumd57fd912000-03-10 22:53:23 +00006685 /* Shortcut for single character strings */
6686 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006687 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006688
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006689 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006690 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006691 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006692
Guido van Rossumd57fd912000-03-10 22:53:23 +00006693 e = p + PyUnicode_GET_SIZE(self);
6694 cased = 0;
6695 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006696 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006697
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006698 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6699 return PyBool_FromLong(0);
6700 else if (!cased && Py_UNICODE_ISLOWER(ch))
6701 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006702 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006703 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006704}
6705
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006706PyDoc_STRVAR(isupper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006707 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006708\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006709Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006710at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006711
6712static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006713unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006714{
6715 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6716 register const Py_UNICODE *e;
6717 int cased;
6718
Guido van Rossumd57fd912000-03-10 22:53:23 +00006719 /* Shortcut for single character strings */
6720 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006721 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006722
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006723 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006724 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006725 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006726
Guido van Rossumd57fd912000-03-10 22:53:23 +00006727 e = p + PyUnicode_GET_SIZE(self);
6728 cased = 0;
6729 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006730 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006731
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006732 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6733 return PyBool_FromLong(0);
6734 else if (!cased && Py_UNICODE_ISUPPER(ch))
6735 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006736 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006737 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006738}
6739
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006740PyDoc_STRVAR(istitle__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006741 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006742\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006743Return True if S is a titlecased string and there is at least one\n\
6744character in S, i.e. upper- and titlecase characters may only\n\
6745follow uncased characters and lowercase characters only cased ones.\n\
6746Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006747
6748static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006749unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006750{
6751 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6752 register const Py_UNICODE *e;
6753 int cased, previous_is_cased;
6754
Guido van Rossumd57fd912000-03-10 22:53:23 +00006755 /* Shortcut for single character strings */
6756 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006757 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6758 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006759
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006760 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006761 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006762 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006763
Guido van Rossumd57fd912000-03-10 22:53:23 +00006764 e = p + PyUnicode_GET_SIZE(self);
6765 cased = 0;
6766 previous_is_cased = 0;
6767 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006768 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006769
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006770 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6771 if (previous_is_cased)
6772 return PyBool_FromLong(0);
6773 previous_is_cased = 1;
6774 cased = 1;
6775 }
6776 else if (Py_UNICODE_ISLOWER(ch)) {
6777 if (!previous_is_cased)
6778 return PyBool_FromLong(0);
6779 previous_is_cased = 1;
6780 cased = 1;
6781 }
6782 else
6783 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006784 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006785 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006786}
6787
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006788PyDoc_STRVAR(isspace__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006789 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006790\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006791Return True if all characters in S are whitespace\n\
6792and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006793
6794static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006795unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006796{
6797 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6798 register const Py_UNICODE *e;
6799
Guido van Rossumd57fd912000-03-10 22:53:23 +00006800 /* Shortcut for single character strings */
6801 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006802 Py_UNICODE_ISSPACE(*p))
6803 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006804
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006805 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006806 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006807 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006808
Guido van Rossumd57fd912000-03-10 22:53:23 +00006809 e = p + PyUnicode_GET_SIZE(self);
6810 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006811 if (!Py_UNICODE_ISSPACE(*p))
6812 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006813 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006814 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006815}
6816
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006817PyDoc_STRVAR(isalpha__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006818 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006819\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006820Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006821and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006822
6823static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006824unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006825{
6826 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6827 register const Py_UNICODE *e;
6828
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006829 /* Shortcut for single character strings */
6830 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006831 Py_UNICODE_ISALPHA(*p))
6832 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006833
6834 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006835 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006836 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006837
6838 e = p + PyUnicode_GET_SIZE(self);
6839 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006840 if (!Py_UNICODE_ISALPHA(*p))
6841 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006842 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006843 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006844}
6845
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006846PyDoc_STRVAR(isalnum__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006847 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006848\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006849Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006850and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006851
6852static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006853unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006854{
6855 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6856 register const Py_UNICODE *e;
6857
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006858 /* Shortcut for single character strings */
6859 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006860 Py_UNICODE_ISALNUM(*p))
6861 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006862
6863 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006864 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006865 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006866
6867 e = p + PyUnicode_GET_SIZE(self);
6868 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006869 if (!Py_UNICODE_ISALNUM(*p))
6870 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006871 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006872 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006873}
6874
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006875PyDoc_STRVAR(isdecimal__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006876 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006877\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006878Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006879False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006880
6881static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006882unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006883{
6884 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6885 register const Py_UNICODE *e;
6886
Guido van Rossumd57fd912000-03-10 22:53:23 +00006887 /* Shortcut for single character strings */
6888 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006889 Py_UNICODE_ISDECIMAL(*p))
6890 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006891
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006892 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006893 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006894 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006895
Guido van Rossumd57fd912000-03-10 22:53:23 +00006896 e = p + PyUnicode_GET_SIZE(self);
6897 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006898 if (!Py_UNICODE_ISDECIMAL(*p))
6899 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006900 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006901 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006902}
6903
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006904PyDoc_STRVAR(isdigit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006905 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006906\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006907Return True if all characters in S are digits\n\
6908and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006909
6910static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006911unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006912{
6913 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6914 register const Py_UNICODE *e;
6915
Guido van Rossumd57fd912000-03-10 22:53:23 +00006916 /* Shortcut for single character strings */
6917 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006918 Py_UNICODE_ISDIGIT(*p))
6919 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006920
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006921 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006922 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006923 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006924
Guido van Rossumd57fd912000-03-10 22:53:23 +00006925 e = p + PyUnicode_GET_SIZE(self);
6926 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006927 if (!Py_UNICODE_ISDIGIT(*p))
6928 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006929 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006930 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006931}
6932
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006933PyDoc_STRVAR(isnumeric__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006934 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006935\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006936Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006937False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006938
6939static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006940unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006941{
6942 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6943 register const Py_UNICODE *e;
6944
Guido van Rossumd57fd912000-03-10 22:53:23 +00006945 /* Shortcut for single character strings */
6946 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006947 Py_UNICODE_ISNUMERIC(*p))
6948 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006949
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006950 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006951 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006952 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006953
Guido van Rossumd57fd912000-03-10 22:53:23 +00006954 e = p + PyUnicode_GET_SIZE(self);
6955 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006956 if (!Py_UNICODE_ISNUMERIC(*p))
6957 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006958 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006959 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006960}
6961
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006962PyDoc_STRVAR(join__doc__,
Georg Brandl9b4e5822009-10-14 18:48:32 +00006963 "S.join(iterable) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006964\n\
6965Return a string which is the concatenation of the strings in the\n\
Georg Brandl9b4e5822009-10-14 18:48:32 +00006966iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006967
6968static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006969unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006970{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006971 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006972}
6973
Martin v. Löwis18e16552006-02-15 17:27:45 +00006974static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006975unicode_length(PyUnicodeObject *self)
6976{
6977 return self->length;
6978}
6979
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006980PyDoc_STRVAR(ljust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006981 "S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006982\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00006983Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006984done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006985
6986static PyObject *
6987unicode_ljust(PyUnicodeObject *self, PyObject *args)
6988{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006989 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006990 Py_UNICODE fillchar = ' ';
6991
Martin v. Löwis412fb672006-04-13 06:34:32 +00006992 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006993 return NULL;
6994
Tim Peters7a29bd52001-09-12 03:03:31 +00006995 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006996 Py_INCREF(self);
6997 return (PyObject*) self;
6998 }
6999
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007000 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007001}
7002
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007003PyDoc_STRVAR(lower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007004 "S.lower() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007005\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007006Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007007
7008static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007009unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007010{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007011 return fixup(self, fixlower);
7012}
7013
/* Selectors for which end(s) of the string get stripped. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the selectors above. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for a selector: the format string with its leading
   three-character "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
7022
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007023/* externally visible for str.strip(unicode) */
7024PyObject *
7025_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7026{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007027 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7028 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7029 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7030 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7031 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007032
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007033 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007034
Benjamin Peterson857ce152009-01-31 16:29:18 +00007035 i = 0;
7036 if (striptype != RIGHTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007037 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7038 i++;
7039 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00007040 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007041
Benjamin Peterson857ce152009-01-31 16:29:18 +00007042 j = len;
7043 if (striptype != LEFTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007044 do {
7045 j--;
7046 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7047 j++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007048 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007049
Benjamin Peterson857ce152009-01-31 16:29:18 +00007050 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007051 Py_INCREF(self);
7052 return (PyObject*)self;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007053 }
7054 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007055 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007056}
7057
Guido van Rossumd57fd912000-03-10 22:53:23 +00007058
7059static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007060do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007061{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007062 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7063 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007064
Benjamin Peterson857ce152009-01-31 16:29:18 +00007065 i = 0;
7066 if (striptype != RIGHTSTRIP) {
7067 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7068 i++;
7069 }
7070 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007071
Benjamin Peterson857ce152009-01-31 16:29:18 +00007072 j = len;
7073 if (striptype != LEFTSTRIP) {
7074 do {
7075 j--;
7076 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7077 j++;
7078 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007079
Benjamin Peterson857ce152009-01-31 16:29:18 +00007080 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7081 Py_INCREF(self);
7082 return (PyObject*)self;
7083 }
7084 else
7085 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007086}
7087
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007088
7089static PyObject *
7090do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7091{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007092 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007093
Benjamin Peterson857ce152009-01-31 16:29:18 +00007094 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7095 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007096
Benjamin Peterson857ce152009-01-31 16:29:18 +00007097 if (sep != NULL && sep != Py_None) {
7098 if (PyUnicode_Check(sep))
7099 return _PyUnicode_XStrip(self, striptype, sep);
7100 else if (PyString_Check(sep)) {
7101 PyObject *res;
7102 sep = PyUnicode_FromObject(sep);
7103 if (sep==NULL)
7104 return NULL;
7105 res = _PyUnicode_XStrip(self, striptype, sep);
7106 Py_DECREF(sep);
7107 return res;
7108 }
7109 else {
7110 PyErr_Format(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007111 "%s arg must be None, unicode or str",
7112 STRIPNAME(striptype));
Benjamin Peterson857ce152009-01-31 16:29:18 +00007113 return NULL;
7114 }
7115 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007116
Benjamin Peterson857ce152009-01-31 16:29:18 +00007117 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007118}
7119
7120
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007121PyDoc_STRVAR(strip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007122 "S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007123\n\
7124Return a copy of the string S with leading and trailing\n\
7125whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007126If chars is given and not None, remove characters in chars instead.\n\
7127If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007128
7129static PyObject *
7130unicode_strip(PyUnicodeObject *self, PyObject *args)
7131{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007132 if (PyTuple_GET_SIZE(args) == 0)
7133 return do_strip(self, BOTHSTRIP); /* Common case */
7134 else
7135 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007136}
7137
7138
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007139PyDoc_STRVAR(lstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007140 "S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007141\n\
7142Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007143If chars is given and not None, remove characters in chars instead.\n\
7144If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007145
7146static PyObject *
7147unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7148{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007149 if (PyTuple_GET_SIZE(args) == 0)
7150 return do_strip(self, LEFTSTRIP); /* Common case */
7151 else
7152 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007153}
7154
7155
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007156PyDoc_STRVAR(rstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007157 "S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007158\n\
7159Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007160If chars is given and not None, remove characters in chars instead.\n\
7161If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007162
7163static PyObject *
7164unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7165{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007166 if (PyTuple_GET_SIZE(args) == 0)
7167 return do_strip(self, RIGHTSTRIP); /* Common case */
7168 else
7169 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007170}
7171
7172
Guido van Rossumd57fd912000-03-10 22:53:23 +00007173static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007174unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007175{
7176 PyUnicodeObject *u;
7177 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007178 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007179 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007180
7181 if (len < 0)
7182 len = 0;
7183
Tim Peters7a29bd52001-09-12 03:03:31 +00007184 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007185 /* no repeat, return original string */
7186 Py_INCREF(str);
7187 return (PyObject*) str;
7188 }
Tim Peters8f422462000-09-09 06:13:41 +00007189
7190 /* ensure # of chars needed doesn't overflow int and # of bytes
7191 * needed doesn't overflow size_t
7192 */
7193 nchars = len * str->length;
7194 if (len && nchars / len != str->length) {
7195 PyErr_SetString(PyExc_OverflowError,
7196 "repeated string is too long");
7197 return NULL;
7198 }
7199 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7200 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7201 PyErr_SetString(PyExc_OverflowError,
7202 "repeated string is too long");
7203 return NULL;
7204 }
7205 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007206 if (!u)
7207 return NULL;
7208
7209 p = u->str;
7210
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007211 if (str->length == 1 && len > 0) {
7212 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007213 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007214 Py_ssize_t done = 0; /* number of characters copied this far */
7215 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007216 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007217 done = str->length;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007218 }
7219 while (done < nchars) {
Neal Norwitz4677fbf72008-03-25 04:18:18 +00007220 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007221 Py_UNICODE_COPY(p+done, p, n);
7222 done += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007223 }
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007224 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007225
7226 return (PyObject*) u;
7227}
7228
7229PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007230 PyObject *subobj,
7231 PyObject *replobj,
7232 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007233{
7234 PyObject *self;
7235 PyObject *str1;
7236 PyObject *str2;
7237 PyObject *result;
7238
7239 self = PyUnicode_FromObject(obj);
7240 if (self == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007241 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007242 str1 = PyUnicode_FromObject(subobj);
7243 if (str1 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007244 Py_DECREF(self);
7245 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007246 }
7247 str2 = PyUnicode_FromObject(replobj);
7248 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007249 Py_DECREF(self);
7250 Py_DECREF(str1);
7251 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007252 }
Tim Petersced69f82003-09-16 20:30:58 +00007253 result = replace((PyUnicodeObject *)self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007254 (PyUnicodeObject *)str1,
7255 (PyUnicodeObject *)str2,
7256 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007257 Py_DECREF(self);
7258 Py_DECREF(str1);
7259 Py_DECREF(str2);
7260 return result;
7261}
7262
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007263PyDoc_STRVAR(replace__doc__,
Ezio Melotti2f06b782010-06-26 18:44:42 +00007264 "S.replace(old, new[, count]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007265\n\
7266Return a copy of S with all occurrences of substring\n\
Georg Brandl30fadc12008-05-30 07:54:16 +00007267old replaced by new. If the optional argument count is\n\
7268given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007269
7270static PyObject*
7271unicode_replace(PyUnicodeObject *self, PyObject *args)
7272{
7273 PyUnicodeObject *str1;
7274 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007275 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007276 PyObject *result;
7277
Martin v. Löwis18e16552006-02-15 17:27:45 +00007278 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007279 return NULL;
7280 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7281 if (str1 == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007282 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007283 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007284 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007285 Py_DECREF(str1);
7286 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007287 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007288
7289 result = replace(self, str1, str2, maxcount);
7290
7291 Py_DECREF(str1);
7292 Py_DECREF(str2);
7293 return result;
7294}
7295
7296static
7297PyObject *unicode_repr(PyObject *unicode)
7298{
7299 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007300 PyUnicode_GET_SIZE(unicode),
7301 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007302}
7303
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007304PyDoc_STRVAR(rfind__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007305 "S.rfind(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007306\n\
7307Return the highest index in S where substring sub is found,\n\
Senthil Kumaran5e3a19d2011-07-27 23:36:51 +08007308such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007309arguments start and end are interpreted as in slice notation.\n\
7310\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007311Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007312
7313static PyObject *
7314unicode_rfind(PyUnicodeObject *self, PyObject *args)
7315{
Jesus Cea44e81682011-04-20 16:39:15 +02007316 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007317 Py_ssize_t start;
7318 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007319 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007320
Jesus Cea44e81682011-04-20 16:39:15 +02007321 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
7322 &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007323 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007324
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007325 result = stringlib_rfind_slice(
7326 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7327 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7328 start, end
7329 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007330
7331 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007332
7333 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007334}
7335
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007336PyDoc_STRVAR(rindex__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007337 "S.rindex(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007338\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007339Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007340
7341static PyObject *
7342unicode_rindex(PyUnicodeObject *self, PyObject *args)
7343{
Jesus Cea44e81682011-04-20 16:39:15 +02007344 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007345 Py_ssize_t start;
7346 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007347 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007348
Jesus Cea44e81682011-04-20 16:39:15 +02007349 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
7350 &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007351 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007352
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007353 result = stringlib_rfind_slice(
7354 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7355 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7356 start, end
7357 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007358
7359 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007360
Guido van Rossumd57fd912000-03-10 22:53:23 +00007361 if (result < 0) {
7362 PyErr_SetString(PyExc_ValueError, "substring not found");
7363 return NULL;
7364 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007365 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007366}
7367
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007368PyDoc_STRVAR(rjust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007369 "S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007370\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007371Return S right-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007372done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007373
7374static PyObject *
7375unicode_rjust(PyUnicodeObject *self, PyObject *args)
7376{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007377 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007378 Py_UNICODE fillchar = ' ';
7379
Martin v. Löwis412fb672006-04-13 06:34:32 +00007380 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007381 return NULL;
7382
Tim Peters7a29bd52001-09-12 03:03:31 +00007383 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007384 Py_INCREF(self);
7385 return (PyObject*) self;
7386 }
7387
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007388 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007389}
7390
Guido van Rossumd57fd912000-03-10 22:53:23 +00007391static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007392unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007393{
7394 /* standard clamping */
7395 if (start < 0)
7396 start = 0;
7397 if (end < 0)
7398 end = 0;
7399 if (end > self->length)
7400 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007401 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007402 /* full slice, return original string */
7403 Py_INCREF(self);
7404 return (PyObject*) self;
7405 }
7406 if (start > end)
7407 start = end;
7408 /* copy slice */
7409 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007410 end - start);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007411}
7412
7413PyObject *PyUnicode_Split(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007414 PyObject *sep,
7415 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007416{
7417 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007418
Guido van Rossumd57fd912000-03-10 22:53:23 +00007419 s = PyUnicode_FromObject(s);
7420 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007421 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007422 if (sep != NULL) {
7423 sep = PyUnicode_FromObject(sep);
7424 if (sep == NULL) {
7425 Py_DECREF(s);
7426 return NULL;
7427 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007428 }
7429
7430 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7431
7432 Py_DECREF(s);
7433 Py_XDECREF(sep);
7434 return result;
7435}
7436
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007437PyDoc_STRVAR(split__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007438 "S.split([sep [,maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007439\n\
7440Return a list of the words in S, using sep as the\n\
7441delimiter string. If maxsplit is given, at most maxsplit\n\
Georg Brandldfb77db2008-05-11 09:11:40 +00007442splits are done. If sep is not specified or is None, any\n\
Georg Brandlecbbd942008-05-11 20:53:55 +00007443whitespace string is a separator and empty strings are\n\
7444removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007445
7446static PyObject*
7447unicode_split(PyUnicodeObject *self, PyObject *args)
7448{
7449 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007450 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007451
Martin v. Löwis18e16552006-02-15 17:27:45 +00007452 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007453 return NULL;
7454
7455 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007456 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007457 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007458 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007459 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007460 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007461}
7462
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007463PyObject *
7464PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7465{
7466 PyObject* str_obj;
7467 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007468 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00007469
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007470 str_obj = PyUnicode_FromObject(str_in);
7471 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007472 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007473 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00007474 if (!sep_obj) {
7475 Py_DECREF(str_obj);
7476 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007477 }
7478
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007479 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00007480 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7481 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7482 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007483
Fredrik Lundhb9479482006-05-26 17:22:38 +00007484 Py_DECREF(sep_obj);
7485 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007486
7487 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007488}
7489
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007490
7491PyObject *
7492PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7493{
7494 PyObject* str_obj;
7495 PyObject* sep_obj;
7496 PyObject* out;
7497
7498 str_obj = PyUnicode_FromObject(str_in);
7499 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007500 return NULL;
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007501 sep_obj = PyUnicode_FromObject(sep_in);
7502 if (!sep_obj) {
7503 Py_DECREF(str_obj);
7504 return NULL;
7505 }
7506
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007507 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007508 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7509 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7510 );
7511
7512 Py_DECREF(sep_obj);
7513 Py_DECREF(str_obj);
7514
7515 return out;
7516}
7517
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007518PyDoc_STRVAR(partition__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007519 "S.partition(sep) -> (head, sep, tail)\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007520\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007521Search for the separator sep in S, and return the part before it,\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007522the separator itself, and the part after it. If the separator is not\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007523found, return S and two empty strings.");
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007524
7525static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00007526unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007527{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007528 return PyUnicode_Partition((PyObject *)self, separator);
7529}
7530
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007531PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti1fafaab2010-01-25 11:24:37 +00007532 "S.rpartition(sep) -> (head, sep, tail)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007533\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007534Search for the separator sep in S, starting at the end of S, and return\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007535the part before it, the separator itself, and the part after it. If the\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007536separator is not found, return two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007537
7538static PyObject*
7539unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7540{
7541 return PyUnicode_RPartition((PyObject *)self, separator);
7542}
7543
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007544PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007545 PyObject *sep,
7546 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007547{
7548 PyObject *result;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007549
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007550 s = PyUnicode_FromObject(s);
7551 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007552 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007553 if (sep != NULL) {
7554 sep = PyUnicode_FromObject(sep);
7555 if (sep == NULL) {
7556 Py_DECREF(s);
7557 return NULL;
7558 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007559 }
7560
7561 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7562
7563 Py_DECREF(s);
7564 Py_XDECREF(sep);
7565 return result;
7566}
7567
7568PyDoc_STRVAR(rsplit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007569 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007570\n\
7571Return a list of the words in S, using sep as the\n\
7572delimiter string, starting at the end of the string and\n\
7573working to the front. If maxsplit is given, at most maxsplit\n\
7574splits are done. If sep is not specified, any whitespace string\n\
7575is a separator.");
7576
7577static PyObject*
7578unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7579{
7580 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007581 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007582
Martin v. Löwis18e16552006-02-15 17:27:45 +00007583 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007584 return NULL;
7585
7586 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007587 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007588 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007589 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007590 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007591 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007592}
7593
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007594PyDoc_STRVAR(splitlines__doc__,
Raymond Hettingeraad5b022012-06-02 01:42:58 -04007595 "S.splitlines(keepends=False) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007596\n\
7597Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007598Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007599is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007600
7601static PyObject*
7602unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7603{
Guido van Rossum86662912000-04-11 15:38:46 +00007604 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007605
Guido van Rossum86662912000-04-11 15:38:46 +00007606 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007607 return NULL;
7608
Guido van Rossum86662912000-04-11 15:38:46 +00007609 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007610}
7611
7612static
7613PyObject *unicode_str(PyUnicodeObject *self)
7614{
Fred Drakee4315f52000-05-09 19:53:39 +00007615 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007616}
7617
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007618PyDoc_STRVAR(swapcase__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007619 "S.swapcase() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007620\n\
7621Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007622and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007623
7624static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007625unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007626{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007627 return fixup(self, fixswapcase);
7628}
7629
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007630PyDoc_STRVAR(translate__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007631 "S.translate(table) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007632\n\
7633Return a copy of the string S, where all characters have been mapped\n\
7634through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007635Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7636Unmapped characters are left untouched. Characters mapped to None\n\
7637are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007638
7639static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007640unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007641{
Tim Petersced69f82003-09-16 20:30:58 +00007642 return PyUnicode_TranslateCharmap(self->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007643 self->length,
7644 table,
7645 "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007646}
7647
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007648PyDoc_STRVAR(upper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007649 "S.upper() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007650\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007651Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007652
7653static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007654unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007655{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007656 return fixup(self, fixupper);
7657}
7658
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007659PyDoc_STRVAR(zfill__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007660 "S.zfill(width) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007661\n\
Georg Brandl98064072008-09-09 19:26:00 +00007662Pad a numeric string S with zeros on the left, to fill a field\n\
7663of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007664
7665static PyObject *
7666unicode_zfill(PyUnicodeObject *self, PyObject *args)
7667{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007668 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007669 PyUnicodeObject *u;
7670
Martin v. Löwis18e16552006-02-15 17:27:45 +00007671 Py_ssize_t width;
7672 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007673 return NULL;
7674
7675 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007676 if (PyUnicode_CheckExact(self)) {
7677 Py_INCREF(self);
7678 return (PyObject*) self;
7679 }
7680 else
7681 return PyUnicode_FromUnicode(
7682 PyUnicode_AS_UNICODE(self),
7683 PyUnicode_GET_SIZE(self)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007684 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007685 }
7686
7687 fill = width - self->length;
7688
7689 u = pad(self, fill, 0, '0');
7690
Walter Dörwald068325e2002-04-15 13:36:47 +00007691 if (u == NULL)
7692 return NULL;
7693
Guido van Rossumd57fd912000-03-10 22:53:23 +00007694 if (u->str[fill] == '+' || u->str[fill] == '-') {
7695 /* move sign to beginning of string */
7696 u->str[0] = u->str[fill];
7697 u->str[fill] = '0';
7698 }
7699
7700 return (PyObject*) u;
7701}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007702
#if 0
/* Compiled out.  Returns the current value of numfree as an int object. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyInt_FromLong(count);
}
#endif
7710
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007711PyDoc_STRVAR(startswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007712 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007713\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007714Return True if S starts with the specified prefix, False otherwise.\n\
7715With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007716With optional end, stop comparing S at that position.\n\
7717prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007718
7719static PyObject *
7720unicode_startswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007721 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007722{
Georg Brandl24250812006-06-09 18:45:48 +00007723 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007724 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007725 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007726 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007727 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007728
Jesus Cea44e81682011-04-20 16:39:15 +02007729 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007730 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007731 if (PyTuple_Check(subobj)) {
7732 Py_ssize_t i;
7733 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7734 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007735 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007736 if (substring == NULL)
7737 return NULL;
7738 result = tailmatch(self, substring, start, end, -1);
7739 Py_DECREF(substring);
7740 if (result) {
7741 Py_RETURN_TRUE;
7742 }
7743 }
7744 /* nothing matched */
7745 Py_RETURN_FALSE;
7746 }
7747 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottie3685f62011-04-26 05:12:51 +03007748 if (substring == NULL) {
7749 if (PyErr_ExceptionMatches(PyExc_TypeError))
7750 PyErr_Format(PyExc_TypeError, "startswith first arg must be str, "
7751 "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007752 return NULL;
Ezio Melottie3685f62011-04-26 05:12:51 +03007753 }
Georg Brandl24250812006-06-09 18:45:48 +00007754 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007755 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007756 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007757}
7758
7759
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007760PyDoc_STRVAR(endswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007761 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007762\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007763Return True if S ends with the specified suffix, False otherwise.\n\
7764With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007765With optional end, stop comparing S at that position.\n\
7766suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007767
7768static PyObject *
7769unicode_endswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007770 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007771{
Georg Brandl24250812006-06-09 18:45:48 +00007772 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007773 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007774 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007775 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007776 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007777
Jesus Cea44e81682011-04-20 16:39:15 +02007778 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007779 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007780 if (PyTuple_Check(subobj)) {
7781 Py_ssize_t i;
7782 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7783 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007784 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007785 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007786 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007787 result = tailmatch(self, substring, start, end, +1);
7788 Py_DECREF(substring);
7789 if (result) {
7790 Py_RETURN_TRUE;
7791 }
7792 }
7793 Py_RETURN_FALSE;
7794 }
7795 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottie3685f62011-04-26 05:12:51 +03007796 if (substring == NULL) {
7797 if (PyErr_ExceptionMatches(PyExc_TypeError))
7798 PyErr_Format(PyExc_TypeError, "endswith first arg must be str, "
7799 "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007800 return NULL;
Ezio Melottie3685f62011-04-26 05:12:51 +03007801 }
Georg Brandl24250812006-06-09 18:45:48 +00007802 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007803 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007804 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007805}
7806
7807
Eric Smitha9f7d622008-02-17 19:46:49 +00007808/* Implements do_string_format, which is unicode because of stringlib */
7809#include "stringlib/string_format.h"
7810
7811PyDoc_STRVAR(format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007812 "S.format(*args, **kwargs) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007813\n\
Eric Smith6c840852010-11-06 19:43:44 +00007814Return a formatted version of S, using substitutions from args and kwargs.\n\
7815The substitutions are identified by braces ('{' and '}').");
Eric Smitha9f7d622008-02-17 19:46:49 +00007816
Eric Smithdc13b792008-05-30 18:10:04 +00007817static PyObject *
7818unicode__format__(PyObject *self, PyObject *args)
7819{
7820 PyObject *format_spec;
7821 PyObject *result = NULL;
7822 PyObject *tmp = NULL;
7823
7824 /* If 2.x, convert format_spec to the same type as value */
7825 /* This is to allow things like u''.format('') */
7826 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7827 goto done;
7828 if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7829 PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007830 "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
Eric Smithdc13b792008-05-30 18:10:04 +00007831 goto done;
7832 }
7833 tmp = PyObject_Unicode(format_spec);
7834 if (tmp == NULL)
7835 goto done;
7836 format_spec = tmp;
7837
7838 result = _PyUnicode_FormatAdvanced(self,
7839 PyUnicode_AS_UNICODE(format_spec),
7840 PyUnicode_GET_SIZE(format_spec));
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007841 done:
Eric Smithdc13b792008-05-30 18:10:04 +00007842 Py_XDECREF(tmp);
7843 return result;
7844}
7845
Eric Smitha9f7d622008-02-17 19:46:49 +00007846PyDoc_STRVAR(p_format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007847 "S.__format__(format_spec) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007848\n\
Eric Smith6c840852010-11-06 19:43:44 +00007849Return a formatted version of S as described by format_spec.");
Eric Smitha9f7d622008-02-17 19:46:49 +00007850
Robert Schuppenies901c9972008-06-10 10:10:31 +00007851static PyObject *
7852unicode__sizeof__(PyUnicodeObject *v)
7853{
Robert Schuppenies9be2ec12008-07-10 15:24:04 +00007854 return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7855 sizeof(Py_UNICODE) * (v->length + 1));
Robert Schuppenies901c9972008-06-10 10:10:31 +00007856}
7857
7858PyDoc_STRVAR(sizeof__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007859 "S.__sizeof__() -> size of S in memory, in bytes\n\
Robert Schuppenies901c9972008-06-10 10:10:31 +00007860\n\
7861");
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007862
7863static PyObject *
7864unicode_getnewargs(PyUnicodeObject *v)
7865{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007866 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007867}
7868
7869
Guido van Rossumd57fd912000-03-10 22:53:23 +00007870static PyMethodDef unicode_methods[] = {
Benjamin Peterson332d7212009-09-18 21:14:55 +00007871 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007872 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7873 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007874 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007875 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7876 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7877 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7878 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7879 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7880 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7881 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00007882 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007883 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7884 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7885 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007886 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Benjamin Peterson332d7212009-09-18 21:14:55 +00007887 {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007888/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7889 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7890 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7891 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007892 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007893 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007894 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007895 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007896 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7897 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7898 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7899 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7900 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7901 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7902 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7903 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7904 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7905 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7906 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7907 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7908 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7909 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007910 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smitha9f7d622008-02-17 19:46:49 +00007911 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
7912 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
7913 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
7914 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Robert Schuppenies901c9972008-06-10 10:10:31 +00007915 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007916#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007917 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007918#endif
7919
7920#if 0
7921 /* This one is just used for debugging the implementation. */
Christian Heimes5b970ad2008-02-06 13:33:44 +00007922 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007923#endif
7924
Benjamin Peterson857ce152009-01-31 16:29:18 +00007925 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007926 {NULL, NULL}
7927};
7928
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007929static PyObject *
7930unicode_mod(PyObject *v, PyObject *w)
7931{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007932 if (!PyUnicode_Check(v)) {
7933 Py_INCREF(Py_NotImplemented);
7934 return Py_NotImplemented;
7935 }
7936 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007937}
7938
7939static PyNumberMethods unicode_as_number = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007940 0, /*nb_add*/
7941 0, /*nb_subtract*/
7942 0, /*nb_multiply*/
7943 0, /*nb_divide*/
7944 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007945};
7946
Guido van Rossumd57fd912000-03-10 22:53:23 +00007947static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007948 (lenfunc) unicode_length, /* sq_length */
7949 PyUnicode_Concat, /* sq_concat */
7950 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7951 (ssizeargfunc) unicode_getitem, /* sq_item */
7952 (ssizessizeargfunc) unicode_slice, /* sq_slice */
7953 0, /* sq_ass_item */
7954 0, /* sq_ass_slice */
7955 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007956};
7957
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007958static PyObject*
7959unicode_subscript(PyUnicodeObject* self, PyObject* item)
7960{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00007961 if (PyIndex_Check(item)) {
7962 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007963 if (i == -1 && PyErr_Occurred())
7964 return NULL;
7965 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007966 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007967 return unicode_getitem(self, i);
7968 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007969 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007970 Py_UNICODE* source_buf;
7971 Py_UNICODE* result_buf;
7972 PyObject* result;
7973
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007974 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007975 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007976 return NULL;
7977 }
7978
7979 if (slicelength <= 0) {
7980 return PyUnicode_FromUnicode(NULL, 0);
Thomas Wouters3ccec682007-08-28 15:28:19 +00007981 } else if (start == 0 && step == 1 && slicelength == self->length &&
7982 PyUnicode_CheckExact(self)) {
7983 Py_INCREF(self);
7984 return (PyObject *)self;
7985 } else if (step == 1) {
7986 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007987 } else {
7988 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Neal Norwitz419fd492008-03-17 20:22:43 +00007989 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
7990 sizeof(Py_UNICODE));
Benjamin Peterson857ce152009-01-31 16:29:18 +00007991
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007992 if (result_buf == NULL)
7993 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007994
7995 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7996 result_buf[i] = source_buf[cur];
7997 }
Tim Petersced69f82003-09-16 20:30:58 +00007998
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007999 result = PyUnicode_FromUnicode(result_buf, slicelength);
Neal Norwitz419fd492008-03-17 20:22:43 +00008000 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008001 return result;
8002 }
8003 } else {
8004 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8005 return NULL;
8006 }
8007}
8008
8009static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00008010 (lenfunc)unicode_length, /* mp_length */
8011 (binaryfunc)unicode_subscript, /* mp_subscript */
8012 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008013};
8014
Martin v. Löwis18e16552006-02-15 17:27:45 +00008015static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008016unicode_buffer_getreadbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008017 Py_ssize_t index,
8018 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008019{
8020 if (index != 0) {
8021 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008022 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008023 return -1;
8024 }
8025 *ptr = (void *) self->str;
8026 return PyUnicode_GET_DATA_SIZE(self);
8027}
8028
Martin v. Löwis18e16552006-02-15 17:27:45 +00008029static Py_ssize_t
8030unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008031 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008032{
8033 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008034 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008035 return -1;
8036}
8037
8038static int
8039unicode_buffer_getsegcount(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008040 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008041{
8042 if (lenp)
8043 *lenp = PyUnicode_GET_DATA_SIZE(self);
8044 return 1;
8045}
8046
Martin v. Löwiseb079f12006-02-16 14:32:27 +00008047static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008048unicode_buffer_getcharbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008049 Py_ssize_t index,
8050 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008051{
8052 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00008053
Guido van Rossumd57fd912000-03-10 22:53:23 +00008054 if (index != 0) {
8055 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008056 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008057 return -1;
8058 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008059 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008060 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008061 return -1;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008062 *ptr = (void *) PyString_AS_STRING(str);
8063 return PyString_GET_SIZE(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008064}
8065
8066/* Helpers for PyUnicode_Format() */
8067
8068static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008069getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008070{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008071 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008072 if (argidx < arglen) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008073 (*p_argidx)++;
8074 if (arglen < 0)
8075 return args;
8076 else
8077 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008078 }
8079 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008080 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008081 return NULL;
8082}
8083
/* Bit flags collected while parsing a format specifier. */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008089
Martin v. Löwis18e16552006-02-15 17:27:45 +00008090static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008091strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008092{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008093 register Py_ssize_t i;
8094 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008095 for (i = len - 1; i >= 0; i--)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008096 buffer[i] = (Py_UNICODE) charbuffer[i];
Guido van Rossumd57fd912000-03-10 22:53:23 +00008097
Guido van Rossumd57fd912000-03-10 22:53:23 +00008098 return len;
8099}
8100
Neal Norwitzfc76d632006-01-10 06:03:13 +00008101static int
Neal Norwitzfc76d632006-01-10 06:03:13 +00008102longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8103{
Tim Peters15231542006-02-16 01:08:01 +00008104 Py_ssize_t result;
8105
Neal Norwitzfc76d632006-01-10 06:03:13 +00008106 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008107 result = strtounicode(buffer, (char *)buffer);
8108 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008109}
8110
Guido van Rossum078151d2002-08-11 04:24:12 +00008111/* XXX To save some code duplication, formatfloat/long/int could have been
8112 shared with stringobject.c, converting from 8-bit to Unicode after the
8113 formatting is done. */
8114
Mark Dickinson18cfada2009-11-23 18:46:41 +00008115/* Returns a new reference to a PyUnicode object, or NULL on failure. */
8116
8117static PyObject *
8118formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008119{
Mark Dickinson18cfada2009-11-23 18:46:41 +00008120 char *p;
8121 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008122 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008123
Guido van Rossumd57fd912000-03-10 22:53:23 +00008124 x = PyFloat_AsDouble(v);
8125 if (x == -1.0 && PyErr_Occurred())
Mark Dickinson18cfada2009-11-23 18:46:41 +00008126 return NULL;
8127
Guido van Rossumd57fd912000-03-10 22:53:23 +00008128 if (prec < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008129 prec = 6;
Mark Dickinsond4814bf2009-03-29 16:24:29 +00008130
Mark Dickinson18cfada2009-11-23 18:46:41 +00008131 p = PyOS_double_to_string(x, type, prec,
8132 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
8133 if (p == NULL)
8134 return NULL;
8135 result = PyUnicode_FromStringAndSize(p, strlen(p));
8136 PyMem_Free(p);
8137 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008138}
8139
Tim Peters38fd5b62000-09-21 05:43:11 +00008140static PyObject*
8141formatlong(PyObject *val, int flags, int prec, int type)
8142{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008143 char *buf;
8144 int i, len;
8145 PyObject *str; /* temporary string object. */
8146 PyUnicodeObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008147
Benjamin Peterson857ce152009-01-31 16:29:18 +00008148 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8149 if (!str)
8150 return NULL;
8151 result = _PyUnicode_New(len);
8152 if (!result) {
8153 Py_DECREF(str);
8154 return NULL;
8155 }
8156 for (i = 0; i < len; i++)
8157 result->str[i] = buf[i];
8158 result->str[len] = 0;
8159 Py_DECREF(str);
8160 return (PyObject*)result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008161}
8162
Guido van Rossumd57fd912000-03-10 22:53:23 +00008163static int
8164formatint(Py_UNICODE *buf,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008165 size_t buflen,
8166 int flags,
8167 int prec,
8168 int type,
8169 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008170{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008171 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008172 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8173 * + 1 + 1
8174 * = 24
8175 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008176 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008177 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008178 long x;
8179
8180 x = PyInt_AsLong(v);
8181 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008182 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008183 if (x < 0 && type == 'u') {
8184 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008185 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008186 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8187 sign = "-";
8188 else
8189 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008190 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008191 prec = 1;
8192
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008193 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8194 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008195 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008196 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008197 PyErr_SetString(PyExc_OverflowError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008198 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008199 return -1;
8200 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008201
8202 if ((flags & F_ALT) &&
8203 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00008204 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008205 * of issues that cause pain:
8206 * - when 0 is being converted, the C standard leaves off
8207 * the '0x' or '0X', which is inconsistent with other
8208 * %#x/%#X conversions and inconsistent with Python's
8209 * hex() function
8210 * - there are platforms that violate the standard and
8211 * convert 0 with the '0x' or '0X'
8212 * (Metrowerks, Compaq Tru64)
8213 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008214 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008215 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008216 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008217 * We can achieve the desired consistency by inserting our
8218 * own '0x' or '0X' prefix, and substituting %x/%X in place
8219 * of %#x/%#X.
8220 *
8221 * Note that this is the same approach as used in
8222 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008223 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008224 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8225 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008226 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008227 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008228 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8229 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008230 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008231 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008232 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008233 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008234 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008235 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008236}
8237
8238static int
8239formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008240 size_t buflen,
8241 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008242{
Ezio Melotti32125152010-02-25 17:36:04 +00008243 PyObject *unistr;
8244 char *str;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008245 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008246 if (PyUnicode_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008247 if (PyUnicode_GET_SIZE(v) != 1)
8248 goto onError;
8249 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008250 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008251
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008252 else if (PyString_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008253 if (PyString_GET_SIZE(v) != 1)
8254 goto onError;
Ezio Melotti32125152010-02-25 17:36:04 +00008255 /* #7649: "u'%c' % char" should behave like "u'%s' % char" and fail
8256 with a UnicodeDecodeError if 'char' is not decodable with the
8257 default encoding (usually ASCII, but it might be something else) */
8258 str = PyString_AS_STRING(v);
8259 if ((unsigned char)str[0] > 0x7F) {
8260 /* the char is not ASCII; try to decode the string using the
8261 default encoding and return -1 to let the UnicodeDecodeError
8262 be raised if the string can't be decoded */
8263 unistr = PyUnicode_Decode(str, 1, NULL, "strict");
8264 if (unistr == NULL)
8265 return -1;
8266 buf[0] = PyUnicode_AS_UNICODE(unistr)[0];
8267 Py_DECREF(unistr);
8268 }
8269 else
8270 buf[0] = (Py_UNICODE)str[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008271 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008272
8273 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008274 /* Integer input truncated to a character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008275 long x;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008276 x = PyInt_AsLong(v);
8277 if (x == -1 && PyErr_Occurred())
8278 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008279#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008280 if (x < 0 || x > 0x10ffff) {
8281 PyErr_SetString(PyExc_OverflowError,
8282 "%c arg not in range(0x110000) "
8283 "(wide Python build)");
8284 return -1;
8285 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008286#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008287 if (x < 0 || x > 0xffff) {
8288 PyErr_SetString(PyExc_OverflowError,
8289 "%c arg not in range(0x10000) "
8290 "(narrow Python build)");
8291 return -1;
8292 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008293#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008294 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008295 }
8296 buf[1] = '\0';
8297 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008298
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008299 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008300 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008301 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008302 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008303}
8304
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the ints &
   chars are formatted. XXX This is a magic number. Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format. For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the macro behaves as a
   single operand wherever it is used (for example after `sizeof`).
*/
#define FORMATBUFLEN ((size_t)120)
8314
Guido van Rossumd57fd912000-03-10 22:53:23 +00008315PyObject *PyUnicode_Format(PyObject *format,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008316 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008317{
8318 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008319 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008320 int args_owned = 0;
8321 PyUnicodeObject *result = NULL;
8322 PyObject *dict = NULL;
8323 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008324
Guido van Rossumd57fd912000-03-10 22:53:23 +00008325 if (format == NULL || args == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008326 PyErr_BadInternalCall();
8327 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008328 }
8329 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008330 if (uformat == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008331 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008332 fmt = PyUnicode_AS_UNICODE(uformat);
8333 fmtcnt = PyUnicode_GET_SIZE(uformat);
8334
8335 reslen = rescnt = fmtcnt + 100;
8336 result = _PyUnicode_New(reslen);
8337 if (result == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008338 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008339 res = PyUnicode_AS_UNICODE(result);
8340
8341 if (PyTuple_Check(args)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008342 arglen = PyTuple_Size(args);
8343 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008344 }
8345 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008346 arglen = -1;
8347 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008348 }
Benjamin Petersonda2c7eb2013-03-23 22:32:00 -05008349 if (Py_TYPE(args)->tp_as_mapping && Py_TYPE(args)->tp_as_mapping->mp_subscript &&
8350 !PyTuple_Check(args) && !PyObject_TypeCheck(args, &PyBaseString_Type))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008351 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008352
8353 while (--fmtcnt >= 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008354 if (*fmt != '%') {
8355 if (--rescnt < 0) {
8356 rescnt = fmtcnt + 100;
8357 reslen += rescnt;
8358 if (_PyUnicode_Resize(&result, reslen) < 0)
8359 goto onError;
8360 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8361 --rescnt;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008362 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008363 *res++ = *fmt++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008364 }
8365 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008366 /* Got a format specifier */
8367 int flags = 0;
8368 Py_ssize_t width = -1;
8369 int prec = -1;
8370 Py_UNICODE c = '\0';
8371 Py_UNICODE fill;
8372 int isnumok;
8373 PyObject *v = NULL;
8374 PyObject *temp = NULL;
8375 Py_UNICODE *pbuf;
8376 Py_UNICODE sign;
8377 Py_ssize_t len;
Mark Dickinson18cfada2009-11-23 18:46:41 +00008378 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008379
8380 fmt++;
8381 if (*fmt == '(') {
8382 Py_UNICODE *keystart;
8383 Py_ssize_t keylen;
8384 PyObject *key;
8385 int pcount = 1;
8386
8387 if (dict == NULL) {
8388 PyErr_SetString(PyExc_TypeError,
8389 "format requires a mapping");
8390 goto onError;
8391 }
8392 ++fmt;
8393 --fmtcnt;
8394 keystart = fmt;
8395 /* Skip over balanced parentheses */
8396 while (pcount > 0 && --fmtcnt >= 0) {
8397 if (*fmt == ')')
8398 --pcount;
8399 else if (*fmt == '(')
8400 ++pcount;
8401 fmt++;
8402 }
8403 keylen = fmt - keystart - 1;
8404 if (fmtcnt < 0 || pcount > 0) {
8405 PyErr_SetString(PyExc_ValueError,
8406 "incomplete format key");
8407 goto onError;
8408 }
8409#if 0
8410 /* keys are converted to strings using UTF-8 and
8411 then looked up since Python uses strings to hold
8412 variables names etc. in its namespaces and we
8413 wouldn't want to break common idioms. */
8414 key = PyUnicode_EncodeUTF8(keystart,
8415 keylen,
8416 NULL);
8417#else
8418 key = PyUnicode_FromUnicode(keystart, keylen);
8419#endif
8420 if (key == NULL)
8421 goto onError;
8422 if (args_owned) {
8423 Py_DECREF(args);
8424 args_owned = 0;
8425 }
8426 args = PyObject_GetItem(dict, key);
8427 Py_DECREF(key);
8428 if (args == NULL) {
8429 goto onError;
8430 }
8431 args_owned = 1;
8432 arglen = -1;
8433 argidx = -2;
8434 }
8435 while (--fmtcnt >= 0) {
8436 switch (c = *fmt++) {
8437 case '-': flags |= F_LJUST; continue;
8438 case '+': flags |= F_SIGN; continue;
8439 case ' ': flags |= F_BLANK; continue;
8440 case '#': flags |= F_ALT; continue;
8441 case '0': flags |= F_ZERO; continue;
8442 }
8443 break;
8444 }
8445 if (c == '*') {
8446 v = getnextarg(args, arglen, &argidx);
8447 if (v == NULL)
8448 goto onError;
8449 if (!PyInt_Check(v)) {
8450 PyErr_SetString(PyExc_TypeError,
8451 "* wants int");
8452 goto onError;
8453 }
Serhiy Storchaka74f49ab2013-01-19 12:55:39 +02008454 width = PyInt_AsSsize_t(v);
8455 if (width == -1 && PyErr_Occurred())
8456 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008457 if (width < 0) {
8458 flags |= F_LJUST;
8459 width = -width;
8460 }
8461 if (--fmtcnt >= 0)
8462 c = *fmt++;
8463 }
8464 else if (c >= '0' && c <= '9') {
8465 width = c - '0';
8466 while (--fmtcnt >= 0) {
8467 c = *fmt++;
8468 if (c < '0' || c > '9')
8469 break;
Mark Dickinson75d36002012-10-28 10:00:46 +00008470 if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008471 PyErr_SetString(PyExc_ValueError,
8472 "width too big");
8473 goto onError;
8474 }
8475 width = width*10 + (c - '0');
8476 }
8477 }
8478 if (c == '.') {
8479 prec = 0;
8480 if (--fmtcnt >= 0)
8481 c = *fmt++;
8482 if (c == '*') {
8483 v = getnextarg(args, arglen, &argidx);
8484 if (v == NULL)
8485 goto onError;
8486 if (!PyInt_Check(v)) {
8487 PyErr_SetString(PyExc_TypeError,
8488 "* wants int");
8489 goto onError;
8490 }
Serhiy Storchaka74f49ab2013-01-19 12:55:39 +02008491 prec = _PyInt_AsInt(v);
8492 if (prec == -1 && PyErr_Occurred())
8493 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008494 if (prec < 0)
8495 prec = 0;
8496 if (--fmtcnt >= 0)
8497 c = *fmt++;
8498 }
8499 else if (c >= '0' && c <= '9') {
8500 prec = c - '0';
8501 while (--fmtcnt >= 0) {
Stefan Krah0b9201f2010-07-19 18:06:46 +00008502 c = *fmt++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008503 if (c < '0' || c > '9')
8504 break;
Mark Dickinson75d36002012-10-28 10:00:46 +00008505 if (prec > (INT_MAX - ((int)c - '0')) / 10) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008506 PyErr_SetString(PyExc_ValueError,
8507 "prec too big");
8508 goto onError;
8509 }
8510 prec = prec*10 + (c - '0');
8511 }
8512 }
8513 } /* prec */
8514 if (fmtcnt >= 0) {
8515 if (c == 'h' || c == 'l' || c == 'L') {
8516 if (--fmtcnt >= 0)
8517 c = *fmt++;
8518 }
8519 }
8520 if (fmtcnt < 0) {
8521 PyErr_SetString(PyExc_ValueError,
8522 "incomplete format");
8523 goto onError;
8524 }
8525 if (c != '%') {
8526 v = getnextarg(args, arglen, &argidx);
8527 if (v == NULL)
8528 goto onError;
8529 }
8530 sign = 0;
8531 fill = ' ';
8532 switch (c) {
8533
8534 case '%':
8535 pbuf = formatbuf;
8536 /* presume that buffer length is at least 1 */
8537 pbuf[0] = '%';
8538 len = 1;
8539 break;
8540
8541 case 's':
8542 case 'r':
Victor Stinner95affc42010-03-22 12:24:37 +00008543 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008544 temp = v;
8545 Py_INCREF(temp);
8546 }
8547 else {
8548 PyObject *unicode;
8549 if (c == 's')
8550 temp = PyObject_Unicode(v);
8551 else
8552 temp = PyObject_Repr(v);
8553 if (temp == NULL)
8554 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008555 if (PyUnicode_Check(temp))
8556 /* nothing to do */;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008557 else if (PyString_Check(temp)) {
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008558 /* convert to string to Unicode */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008559 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8560 PyString_GET_SIZE(temp),
8561 NULL,
8562 "strict");
8563 Py_DECREF(temp);
8564 temp = unicode;
8565 if (temp == NULL)
8566 goto onError;
8567 }
8568 else {
8569 Py_DECREF(temp);
8570 PyErr_SetString(PyExc_TypeError,
8571 "%s argument has non-string str()");
8572 goto onError;
8573 }
8574 }
8575 pbuf = PyUnicode_AS_UNICODE(temp);
8576 len = PyUnicode_GET_SIZE(temp);
8577 if (prec >= 0 && len > prec)
8578 len = prec;
8579 break;
8580
8581 case 'i':
8582 case 'd':
8583 case 'u':
8584 case 'o':
8585 case 'x':
8586 case 'X':
8587 if (c == 'i')
8588 c = 'd';
8589 isnumok = 0;
8590 if (PyNumber_Check(v)) {
8591 PyObject *iobj=NULL;
8592
8593 if (PyInt_Check(v) || (PyLong_Check(v))) {
8594 iobj = v;
8595 Py_INCREF(iobj);
8596 }
8597 else {
8598 iobj = PyNumber_Int(v);
8599 if (iobj==NULL) iobj = PyNumber_Long(v);
8600 }
8601 if (iobj!=NULL) {
8602 if (PyInt_Check(iobj)) {
8603 isnumok = 1;
8604 pbuf = formatbuf;
8605 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8606 flags, prec, c, iobj);
8607 Py_DECREF(iobj);
8608 if (len < 0)
8609 goto onError;
8610 sign = 1;
8611 }
8612 else if (PyLong_Check(iobj)) {
8613 isnumok = 1;
8614 temp = formatlong(iobj, flags, prec, c);
8615 Py_DECREF(iobj);
8616 if (!temp)
8617 goto onError;
8618 pbuf = PyUnicode_AS_UNICODE(temp);
8619 len = PyUnicode_GET_SIZE(temp);
8620 sign = 1;
8621 }
8622 else {
8623 Py_DECREF(iobj);
8624 }
8625 }
8626 }
8627 if (!isnumok) {
8628 PyErr_Format(PyExc_TypeError,
8629 "%%%c format: a number is required, "
8630 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8631 goto onError;
8632 }
8633 if (flags & F_ZERO)
8634 fill = '0';
8635 break;
8636
8637 case 'e':
8638 case 'E':
8639 case 'f':
8640 case 'F':
8641 case 'g':
8642 case 'G':
Mark Dickinson18cfada2009-11-23 18:46:41 +00008643 temp = formatfloat(v, flags, prec, c);
8644 if (temp == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008645 goto onError;
Mark Dickinson18cfada2009-11-23 18:46:41 +00008646 pbuf = PyUnicode_AS_UNICODE(temp);
8647 len = PyUnicode_GET_SIZE(temp);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008648 sign = 1;
8649 if (flags & F_ZERO)
8650 fill = '0';
8651 break;
8652
8653 case 'c':
8654 pbuf = formatbuf;
8655 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8656 if (len < 0)
8657 goto onError;
8658 break;
8659
8660 default:
8661 PyErr_Format(PyExc_ValueError,
8662 "unsupported format character '%c' (0x%x) "
8663 "at index %zd",
8664 (31<=c && c<=126) ? (char)c : '?',
8665 (int)c,
8666 (Py_ssize_t)(fmt - 1 -
8667 PyUnicode_AS_UNICODE(uformat)));
Benjamin Peterson857ce152009-01-31 16:29:18 +00008668 goto onError;
8669 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008670 if (sign) {
8671 if (*pbuf == '-' || *pbuf == '+') {
8672 sign = *pbuf++;
8673 len--;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008674 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008675 else if (flags & F_SIGN)
8676 sign = '+';
8677 else if (flags & F_BLANK)
8678 sign = ' ';
8679 else
8680 sign = 0;
8681 }
8682 if (width < len)
8683 width = len;
8684 if (rescnt - (sign != 0) < width) {
8685 reslen -= rescnt;
8686 rescnt = width + fmtcnt + 100;
8687 reslen += rescnt;
8688 if (reslen < 0) {
8689 Py_XDECREF(temp);
8690 PyErr_NoMemory();
8691 goto onError;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008692 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008693 if (_PyUnicode_Resize(&result, reslen) < 0) {
8694 Py_XDECREF(temp);
8695 goto onError;
8696 }
8697 res = PyUnicode_AS_UNICODE(result)
8698 + reslen - rescnt;
8699 }
8700 if (sign) {
8701 if (fill != ' ')
8702 *res++ = sign;
8703 rescnt--;
8704 if (width > len)
8705 width--;
8706 }
8707 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8708 assert(pbuf[0] == '0');
8709 assert(pbuf[1] == c);
8710 if (fill != ' ') {
8711 *res++ = *pbuf++;
8712 *res++ = *pbuf++;
8713 }
8714 rescnt -= 2;
8715 width -= 2;
8716 if (width < 0)
8717 width = 0;
8718 len -= 2;
8719 }
8720 if (width > len && !(flags & F_LJUST)) {
8721 do {
8722 --rescnt;
8723 *res++ = fill;
8724 } while (--width > len);
8725 }
8726 if (fill == ' ') {
8727 if (sign)
8728 *res++ = sign;
8729 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8730 assert(pbuf[0] == '0');
8731 assert(pbuf[1] == c);
8732 *res++ = *pbuf++;
8733 *res++ = *pbuf++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008734 }
8735 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008736 Py_UNICODE_COPY(res, pbuf, len);
8737 res += len;
8738 rescnt -= len;
8739 while (--width >= len) {
8740 --rescnt;
8741 *res++ = ' ';
8742 }
8743 if (dict && (argidx < arglen) && c != '%') {
8744 PyErr_SetString(PyExc_TypeError,
8745 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008746 Py_XDECREF(temp);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008747 goto onError;
8748 }
8749 Py_XDECREF(temp);
8750 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008751 } /* until end */
8752 if (argidx < arglen && !dict) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008753 PyErr_SetString(PyExc_TypeError,
8754 "not all arguments converted during string formatting");
8755 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008756 }
8757
Thomas Woutersa96affe2006-03-12 00:29:36 +00008758 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008759 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008760 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008761 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008762 }
8763 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008764 return (PyObject *)result;
8765
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008766 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008767 Py_XDECREF(result);
8768 Py_DECREF(uformat);
8769 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008770 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008771 }
8772 return NULL;
8773}
8774
8775static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008776 (readbufferproc) unicode_buffer_getreadbuf,
8777 (writebufferproc) unicode_buffer_getwritebuf,
8778 (segcountproc) unicode_buffer_getsegcount,
8779 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008780};
8781
Jeremy Hylton938ace62002-07-17 16:30:39 +00008782static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008783unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8784
Tim Peters6d6c1a32001-08-02 04:15:00 +00008785static PyObject *
8786unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8787{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008788 PyObject *x = NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008789 static char *kwlist[] = {"string", "encoding", "errors", 0};
8790 char *encoding = NULL;
8791 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00008792
Benjamin Peterson857ce152009-01-31 16:29:18 +00008793 if (type != &PyUnicode_Type)
8794 return unicode_subtype_new(type, args, kwds);
8795 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008796 kwlist, &x, &encoding, &errors))
Benjamin Peterson857ce152009-01-31 16:29:18 +00008797 return NULL;
8798 if (x == NULL)
8799 return (PyObject *)_PyUnicode_New(0);
8800 if (encoding == NULL && errors == NULL)
8801 return PyObject_Unicode(x);
8802 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008803 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008804}
8805
Guido van Rossume023fe02001-08-30 03:12:59 +00008806static PyObject *
8807unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8808{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008809 PyUnicodeObject *tmp, *pnew;
8810 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008811
Benjamin Peterson857ce152009-01-31 16:29:18 +00008812 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8813 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8814 if (tmp == NULL)
8815 return NULL;
8816 assert(PyUnicode_Check(tmp));
8817 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8818 if (pnew == NULL) {
8819 Py_DECREF(tmp);
8820 return NULL;
8821 }
8822 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8823 if (pnew->str == NULL) {
8824 _Py_ForgetReference((PyObject *)pnew);
8825 PyObject_Del(pnew);
8826 Py_DECREF(tmp);
8827 return PyErr_NoMemory();
8828 }
8829 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8830 pnew->length = n;
8831 pnew->hash = tmp->hash;
8832 Py_DECREF(tmp);
8833 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008834}
8835
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008836PyDoc_STRVAR(unicode_doc,
Chris Jerdonekad4b0002012-10-07 20:37:54 -07008837 "unicode(object='') -> unicode object\n\
8838unicode(string[, encoding[, errors]]) -> unicode object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00008839\n\
8840Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008841encoding defaults to the current default string encoding.\n\
8842errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008843
Guido van Rossumd57fd912000-03-10 22:53:23 +00008844PyTypeObject PyUnicode_Type = {
Martin v. Löwis68192102007-07-21 06:55:02 +00008845 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00008846 "unicode", /* tp_name */
8847 sizeof(PyUnicodeObject), /* tp_size */
8848 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008849 /* Slots */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008850 (destructor)unicode_dealloc, /* tp_dealloc */
8851 0, /* tp_print */
8852 0, /* tp_getattr */
8853 0, /* tp_setattr */
8854 0, /* tp_compare */
8855 unicode_repr, /* tp_repr */
8856 &unicode_as_number, /* tp_as_number */
8857 &unicode_as_sequence, /* tp_as_sequence */
8858 &unicode_as_mapping, /* tp_as_mapping */
8859 (hashfunc) unicode_hash, /* tp_hash*/
8860 0, /* tp_call*/
8861 (reprfunc) unicode_str, /* tp_str */
8862 PyObject_GenericGetAttr, /* tp_getattro */
8863 0, /* tp_setattro */
8864 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008865 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008866 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008867 unicode_doc, /* tp_doc */
8868 0, /* tp_traverse */
8869 0, /* tp_clear */
8870 PyUnicode_RichCompare, /* tp_richcompare */
8871 0, /* tp_weaklistoffset */
8872 0, /* tp_iter */
8873 0, /* tp_iternext */
8874 unicode_methods, /* tp_methods */
8875 0, /* tp_members */
8876 0, /* tp_getset */
8877 &PyBaseString_Type, /* tp_base */
8878 0, /* tp_dict */
8879 0, /* tp_descr_get */
8880 0, /* tp_descr_set */
8881 0, /* tp_dictoffset */
8882 0, /* tp_init */
8883 0, /* tp_alloc */
8884 unicode_new, /* tp_new */
8885 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008886};
8887
8888/* Initialize the Unicode implementation */
8889
Thomas Wouters78890102000-07-22 19:25:51 +00008890void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008891{
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008892 /* XXX - move this array to unicodectype.c ? */
8893 Py_UNICODE linebreak[] = {
8894 0x000A, /* LINE FEED */
8895 0x000D, /* CARRIAGE RETURN */
8896 0x001C, /* FILE SEPARATOR */
8897 0x001D, /* GROUP SEPARATOR */
8898 0x001E, /* RECORD SEPARATOR */
8899 0x0085, /* NEXT LINE */
8900 0x2028, /* LINE SEPARATOR */
8901 0x2029, /* PARAGRAPH SEPARATOR */
8902 };
8903
Fred Drakee4315f52000-05-09 19:53:39 +00008904 /* Init the implementation */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02008905 if (!unicode_empty) {
8906 unicode_empty = _PyUnicode_New(0);
8907 if (!unicode_empty)
8908 return;
8909 }
Neal Norwitze1fdb322006-07-21 05:32:28 +00008910
Guido van Rossumcacfc072002-05-24 19:01:59 +00008911 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008912 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008913
8914 /* initialize the linebreak bloom filter */
8915 bloom_linebreak = make_bloom_mask(
8916 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8917 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00008918
8919 PyType_Ready(&EncodingMapType);
Benjamin Peterson6da3ed62012-10-30 23:21:10 -04008920
8921 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
8922 Py_FatalError("Can't initialize field name iterator type");
8923
8924 if (PyType_Ready(&PyFormatterIter_Type) < 0)
8925 Py_FatalError("Can't initialize formatter iter type");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008926}
8927
8928/* Finalize the Unicode implementation */
8929
Christian Heimes3b718a72008-02-14 12:47:33 +00008930int
8931PyUnicode_ClearFreeList(void)
8932{
8933 int freelist_size = numfree;
8934 PyUnicodeObject *u;
8935
8936 for (u = free_list; u != NULL;) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008937 PyUnicodeObject *v = u;
8938 u = *(PyUnicodeObject **)u;
8939 if (v->str)
8940 PyObject_DEL(v->str);
8941 Py_XDECREF(v->defenc);
8942 PyObject_Del(v);
8943 numfree--;
Christian Heimes3b718a72008-02-14 12:47:33 +00008944 }
8945 free_list = NULL;
8946 assert(numfree == 0);
8947 return freelist_size;
8948}
8949
Guido van Rossumd57fd912000-03-10 22:53:23 +00008950void
Thomas Wouters78890102000-07-22 19:25:51 +00008951_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008952{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008953 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008954
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02008955 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008956
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02008957 for (i = 0; i < 256; i++)
8958 Py_CLEAR(unicode_latin1[i]);
8959
Christian Heimes3b718a72008-02-14 12:47:33 +00008960 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00008961}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008962
Anthony Baxterac6bd462006-04-13 02:06:09 +00008963#ifdef __cplusplus
8964}
8965#endif