blob: 981a98b63f23c856f928f03bdbfd3123c913b479 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson1c5d21d2009-01-31 22:33:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Guido van Rossumd57fd912000-03-10 22:53:23 +000052/* Limit for the Unicode object free list */
53
Christian Heimes5b970ad2008-02-06 13:33:44 +000054#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000055
56/* Limit for the Unicode object free list stay alive optimization.
57
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000060 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000061
Christian Heimes5b970ad2008-02-06 13:33:44 +000062 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000063 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000064 malloc()-overhead) bytes of unused garbage.
65
66 Setting the limit to 0 effectively turns the feature off.
67
Guido van Rossumfd4b9572000-04-10 13:51:10 +000068 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71*/
72
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
75/* Endianness switches; defaults to little endian */
76
77#ifdef WORDS_BIGENDIAN
78# define BYTEORDER_IS_BIG_ENDIAN
79#else
80# define BYTEORDER_IS_LITTLE_ENDIAN
81#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
Serhiy Storchakac59c85c2013-01-26 12:13:40 +020085NOTE: In the interpreter's initialization phase, some globals are currently
86 initialized dynamically as needed. In the process Unicode objects may
87 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000088
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Anthony Baxterac6bd462006-04-13 02:06:09 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Guido van Rossumd57fd912000-03-10 22:53:23 +000096/* Free list for Unicode objects */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +020097static PyUnicodeObject *free_list = NULL;
98static int numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000099
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000100/* The empty Unicode object is shared to improve performance. */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200101static PyUnicodeObject *unicode_empty = NULL;
102
/* Return a new reference to the shared empty Unicode object from the
   enclosing function, creating the shared object with _PyUnicode_New(0)
   first if it does not exist yet.  If that creation fails, the enclosing
   function returns NULL. */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        if (unicode_empty == NULL)                      \
            unicode_empty = _PyUnicode_New(0);          \
        if (unicode_empty != NULL)                      \
            Py_INCREF(unicode_empty);                   \
        return (PyObject *)unicode_empty;               \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000114
115/* Single character Unicode strings in the Latin-1 range are being
116 shared as well. */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200117static PyUnicodeObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000118
Fred Drakee4315f52000-05-09 19:53:39 +0000119/* Default encoding to use and assume when NULL is passed as encoding
120 parameter; it is initialized by _PyUnicode_Init().
121
122 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000123 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000124
125*/
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200126static char unicode_default_encoding[100 + 1] = "ascii";
Fred Drakee4315f52000-05-09 19:53:39 +0000127
/* Fast detection of the most frequent whitespace characters.

   Lookup table indexed by code point for the range 0x00..0x7F; an entry
   is 1 when the character counts as whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
158
/* Same for linebreaks.

   Lookup table indexed by code point for the range 0x00..0x7F; an entry
   is 1 when the character is a line break and 0 otherwise.  The table is
   only ever read, so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
186
187
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000188Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000189PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000190{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000191#ifdef Py_UNICODE_WIDE
Benjamin Peterson857ce152009-01-31 16:29:18 +0000192 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000193#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000194 /* This is actually an illegal character, so it should
195 not be passed to unichr. */
196 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000197#endif
198}
199
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000200/* --- Bloom Filters ----------------------------------------------------- */
201
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask and take the low-order
   bits of each Unicode character (ch & (BLOOM_WIDTH - 1)) as the bit
   index. */
205
206/* the linebreak mask is set up by Unicode_Init below */
207
Antoine Pitrou10042922010-01-13 14:01:26 +0000208#if LONG_BIT >= 128
209#define BLOOM_WIDTH 128
210#elif LONG_BIT >= 64
211#define BLOOM_WIDTH 64
212#elif LONG_BIT >= 32
213#define BLOOM_WIDTH 32
214#else
215#error "LONG_BIT is smaller than 32"
216#endif
217
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000218#define BLOOM_MASK unsigned long
219
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200220static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000221
Antoine Pitrou10042922010-01-13 14:01:26 +0000222#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
223#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000224
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000225#define BLOOM_LINEBREAK(ch) \
226 ((ch) < 128U ? ascii_linebreak[(ch)] : \
227 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000228
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000229Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000230{
231 /* calculate simple bloom-style bitmask for a given unicode string */
232
Antoine Pitrou10042922010-01-13 14:01:26 +0000233 BLOOM_MASK mask;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000234 Py_ssize_t i;
235
236 mask = 0;
237 for (i = 0; i < len; i++)
Antoine Pitrou64672132010-01-13 07:55:48 +0000238 BLOOM_ADD(mask, ptr[i]);
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000239
240 return mask;
241}
242
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000243Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000244{
245 Py_ssize_t i;
246
247 for (i = 0; i < setlen; i++)
248 if (set[i] == chr)
249 return 1;
250
Fredrik Lundh77633512006-05-23 19:47:35 +0000251 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000252}
253
/* Non-zero when `chr` is a member of `set` (of length `setlen`): cheap
   bloom pre-check against `mask` first, exact search only on a hit.
   The whole expansion is parenthesized so the macro can be negated or
   combined with other operators without precedence surprises.  Note that
   `chr` is evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                            \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
256
Guido van Rossumd57fd912000-03-10 22:53:23 +0000257/* --- Unicode Object ----------------------------------------------------- */
258
259static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000260int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000261 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262{
263 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000264
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000265 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 if (unicode->length == length)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000267 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000268
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000269 /* Resizing shared object (unicode_empty or single character
270 objects) in-place is not allowed. Use PyUnicode_Resize()
271 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000272
Benjamin Peterson857ce152009-01-31 16:29:18 +0000273 if (unicode == unicode_empty ||
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000274 (unicode->length == 1 &&
275 unicode->str[0] < 256U &&
276 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000278 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279 return -1;
280 }
281
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000282 /* We allocate one more byte to make sure the string is Ux0000 terminated.
283 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000284 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000285 it contains). */
286
Guido van Rossumd57fd912000-03-10 22:53:23 +0000287 oldstr = unicode->str;
Neal Norwitz419fd492008-03-17 20:22:43 +0000288 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000289 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000291 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000292 PyErr_NoMemory();
293 return -1;
294 }
295 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000296 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000298 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000300 if (unicode->defenc) {
Georg Brandld070cc52010-08-01 21:06:46 +0000301 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000302 }
303 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000304
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305 return 0;
306}
307
/* We allocate one more Py_UNICODE element than requested to make sure
   the string is Ux0000 terminated; some code relies on that.

   XXX This allocator could further be enhanced by assuring that the
       free list never reduces its size below 1.

*/
315
316static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000317PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000318{
319 register PyUnicodeObject *unicode;
320
Andrew Dalkee0df7622006-05-27 11:04:36 +0000321 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000322 if (length == 0 && unicode_empty != NULL) {
323 Py_INCREF(unicode_empty);
324 return unicode_empty;
325 }
326
Neal Norwitze7d8be82008-07-31 17:17:14 +0000327 /* Ensure we won't overflow the size. */
328 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
329 return (PyUnicodeObject *)PyErr_NoMemory();
330 }
331
Guido van Rossumd57fd912000-03-10 22:53:23 +0000332 /* Unicode freelist & memory allocation */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000333 if (free_list) {
334 unicode = free_list;
335 free_list = *(PyUnicodeObject **)unicode;
336 numfree--;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000337 if (unicode->str) {
338 /* Keep-Alive optimization: we only upsize the buffer,
339 never downsize it. */
340 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000341 unicode_resize(unicode, length) < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000342 PyObject_DEL(unicode->str);
343 unicode->str = NULL;
344 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000345 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000346 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000347 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
348 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000349 }
350 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000351 }
352 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000353 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000354 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000355 if (unicode == NULL)
356 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000357 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
358 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000359 }
360
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000361 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000362 PyErr_NoMemory();
363 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000364 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000365 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000366 * the caller fails before initializing str -- unicode_resize()
367 * reads str[0], and the Keep-Alive optimization can keep memory
368 * allocated for str alive across a call to unicode_dealloc(unicode).
369 * We don't want unicode_resize to read uninitialized memory in
370 * that case.
371 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000372 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000373 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000374 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000375 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000376 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000377 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000378
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000379 onError:
Amaury Forgeot d'Arc06847b12008-07-31 23:39:05 +0000380 /* XXX UNREF/NEWREF interface should be more symmetrical */
381 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000382 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000383 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000384 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000385}
386
387static
Guido van Rossum9475a232001-10-05 20:51:39 +0000388void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000390 if (PyUnicode_CheckExact(unicode) &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000391 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000392 /* Keep-Alive optimization */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000393 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
394 PyObject_DEL(unicode->str);
395 unicode->str = NULL;
396 unicode->length = 0;
397 }
398 if (unicode->defenc) {
Georg Brandld070cc52010-08-01 21:06:46 +0000399 Py_CLEAR(unicode->defenc);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000400 }
401 /* Add to free list */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000402 *(PyUnicodeObject **)unicode = free_list;
403 free_list = unicode;
404 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000405 }
406 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000407 PyObject_DEL(unicode->str);
408 Py_XDECREF(unicode->defenc);
409 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000410 }
411}
412
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000413static
414int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000415{
416 register PyUnicodeObject *v;
417
418 /* Argument checks */
419 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000420 PyErr_BadInternalCall();
421 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000422 }
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000423 v = *unicode;
Christian Heimese93237d2007-12-19 02:37:44 +0000424 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000425 PyErr_BadInternalCall();
426 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000427 }
428
429 /* Resizing unicode_empty and single character objects is not
430 possible since these are being shared. We simply return a fresh
431 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000432 if (v->length != length &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000433 (v == unicode_empty || v->length == 1)) {
434 PyUnicodeObject *w = _PyUnicode_New(length);
435 if (w == NULL)
436 return -1;
437 Py_UNICODE_COPY(w->str, v->str,
438 length < v->length ? length : v->length);
439 Py_DECREF(*unicode);
440 *unicode = w;
441 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000442 }
443
444 /* Note that we don't have to modify *unicode for unshared Unicode
445 objects, since we can modify them in-place. */
446 return unicode_resize(v, length);
447}
448
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000449int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
450{
451 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
452}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000453
Guido van Rossumd57fd912000-03-10 22:53:23 +0000454PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000455 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000456{
457 PyUnicodeObject *unicode;
458
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000459 /* If the Unicode data is known at construction time, we can apply
460 some optimizations which share commonly used objects. */
461 if (u != NULL) {
462
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000463 /* Optimization for empty strings */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200464 if (size == 0)
465 _Py_RETURN_UNICODE_EMPTY();
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000466
467 /* Single character Unicode objects in the Latin-1 range are
468 shared when using this constructor */
469 if (size == 1 && *u < 256) {
470 unicode = unicode_latin1[*u];
471 if (!unicode) {
472 unicode = _PyUnicode_New(1);
473 if (!unicode)
474 return NULL;
475 unicode->str[0] = *u;
476 unicode_latin1[*u] = unicode;
477 }
478 Py_INCREF(unicode);
479 return (PyObject *)unicode;
480 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000481 }
Tim Petersced69f82003-09-16 20:30:58 +0000482
Guido van Rossumd57fd912000-03-10 22:53:23 +0000483 unicode = _PyUnicode_New(size);
484 if (!unicode)
485 return NULL;
486
487 /* Copy the Unicode data into the new object */
488 if (u != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000489 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000490
491 return (PyObject *)unicode;
492}
493
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000494PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
495{
496 PyUnicodeObject *unicode;
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000497
Benjamin Peterson857ce152009-01-31 16:29:18 +0000498 if (size < 0) {
499 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000500 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson857ce152009-01-31 16:29:18 +0000501 return NULL;
502 }
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000503
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000504 /* If the Unicode data is known at construction time, we can apply
505 some optimizations which share commonly used objects.
506 Also, this means the input must be UTF-8, so fall back to the
507 UTF-8 decoder at the end. */
508 if (u != NULL) {
509
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000510 /* Optimization for empty strings */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200511 if (size == 0)
512 _Py_RETURN_UNICODE_EMPTY();
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000513
514 /* Single characters are shared when using this constructor.
515 Restrict to ASCII, since the input must be UTF-8. */
516 if (size == 1 && Py_CHARMASK(*u) < 128) {
517 unicode = unicode_latin1[Py_CHARMASK(*u)];
518 if (!unicode) {
519 unicode = _PyUnicode_New(1);
520 if (!unicode)
521 return NULL;
522 unicode->str[0] = Py_CHARMASK(*u);
523 unicode_latin1[Py_CHARMASK(*u)] = unicode;
524 }
525 Py_INCREF(unicode);
526 return (PyObject *)unicode;
527 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000528
529 return PyUnicode_DecodeUTF8(u, size, NULL);
530 }
531
532 unicode = _PyUnicode_New(size);
533 if (!unicode)
534 return NULL;
535
536 return (PyObject *)unicode;
537}
538
539PyObject *PyUnicode_FromString(const char *u)
540{
541 size_t size = strlen(u);
542 if (size > PY_SSIZE_T_MAX) {
543 PyErr_SetString(PyExc_OverflowError, "input too long");
544 return NULL;
545 }
546
547 return PyUnicode_FromStringAndSize(u, size);
548}
549
Guido van Rossumd57fd912000-03-10 22:53:23 +0000550#ifdef HAVE_WCHAR_H
551
Mark Dickinson6b265f12009-03-18 16:07:26 +0000552#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
553# define CONVERT_WCHAR_TO_SURROGATES
554#endif
555
556#ifdef CONVERT_WCHAR_TO_SURROGATES
557
558/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
559 to convert from UTF32 to UTF16. */
560
561PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
562 Py_ssize_t size)
563{
564 PyUnicodeObject *unicode;
565 register Py_ssize_t i;
566 Py_ssize_t alloc;
567 const wchar_t *orig_w;
568
569 if (w == NULL) {
570 PyErr_BadInternalCall();
571 return NULL;
572 }
573
574 alloc = size;
575 orig_w = w;
576 for (i = size; i > 0; i--) {
577 if (*w > 0xFFFF)
578 alloc++;
579 w++;
580 }
581 w = orig_w;
582 unicode = _PyUnicode_New(alloc);
583 if (!unicode)
584 return NULL;
585
586 /* Copy the wchar_t data into the new object */
587 {
588 register Py_UNICODE *u;
589 u = PyUnicode_AS_UNICODE(unicode);
590 for (i = size; i > 0; i--) {
591 if (*w > 0xFFFF) {
592 wchar_t ordinal = *w++;
593 ordinal -= 0x10000;
594 *u++ = 0xD800 | (ordinal >> 10);
595 *u++ = 0xDC00 | (ordinal & 0x3FF);
596 }
597 else
598 *u++ = *w++;
599 }
600 }
601 return (PyObject *)unicode;
602}
603
604#else
605
Guido van Rossumd57fd912000-03-10 22:53:23 +0000606PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000607 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000608{
609 PyUnicodeObject *unicode;
610
611 if (w == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000612 PyErr_BadInternalCall();
613 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000614 }
615
616 unicode = _PyUnicode_New(size);
617 if (!unicode)
618 return NULL;
619
620 /* Copy the wchar_t data into the new object */
621#ifdef HAVE_USABLE_WCHAR_T
622 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000623#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000624 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000625 register Py_UNICODE *u;
626 register Py_ssize_t i;
627 u = PyUnicode_AS_UNICODE(unicode);
628 for (i = size; i > 0; i--)
629 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000630 }
631#endif
632
633 return (PyObject *)unicode;
634}
635
Mark Dickinson6b265f12009-03-18 16:07:26 +0000636#endif /* CONVERT_WCHAR_TO_SURROGATES */
637
638#undef CONVERT_WCHAR_TO_SURROGATES
639
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000640static void
641makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
642{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000643 *fmt++ = '%';
644 if (width) {
645 if (zeropad)
646 *fmt++ = '0';
647 fmt += sprintf(fmt, "%d", width);
648 }
649 if (precision)
650 fmt += sprintf(fmt, ".%d", precision);
651 if (longflag)
652 *fmt++ = 'l';
653 else if (size_tflag) {
654 char *f = PY_FORMAT_SIZE_T;
655 while (*f)
656 *fmt++ = *f++;
657 }
658 *fmt++ = c;
659 *fmt = '\0';
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000660}
661
/* Append the NUL-terminated char string `string` to the output at `s`,
   advancing `s` past the copied characters (no terminator is written).
   Relies on the variables `s` (output cursor) and `copy` (scratch
   const char pointer) being in scope at the point of use.  The argument
   is parenthesized so that any expression can be passed. */
#define appendstring(string) {for (copy = (string); *copy; ) *s++ = *copy++;}
663
664PyObject *
665PyUnicode_FromFormatV(const char *format, va_list vargs)
666{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000667 va_list count;
668 Py_ssize_t callcount = 0;
669 PyObject **callresults = NULL;
670 PyObject **callresult = NULL;
671 Py_ssize_t n = 0;
672 int width = 0;
673 int precision = 0;
674 int zeropad;
675 const char* f;
676 Py_UNICODE *s;
677 PyObject *string;
678 /* used by sprintf */
679 char buffer[21];
680 /* use abuffer instead of buffer, if we need more space
681 * (which can happen if there's a format specifier with width). */
682 char *abuffer = NULL;
683 char *realbuffer;
684 Py_ssize_t abuffersize = 0;
685 char fmt[60]; /* should be enough for %0width.precisionld */
686 const char *copy;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000687
688#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson857ce152009-01-31 16:29:18 +0000689 Py_MEMCPY(count, vargs, sizeof(va_list));
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000690#else
691#ifdef __va_copy
Benjamin Peterson857ce152009-01-31 16:29:18 +0000692 __va_copy(count, vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000693#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000694 count = vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000695#endif
696#endif
Walter Dörwalded960ac2009-05-03 22:36:33 +0000697 /* step 1: count the number of %S/%R/%s format specifications
698 * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
699 * objects once during step 3 and put the result in an array) */
Benjamin Peterson857ce152009-01-31 16:29:18 +0000700 for (f = format; *f; f++) {
Walter Dörwalded960ac2009-05-03 22:36:33 +0000701 if (*f == '%') {
702 if (*(f+1)=='%')
703 continue;
Walter Dörwald342c8db2009-05-03 22:46:07 +0000704 if (*(f+1)=='S' || *(f+1)=='R')
Walter Dörwalded960ac2009-05-03 22:36:33 +0000705 ++callcount;
706 while (isdigit((unsigned)*f))
707 width = (width*10) + *f++ - '0';
708 while (*++f && *f != '%' && !isalpha((unsigned)*f))
709 ;
710 if (*f == 's')
711 ++callcount;
712 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000713 }
714 /* step 2: allocate memory for the results of
Walter Dörwalded960ac2009-05-03 22:36:33 +0000715 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson857ce152009-01-31 16:29:18 +0000716 if (callcount) {
717 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
718 if (!callresults) {
719 PyErr_NoMemory();
720 return NULL;
721 }
722 callresult = callresults;
723 }
724 /* step 3: figure out how large a buffer we need */
725 for (f = format; *f; f++) {
726 if (*f == '%') {
727 const char* p = f;
728 width = 0;
729 while (isdigit((unsigned)*f))
730 width = (width*10) + *f++ - '0';
731 while (*++f && *f != '%' && !isalpha((unsigned)*f))
732 ;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000733
Benjamin Peterson857ce152009-01-31 16:29:18 +0000734 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
735 * they don't affect the amount of space we reserve.
736 */
737 if ((*f == 'l' || *f == 'z') &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000738 (f[1] == 'd' || f[1] == 'u'))
739 ++f;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000740
Benjamin Peterson857ce152009-01-31 16:29:18 +0000741 switch (*f) {
742 case 'c':
743 (void)va_arg(count, int);
744 /* fall through... */
745 case '%':
746 n++;
747 break;
748 case 'd': case 'u': case 'i': case 'x':
749 (void) va_arg(count, int);
750 /* 20 bytes is enough to hold a 64-bit
751 integer. Decimal takes the most space.
752 This isn't enough for octal.
753 If a width is specified we need more
754 (which we allocate later). */
755 if (width < 20)
756 width = 20;
757 n += width;
758 if (abuffersize < width)
759 abuffersize = width;
760 break;
761 case 's':
762 {
763 /* UTF-8 */
Georg Brandlba68a992009-05-05 09:19:43 +0000764 const char *s = va_arg(count, const char*);
Walter Dörwalded960ac2009-05-03 22:36:33 +0000765 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
766 if (!str)
767 goto fail;
768 n += PyUnicode_GET_SIZE(str);
769 /* Remember the str and switch to the next slot */
770 *callresult++ = str;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000771 break;
772 }
773 case 'U':
774 {
775 PyObject *obj = va_arg(count, PyObject *);
776 assert(obj && PyUnicode_Check(obj));
777 n += PyUnicode_GET_SIZE(obj);
778 break;
779 }
780 case 'V':
781 {
782 PyObject *obj = va_arg(count, PyObject *);
783 const char *str = va_arg(count, const char *);
784 assert(obj || str);
785 assert(!obj || PyUnicode_Check(obj));
786 if (obj)
787 n += PyUnicode_GET_SIZE(obj);
788 else
789 n += strlen(str);
790 break;
791 }
792 case 'S':
793 {
794 PyObject *obj = va_arg(count, PyObject *);
795 PyObject *str;
796 assert(obj);
797 str = PyObject_Str(obj);
798 if (!str)
799 goto fail;
800 n += PyUnicode_GET_SIZE(str);
801 /* Remember the str and switch to the next slot */
802 *callresult++ = str;
803 break;
804 }
805 case 'R':
806 {
807 PyObject *obj = va_arg(count, PyObject *);
808 PyObject *repr;
809 assert(obj);
810 repr = PyObject_Repr(obj);
811 if (!repr)
812 goto fail;
813 n += PyUnicode_GET_SIZE(repr);
814 /* Remember the repr and switch to the next slot */
815 *callresult++ = repr;
816 break;
817 }
818 case 'p':
819 (void) va_arg(count, int);
820 /* maximum 64-bit pointer representation:
821 * 0xffffffffffffffff
822 * so 19 characters is enough.
823 * XXX I count 18 -- what's the extra for?
824 */
825 n += 19;
826 break;
827 default:
828 /* if we stumble upon an unknown
829 formatting code, copy the rest of
830 the format string to the output
831 string. (we cannot just skip the
832 code, since there's no way to know
833 what's in the argument list) */
834 n += strlen(p);
835 goto expand;
836 }
837 } else
838 n++;
839 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000840 expand:
Benjamin Peterson857ce152009-01-31 16:29:18 +0000841 if (abuffersize > 20) {
842 abuffer = PyObject_Malloc(abuffersize);
843 if (!abuffer) {
844 PyErr_NoMemory();
845 goto fail;
846 }
847 realbuffer = abuffer;
848 }
849 else
850 realbuffer = buffer;
851 /* step 4: fill the buffer */
852 /* Since we've analyzed how much space we need for the worst case,
853 we don't have to resize the string.
854 There can be no errors beyond this point. */
855 string = PyUnicode_FromUnicode(NULL, n);
856 if (!string)
857 goto fail;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000858
Benjamin Peterson857ce152009-01-31 16:29:18 +0000859 s = PyUnicode_AS_UNICODE(string);
860 callresult = callresults;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000861
Benjamin Peterson857ce152009-01-31 16:29:18 +0000862 for (f = format; *f; f++) {
863 if (*f == '%') {
864 const char* p = f++;
865 int longflag = 0;
866 int size_tflag = 0;
867 zeropad = (*f == '0');
868 /* parse the width.precision part */
869 width = 0;
870 while (isdigit((unsigned)*f))
871 width = (width*10) + *f++ - '0';
872 precision = 0;
873 if (*f == '.') {
874 f++;
875 while (isdigit((unsigned)*f))
876 precision = (precision*10) + *f++ - '0';
877 }
878 /* handle the long flag, but only for %ld and %lu.
879 others can be added when necessary. */
880 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
881 longflag = 1;
882 ++f;
883 }
884 /* handle the size_t flag. */
885 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
886 size_tflag = 1;
887 ++f;
888 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000889
Benjamin Peterson857ce152009-01-31 16:29:18 +0000890 switch (*f) {
891 case 'c':
892 *s++ = va_arg(vargs, int);
893 break;
894 case 'd':
895 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
896 if (longflag)
897 sprintf(realbuffer, fmt, va_arg(vargs, long));
898 else if (size_tflag)
899 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
900 else
901 sprintf(realbuffer, fmt, va_arg(vargs, int));
902 appendstring(realbuffer);
903 break;
904 case 'u':
905 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
906 if (longflag)
907 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
908 else if (size_tflag)
909 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
910 else
911 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
912 appendstring(realbuffer);
913 break;
914 case 'i':
915 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
916 sprintf(realbuffer, fmt, va_arg(vargs, int));
917 appendstring(realbuffer);
918 break;
919 case 'x':
920 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
921 sprintf(realbuffer, fmt, va_arg(vargs, int));
922 appendstring(realbuffer);
923 break;
924 case 's':
925 {
Walter Dörwalded960ac2009-05-03 22:36:33 +0000926 /* unused, since we already have the result */
927 (void) va_arg(vargs, char *);
928 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
929 PyUnicode_GET_SIZE(*callresult));
930 s += PyUnicode_GET_SIZE(*callresult);
931 /* We're done with the unicode()/repr() => forget it */
932 Py_DECREF(*callresult);
933 /* switch to next unicode()/repr() result */
934 ++callresult;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000935 break;
936 }
937 case 'U':
938 {
939 PyObject *obj = va_arg(vargs, PyObject *);
940 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
941 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
942 s += size;
943 break;
944 }
945 case 'V':
946 {
947 PyObject *obj = va_arg(vargs, PyObject *);
948 const char *str = va_arg(vargs, const char *);
949 if (obj) {
950 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
951 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
952 s += size;
953 } else {
954 appendstring(str);
955 }
956 break;
957 }
958 case 'S':
959 case 'R':
960 {
961 Py_UNICODE *ucopy;
962 Py_ssize_t usize;
963 Py_ssize_t upos;
964 /* unused, since we already have the result */
965 (void) va_arg(vargs, PyObject *);
966 ucopy = PyUnicode_AS_UNICODE(*callresult);
967 usize = PyUnicode_GET_SIZE(*callresult);
968 for (upos = 0; upos<usize;)
969 *s++ = ucopy[upos++];
970 /* We're done with the unicode()/repr() => forget it */
971 Py_DECREF(*callresult);
972 /* switch to next unicode()/repr() result */
973 ++callresult;
974 break;
975 }
976 case 'p':
977 sprintf(buffer, "%p", va_arg(vargs, void*));
978 /* %p is ill-defined: ensure leading 0x. */
979 if (buffer[1] == 'X')
980 buffer[1] = 'x';
981 else if (buffer[1] != 'x') {
982 memmove(buffer+2, buffer, strlen(buffer)+1);
983 buffer[0] = '0';
984 buffer[1] = 'x';
985 }
986 appendstring(buffer);
987 break;
988 case '%':
989 *s++ = '%';
990 break;
991 default:
992 appendstring(p);
993 goto end;
994 }
995 } else
996 *s++ = *f;
997 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000998
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000999 end:
Benjamin Peterson857ce152009-01-31 16:29:18 +00001000 if (callresults)
1001 PyObject_Free(callresults);
1002 if (abuffer)
1003 PyObject_Free(abuffer);
1004 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1005 return string;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001006 fail:
Benjamin Peterson857ce152009-01-31 16:29:18 +00001007 if (callresults) {
1008 PyObject **callresult2 = callresults;
1009 while (callresult2 < callresult) {
1010 Py_DECREF(*callresult2);
1011 ++callresult2;
1012 }
1013 PyObject_Free(callresults);
1014 }
1015 if (abuffer)
1016 PyObject_Free(abuffer);
1017 return NULL;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001018}
1019
1020#undef appendstring
1021
1022PyObject *
1023PyUnicode_FromFormat(const char *format, ...)
1024{
Benjamin Peterson857ce152009-01-31 16:29:18 +00001025 PyObject* ret;
1026 va_list vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001027
1028#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson857ce152009-01-31 16:29:18 +00001029 va_start(vargs, format);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001030#else
Benjamin Peterson857ce152009-01-31 16:29:18 +00001031 va_start(vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001032#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00001033 ret = PyUnicode_FromFormatV(format, vargs);
1034 va_end(vargs);
1035 return ret;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001036}
1037
Martin v. Löwis18e16552006-02-15 17:27:45 +00001038Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001039 wchar_t *w,
1040 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001041{
1042 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001043 PyErr_BadInternalCall();
1044 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001045 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001046
1047 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001048 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001049 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001050
Guido van Rossumd57fd912000-03-10 22:53:23 +00001051#ifdef HAVE_USABLE_WCHAR_T
1052 memcpy(w, unicode->str, size * sizeof(wchar_t));
1053#else
1054 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001055 register Py_UNICODE *u;
1056 register Py_ssize_t i;
1057 u = PyUnicode_AS_UNICODE(unicode);
1058 for (i = size; i > 0; i--)
1059 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001060 }
1061#endif
1062
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001063 if (size > PyUnicode_GET_SIZE(unicode))
1064 return PyUnicode_GET_SIZE(unicode);
1065 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001066 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001067}
1068
1069#endif
1070
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001071PyObject *PyUnicode_FromOrdinal(int ordinal)
1072{
Hye-Shik Chang40574832004-04-06 07:24:51 +00001073 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001074
1075#ifdef Py_UNICODE_WIDE
1076 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001077 PyErr_SetString(PyExc_ValueError,
1078 "unichr() arg not in range(0x110000) "
1079 "(wide Python build)");
1080 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001081 }
1082#else
1083 if (ordinal < 0 || ordinal > 0xffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001084 PyErr_SetString(PyExc_ValueError,
1085 "unichr() arg not in range(0x10000) "
1086 "(narrow Python build)");
1087 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001088 }
1089#endif
1090
Hye-Shik Chang40574832004-04-06 07:24:51 +00001091 s[0] = (Py_UNICODE)ordinal;
1092 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001093}
1094
Guido van Rossumd57fd912000-03-10 22:53:23 +00001095PyObject *PyUnicode_FromObject(register PyObject *obj)
1096{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001097 /* XXX Perhaps we should make this API an alias of
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001098 PyObject_Unicode() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001099 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001100 Py_INCREF(obj);
1101 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001102 }
1103 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001104 /* For a Unicode subtype that's not a Unicode object,
1105 return a true Unicode object with the same data. */
1106 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1107 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001108 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001109 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1110}
1111
1112PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001113 const char *encoding,
1114 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001115{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001116 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001117 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001118 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001119
Guido van Rossumd57fd912000-03-10 22:53:23 +00001120 if (obj == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001121 PyErr_BadInternalCall();
1122 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001123 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001124
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001125#if 0
1126 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001127 that no encodings is given and then redirect to
1128 PyObject_Unicode() which then applies the additional logic for
1129 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001130
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001131 NOTE: This API should really only be used for object which
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001132 represent *encoded* Unicode !
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001133
1134 */
Benjamin Peterson857ce152009-01-31 16:29:18 +00001135 if (PyUnicode_Check(obj)) {
1136 if (encoding) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001137 PyErr_SetString(PyExc_TypeError,
1138 "decoding Unicode is not supported");
1139 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001140 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001141 return PyObject_Unicode(obj);
1142 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001143#else
1144 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001145 PyErr_SetString(PyExc_TypeError,
1146 "decoding Unicode is not supported");
1147 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001148 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001149#endif
1150
1151 /* Coerce object */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001152 if (PyString_Check(obj)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001153 s = PyString_AS_STRING(obj);
1154 len = PyString_GET_SIZE(obj);
Christian Heimes1a6387e2008-03-26 12:49:49 +00001155 }
Christian Heimes3497f942008-05-26 12:29:14 +00001156 else if (PyByteArray_Check(obj)) {
Christian Heimes1a6387e2008-03-26 12:49:49 +00001157 /* Python 2.x specific */
1158 PyErr_Format(PyExc_TypeError,
1159 "decoding bytearray is not supported");
1160 return NULL;
1161 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001162 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001163 /* Overwrite the error message with something more useful in
1164 case of a TypeError. */
1165 if (PyErr_ExceptionMatches(PyExc_TypeError))
1166 PyErr_Format(PyExc_TypeError,
1167 "coercing to Unicode: need string or buffer, "
1168 "%.80s found",
1169 Py_TYPE(obj)->tp_name);
1170 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001171 }
Tim Petersced69f82003-09-16 20:30:58 +00001172
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001173 /* Convert to Unicode */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02001174 if (len == 0)
1175 _Py_RETURN_UNICODE_EMPTY();
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001176
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02001177 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001178 return v;
1179
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001180 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001181 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001182}
1183
1184PyObject *PyUnicode_Decode(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001185 Py_ssize_t size,
1186 const char *encoding,
1187 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001188{
1189 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001190
1191 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001192 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001193
1194 /* Shortcuts for common default encodings */
1195 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001196 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001197 else if (strcmp(encoding, "latin-1") == 0)
1198 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001199#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1200 else if (strcmp(encoding, "mbcs") == 0)
1201 return PyUnicode_DecodeMBCS(s, size, errors);
1202#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001203 else if (strcmp(encoding, "ascii") == 0)
1204 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001205
1206 /* Decode via the codec registry */
1207 buffer = PyBuffer_FromMemory((void *)s, size);
1208 if (buffer == NULL)
1209 goto onError;
1210 unicode = PyCodec_Decode(buffer, encoding, errors);
1211 if (unicode == NULL)
1212 goto onError;
1213 if (!PyUnicode_Check(unicode)) {
1214 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001215 "decoder did not return an unicode object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001216 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001217 Py_DECREF(unicode);
1218 goto onError;
1219 }
1220 Py_DECREF(buffer);
1221 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001222
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001223 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001224 Py_XDECREF(buffer);
1225 return NULL;
1226}
1227
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001228PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1229 const char *encoding,
1230 const char *errors)
1231{
1232 PyObject *v;
1233
1234 if (!PyUnicode_Check(unicode)) {
1235 PyErr_BadArgument();
1236 goto onError;
1237 }
1238
1239 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001240 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001241
1242 /* Decode via the codec registry */
1243 v = PyCodec_Decode(unicode, encoding, errors);
1244 if (v == NULL)
1245 goto onError;
1246 return v;
1247
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001248 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001249 return NULL;
1250}
1251
Guido van Rossumd57fd912000-03-10 22:53:23 +00001252PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001253 Py_ssize_t size,
1254 const char *encoding,
1255 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001256{
1257 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001258
Guido van Rossumd57fd912000-03-10 22:53:23 +00001259 unicode = PyUnicode_FromUnicode(s, size);
1260 if (unicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001261 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001262 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1263 Py_DECREF(unicode);
1264 return v;
1265}
1266
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001267PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1268 const char *encoding,
1269 const char *errors)
1270{
1271 PyObject *v;
1272
1273 if (!PyUnicode_Check(unicode)) {
1274 PyErr_BadArgument();
1275 goto onError;
1276 }
1277
1278 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001279 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001280
1281 /* Encode via the codec registry */
1282 v = PyCodec_Encode(unicode, encoding, errors);
1283 if (v == NULL)
1284 goto onError;
1285 return v;
1286
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001287 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001288 return NULL;
1289}
1290
Guido van Rossumd57fd912000-03-10 22:53:23 +00001291PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1292 const char *encoding,
1293 const char *errors)
1294{
1295 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001296
Guido van Rossumd57fd912000-03-10 22:53:23 +00001297 if (!PyUnicode_Check(unicode)) {
1298 PyErr_BadArgument();
1299 goto onError;
1300 }
Fred Drakee4315f52000-05-09 19:53:39 +00001301
Tim Petersced69f82003-09-16 20:30:58 +00001302 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001303 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001304
1305 /* Shortcuts for common default encodings */
1306 if (errors == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001307 if (strcmp(encoding, "utf-8") == 0)
1308 return PyUnicode_AsUTF8String(unicode);
1309 else if (strcmp(encoding, "latin-1") == 0)
1310 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001311#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001312 else if (strcmp(encoding, "mbcs") == 0)
1313 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001314#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001315 else if (strcmp(encoding, "ascii") == 0)
1316 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001317 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001318
1319 /* Encode via the codec registry */
1320 v = PyCodec_Encode(unicode, encoding, errors);
1321 if (v == NULL)
1322 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001323 if (!PyString_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001324 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001325 "encoder did not return a string object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001326 Py_TYPE(v)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001327 Py_DECREF(v);
1328 goto onError;
1329 }
1330 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001331
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001332 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001333 return NULL;
1334}
1335
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001336PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001337 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001338{
1339 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1340
1341 if (v)
1342 return v;
1343 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1344 if (v && errors == NULL)
1345 ((PyUnicodeObject *)unicode)->defenc = v;
1346 return v;
1347}
1348
Guido van Rossumd57fd912000-03-10 22:53:23 +00001349Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1350{
1351 if (!PyUnicode_Check(unicode)) {
1352 PyErr_BadArgument();
1353 goto onError;
1354 }
1355 return PyUnicode_AS_UNICODE(unicode);
1356
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001357 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001358 return NULL;
1359}
1360
Martin v. Löwis18e16552006-02-15 17:27:45 +00001361Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001362{
1363 if (!PyUnicode_Check(unicode)) {
1364 PyErr_BadArgument();
1365 goto onError;
1366 }
1367 return PyUnicode_GET_SIZE(unicode);
1368
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001369 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001370 return -1;
1371}
1372
Thomas Wouters78890102000-07-22 19:25:51 +00001373const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001374{
1375 return unicode_default_encoding;
1376}
1377
1378int PyUnicode_SetDefaultEncoding(const char *encoding)
1379{
1380 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001381
Fred Drakee4315f52000-05-09 19:53:39 +00001382 /* Make sure the encoding is valid. As side effect, this also
1383 loads the encoding into the codec registry cache. */
1384 v = _PyCodec_Lookup(encoding);
1385 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001386 goto onError;
Fred Drakee4315f52000-05-09 19:53:39 +00001387 Py_DECREF(v);
1388 strncpy(unicode_default_encoding,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001389 encoding,
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02001390 sizeof(unicode_default_encoding) - 1);
Fred Drakee4315f52000-05-09 19:53:39 +00001391 return 0;
1392
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001393 onError:
Fred Drakee4315f52000-05-09 19:53:39 +00001394 return -1;
1395}
1396
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001397/* error handling callback helper:
1398 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001399 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001400 and adjust various state variables.
1401 return 0 on success, -1 on error
1402*/
1403
1404static
1405int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001406 const char *encoding, const char *reason,
1407 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1408 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1409 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001410{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001411 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001412
1413 PyObject *restuple = NULL;
1414 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001415 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1416 Py_ssize_t requiredsize;
1417 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001418 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001419 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001420 int res = -1;
1421
1422 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001423 *errorHandler = PyCodec_LookupError(errors);
1424 if (*errorHandler == NULL)
1425 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001426 }
1427
1428 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001429 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001430 encoding, input, insize, *startinpos, *endinpos, reason);
1431 if (*exceptionObject == NULL)
1432 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001433 }
1434 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001435 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1436 goto onError;
1437 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1438 goto onError;
1439 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1440 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001441 }
1442
1443 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1444 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001445 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001446 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00001447 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001448 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001449 }
1450 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001451 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001452 if (newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001453 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001454 if (newpos<0 || newpos>insize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001455 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1456 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001457 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001458
1459 /* need more space? (at least enough for what we
1460 have+the replacement+the rest of the string (starting
1461 at the new input position), so we won't have to check space
1462 when there are no errors in the rest of the string) */
1463 repptr = PyUnicode_AS_UNICODE(repunicode);
1464 repsize = PyUnicode_GET_SIZE(repunicode);
1465 requiredsize = *outpos + repsize + insize-newpos;
1466 if (requiredsize > outsize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001467 if (requiredsize<2*outsize)
1468 requiredsize = 2*outsize;
1469 if (_PyUnicode_Resize(output, requiredsize) < 0)
1470 goto onError;
1471 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001472 }
1473 *endinpos = newpos;
1474 *inptr = input + newpos;
1475 Py_UNICODE_COPY(*outptr, repptr, repsize);
1476 *outptr += repsize;
1477 *outpos += repsize;
1478 /* we made it! */
1479 res = 0;
1480
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001481 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001482 Py_XDECREF(restuple);
1483 return res;
1484}
1485
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001486/* --- UTF-7 Codec -------------------------------------------------------- */
1487
Antoine Pitrou653dece2009-05-04 18:32:32 +00001488/* See RFC2152 for details. We encode conservatively and decode liberally. */
1489
/* Helper macros for the base-64 sections of UTF-7. */

/* True if c is a character of the base-64 alphabet. */

#define IS_BASE64(c) \
    (isalnum(c) || (c) == '+' || (c) == '/')

/* Map a base-64 character c to its 6-bit value.  The result is only
   meaningful if c really is a base-64 character; anything that is not a
   letter, a digit or '+' maps to 63. */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* Map the low 6 bits of n to the corresponding base-64 character. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: true if this byte, met outside a base-64 section of a
 * UTF-7 string, decodes to itself.  Decoding is permissive: every ASCII
 * byte qualifies except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
1517
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is range-checked (0 < c < 128) before it is used as a table index.
 * directO and directWS are parenthesized so that callers may pass
 * arbitrary expressions (e.g. `a || b`) without precedence surprises.
 * Note that c is evaluated more than once. */

#define ENCODE_DIRECT(c, directO, directWS) \
    ((c) < 128 && (c) > 0 && \
     ((utf7_category[(c)] == 0) || \
      ((directWS) && (utf7_category[(c)] == 2)) || \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001563
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001564PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001565 Py_ssize_t size,
1566 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001567{
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001568 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1569}
1570
Antoine Pitrou653dece2009-05-04 18:32:32 +00001571/* The decoder. The only state we preserve is our read position,
1572 * i.e. how many characters we have consumed. So if we end in the
1573 * middle of a shift sequence we have to back off the read position
1574 * and the output to the beginning of the sequence, otherwise we lose
1575 * all the shift state (seen bits, number of bits seen, high
1576 * surrogate). */
1577
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001578PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001579 Py_ssize_t size,
1580 const char *errors,
1581 Py_ssize_t *consumed)
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001582{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001583 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001584 Py_ssize_t startinpos;
1585 Py_ssize_t endinpos;
1586 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001587 const char *e;
1588 PyUnicodeObject *unicode;
1589 Py_UNICODE *p;
1590 const char *errmsg = "";
1591 int inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001592 Py_UNICODE *shiftOutStart;
1593 unsigned int base64bits = 0;
1594 unsigned long base64buffer = 0;
1595 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001596 PyObject *errorHandler = NULL;
1597 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001598
1599 unicode = _PyUnicode_New(size);
1600 if (!unicode)
1601 return NULL;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001602 if (size == 0) {
1603 if (consumed)
1604 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001605 return (PyObject *)unicode;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001606 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001607
1608 p = unicode->str;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001609 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001610 e = s + size;
1611
1612 while (s < e) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001613 Py_UNICODE ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001614
Antoine Pitrou653dece2009-05-04 18:32:32 +00001615 if (inShift) { /* in a base-64 section */
1616 if (IS_BASE64(ch)) { /* consume a base-64 character */
1617 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1618 base64bits += 6;
1619 s++;
1620 if (base64bits >= 16) {
1621 /* we have enough bits for a UTF-16 value */
1622 Py_UNICODE outCh = (Py_UNICODE)
1623 (base64buffer >> (base64bits-16));
1624 base64bits -= 16;
1625 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1626 if (surrogate) {
1627 /* expecting a second surrogate */
1628 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1629#ifdef Py_UNICODE_WIDE
1630 *p++ = (((surrogate & 0x3FF)<<10)
1631 | (outCh & 0x3FF)) + 0x10000;
1632#else
1633 *p++ = surrogate;
1634 *p++ = outCh;
1635#endif
1636 surrogate = 0;
Antoine Pitrou30402542011-11-15 01:49:40 +01001637 continue;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001638 }
1639 else {
Antoine Pitrou30402542011-11-15 01:49:40 +01001640 *p++ = surrogate;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001641 surrogate = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001642 }
1643 }
Antoine Pitrou30402542011-11-15 01:49:40 +01001644 if (outCh >= 0xD800 && outCh <= 0xDBFF) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001645 /* first surrogate */
1646 surrogate = outCh;
1647 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001648 else {
1649 *p++ = outCh;
1650 }
1651 }
1652 }
1653 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001654 inShift = 0;
1655 s++;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001656 if (surrogate) {
Antoine Pitrou30402542011-11-15 01:49:40 +01001657 *p++ = surrogate;
1658 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001659 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001660 if (base64bits > 0) { /* left-over bits */
1661 if (base64bits >= 6) {
1662 /* We've seen at least one base-64 character */
1663 errmsg = "partial character in shift sequence";
1664 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001665 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001666 else {
1667 /* Some bits remain; they should be zero */
1668 if (base64buffer != 0) {
1669 errmsg = "non-zero padding bits in shift sequence";
1670 goto utf7Error;
1671 }
1672 }
1673 }
1674 if (ch != '-') {
1675 /* '-' is absorbed; other terminating
1676 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001677 *p++ = ch;
1678 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001679 }
1680 }
1681 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001682 startinpos = s-starts;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001683 s++; /* consume '+' */
1684 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001685 s++;
1686 *p++ = '+';
Antoine Pitrou653dece2009-05-04 18:32:32 +00001687 }
1688 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001689 inShift = 1;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001690 shiftOutStart = p;
1691 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001692 }
1693 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001694 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001695 *p++ = ch;
1696 s++;
1697 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001698 else {
1699 startinpos = s-starts;
1700 s++;
1701 errmsg = "unexpected special character";
1702 goto utf7Error;
1703 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001704 continue;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001705utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001706 outpos = p-PyUnicode_AS_UNICODE(unicode);
1707 endinpos = s-starts;
1708 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001709 errors, &errorHandler,
1710 "utf7", errmsg,
1711 starts, size, &startinpos, &endinpos, &exc, &s,
1712 &unicode, &outpos, &p))
1713 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001714 }
1715
Antoine Pitrou653dece2009-05-04 18:32:32 +00001716 /* end of string */
1717
1718 if (inShift && !consumed) { /* in shift sequence, no more to follow */
1719 /* if we're in an inconsistent state, that's an error */
1720 if (surrogate ||
1721 (base64bits >= 6) ||
1722 (base64bits > 0 && base64buffer != 0)) {
1723 outpos = p-PyUnicode_AS_UNICODE(unicode);
1724 endinpos = size;
1725 if (unicode_decode_call_errorhandler(
1726 errors, &errorHandler,
1727 "utf7", "unterminated shift sequence",
1728 starts, size, &startinpos, &endinpos, &exc, &s,
1729 &unicode, &outpos, &p))
1730 goto onError;
1731 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001732 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001733
1734 /* return state */
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001735 if (consumed) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001736 if (inShift) {
1737 p = shiftOutStart; /* back off output */
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001738 *consumed = startinpos;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001739 }
1740 else {
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001741 *consumed = s-starts;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001742 }
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001743 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001744
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001745 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001746 goto onError;
1747
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001748 Py_XDECREF(errorHandler);
1749 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001750 return (PyObject *)unicode;
1751
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001752 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001753 Py_XDECREF(errorHandler);
1754 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001755 Py_DECREF(unicode);
1756 return NULL;
1757}
1758
1759
1760PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001761 Py_ssize_t size,
Antoine Pitrou653dece2009-05-04 18:32:32 +00001762 int base64SetO,
1763 int base64WhiteSpace,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001764 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001765{
1766 PyObject *v;
1767 /* It might be possible to tighten this worst case */
Alexandre Vassalottifd009162009-07-07 02:17:30 +00001768 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001769 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001770 Py_ssize_t i = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001771 unsigned int base64bits = 0;
1772 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001773 char * out;
1774 char * start;
1775
Alexandre Vassalottifd009162009-07-07 02:17:30 +00001776 if (allocated / 8 != size)
Neal Norwitze7d8be82008-07-31 17:17:14 +00001777 return PyErr_NoMemory();
1778
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001779 if (size == 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00001780 return PyString_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001781
Antoine Pitrou653dece2009-05-04 18:32:32 +00001782 v = PyString_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001783 if (v == NULL)
1784 return NULL;
1785
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001786 start = out = PyString_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001787 for (;i < size; ++i) {
1788 Py_UNICODE ch = s[i];
1789
Antoine Pitrou653dece2009-05-04 18:32:32 +00001790 if (inShift) {
1791 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1792 /* shifting out */
1793 if (base64bits) { /* output remaining bits */
1794 *out++ = TO_BASE64(base64buffer << (6-base64bits));
1795 base64buffer = 0;
1796 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001797 }
1798 inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001799 /* Characters not in the BASE64 set implicitly unshift the sequence
1800 so no '-' is required, except if the character is itself a '-' */
1801 if (IS_BASE64(ch) || ch == '-') {
1802 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001803 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001804 *out++ = (char) ch;
1805 }
1806 else {
1807 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00001808 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001809 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001810 else { /* not in a shift sequence */
1811 if (ch == '+') {
1812 *out++ = '+';
1813 *out++ = '-';
1814 }
1815 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1816 *out++ = (char) ch;
1817 }
1818 else {
1819 *out++ = '+';
1820 inShift = 1;
1821 goto encode_char;
1822 }
1823 }
1824 continue;
1825encode_char:
1826#ifdef Py_UNICODE_WIDE
1827 if (ch >= 0x10000) {
1828 /* code first surrogate */
1829 base64bits += 16;
1830 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
1831 while (base64bits >= 6) {
1832 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1833 base64bits -= 6;
1834 }
1835 /* prepare second surrogate */
1836 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
1837 }
1838#endif
1839 base64bits += 16;
1840 base64buffer = (base64buffer << 16) | ch;
1841 while (base64bits >= 6) {
1842 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1843 base64bits -= 6;
1844 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001845 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001846 if (base64bits)
1847 *out++= TO_BASE64(base64buffer << (6-base64bits) );
1848 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001849 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001850
Benjamin Petersonbea424a2010-04-03 00:57:33 +00001851 if (_PyString_Resize(&v, out - start))
1852 return NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001853 return v;
1854}
1855
Antoine Pitrou653dece2009-05-04 18:32:32 +00001856#undef IS_BASE64
1857#undef FROM_BASE64
1858#undef TO_BASE64
1859#undef DECODE_DIRECT
1860#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001861
/* --- UTF-8 Codec -------------------------------------------------------- */
1863
/* Map UTF-8 encoded prefix byte to sequence length.  Zero means
   illegal prefix.  See RFC 3629 for details.
   The table is only ever read, so it is declared const. */
static
const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
1885
Guido van Rossumd57fd912000-03-10 22:53:23 +00001886PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001887 Py_ssize_t size,
1888 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001889{
Walter Dörwald69652032004-09-07 20:24:22 +00001890 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1891}
1892
1893PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001894 Py_ssize_t size,
1895 const char *errors,
1896 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001897{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001898 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001899 int n;
Ezio Melottie57e50c2010-06-05 17:51:07 +00001900 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001901 Py_ssize_t startinpos;
1902 Py_ssize_t endinpos;
1903 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001904 const char *e;
1905 PyUnicodeObject *unicode;
1906 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001907 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001908 PyObject *errorHandler = NULL;
1909 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001910
1911 /* Note: size will always be longer than the resulting Unicode
1912 character count */
1913 unicode = _PyUnicode_New(size);
1914 if (!unicode)
1915 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001916 if (size == 0) {
1917 if (consumed)
1918 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001919 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001920 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001921
1922 /* Unpack UTF-8 encoded data */
1923 p = unicode->str;
1924 e = s + size;
1925
1926 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001927 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001928
1929 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001930 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001931 s++;
1932 continue;
1933 }
1934
1935 n = utf8_code_length[ch];
1936
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001937 if (s + n > e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001938 if (consumed)
1939 break;
1940 else {
1941 errmsg = "unexpected end of data";
1942 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00001943 endinpos = startinpos+1;
1944 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
1945 endinpos++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001946 goto utf8Error;
1947 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00001948 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001949
1950 switch (n) {
1951
1952 case 0:
Ezio Melottie57e50c2010-06-05 17:51:07 +00001953 errmsg = "invalid start byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001954 startinpos = s-starts;
1955 endinpos = startinpos+1;
1956 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001957
1958 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001959 errmsg = "internal error";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001960 startinpos = s-starts;
1961 endinpos = startinpos+1;
1962 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001963
1964 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001965 if ((s[1] & 0xc0) != 0x80) {
Ezio Melottie57e50c2010-06-05 17:51:07 +00001966 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001967 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00001968 endinpos = startinpos + 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001969 goto utf8Error;
1970 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001971 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melottie57e50c2010-06-05 17:51:07 +00001972 assert ((ch > 0x007F) && (ch <= 0x07FF));
1973 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001974 break;
1975
1976 case 3:
Ezio Melottie57e50c2010-06-05 17:51:07 +00001977 /* XXX: surrogates shouldn't be valid UTF-8!
1978 see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
1979 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
1980 Uncomment the 2 lines below to make them invalid,
1981 codepoints: d800-dfff; UTF-8: \xed\xa0\x80-\xed\xbf\xbf. */
Tim Petersced69f82003-09-16 20:30:58 +00001982 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melottie57e50c2010-06-05 17:51:07 +00001983 (s[2] & 0xc0) != 0x80 ||
1984 ((unsigned char)s[0] == 0xE0 &&
1985 (unsigned char)s[1] < 0xA0)/* ||
1986 ((unsigned char)s[0] == 0xED &&
1987 (unsigned char)s[1] > 0x9F)*/) {
1988 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001989 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00001990 endinpos = startinpos + 1;
1991
1992 /* if s[1] first two bits are 1 and 0, then the invalid
1993 continuation byte is s[2], so increment endinpos by 1,
1994 if not, s[1] is invalid and endinpos doesn't need to
1995 be incremented. */
1996 if ((s[1] & 0xC0) == 0x80)
1997 endinpos++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001998 goto utf8Error;
1999 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002000 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melottie57e50c2010-06-05 17:51:07 +00002001 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2002 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002003 break;
2004
2005 case 4:
2006 if ((s[1] & 0xc0) != 0x80 ||
2007 (s[2] & 0xc0) != 0x80 ||
Ezio Melottie57e50c2010-06-05 17:51:07 +00002008 (s[3] & 0xc0) != 0x80 ||
2009 ((unsigned char)s[0] == 0xF0 &&
2010 (unsigned char)s[1] < 0x90) ||
2011 ((unsigned char)s[0] == 0xF4 &&
2012 (unsigned char)s[1] > 0x8F)) {
2013 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002014 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00002015 endinpos = startinpos + 1;
2016 if ((s[1] & 0xC0) == 0x80) {
2017 endinpos++;
2018 if ((s[2] & 0xC0) == 0x80)
2019 endinpos++;
2020 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002021 goto utf8Error;
2022 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002023 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melottie57e50c2010-06-05 17:51:07 +00002024 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2025 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2026
Fredrik Lundh8f455852001-06-27 18:59:43 +00002027#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002028 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002029#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002030 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002031
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002032 /* translate from 10000..10FFFF to 0..FFFF */
2033 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002034
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002035 /* high surrogate = top 10 bits added to D800 */
2036 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002037
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002038 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002039 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002040#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002041 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002042 }
2043 s += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002044 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002045
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002046 utf8Error:
2047 outpos = p-PyUnicode_AS_UNICODE(unicode);
2048 if (unicode_decode_call_errorhandler(
2049 errors, &errorHandler,
2050 "utf8", errmsg,
2051 starts, size, &startinpos, &endinpos, &exc, &s,
2052 &unicode, &outpos, &p))
2053 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002054 }
Walter Dörwald69652032004-09-07 20:24:22 +00002055 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002056 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002057
2058 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002059 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002060 goto onError;
2061
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002062 Py_XDECREF(errorHandler);
2063 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002064 return (PyObject *)unicode;
2065
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002066 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002067 Py_XDECREF(errorHandler);
2068 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002069 Py_DECREF(unicode);
2070 return NULL;
2071}
2072
Tim Peters602f7402002-04-27 18:03:26 +00002073/* Allocation strategy: if the string is short, convert into a stack buffer
2074 and allocate exactly as much space needed at the end. Else allocate the
2075 maximum possible needed (4 result bytes per Unicode character), and return
2076 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002077*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002078PyObject *
2079PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002080 Py_ssize_t size,
2081 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002082{
Tim Peters602f7402002-04-27 18:03:26 +00002083#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002084
Martin v. Löwis18e16552006-02-15 17:27:45 +00002085 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00002086 PyObject *v; /* result string object */
2087 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002088 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002089 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002090 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002091
Tim Peters602f7402002-04-27 18:03:26 +00002092 assert(s != NULL);
2093 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002094
Tim Peters602f7402002-04-27 18:03:26 +00002095 if (size <= MAX_SHORT_UNICHARS) {
2096 /* Write into the stack buffer; nallocated can't overflow.
2097 * At the end, we'll allocate exactly as much heap space as it
2098 * turns out we need.
2099 */
2100 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2101 v = NULL; /* will allocate after we're done */
2102 p = stackbuf;
2103 }
2104 else {
2105 /* Overallocate on the heap, and give the excess back at the end. */
2106 nallocated = size * 4;
2107 if (nallocated / 4 != size) /* overflow! */
2108 return PyErr_NoMemory();
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002109 v = PyString_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00002110 if (v == NULL)
2111 return NULL;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002112 p = PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002113 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002114
Tim Peters602f7402002-04-27 18:03:26 +00002115 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002116 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002117
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002118 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002119 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002120 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002121
Guido van Rossumd57fd912000-03-10 22:53:23 +00002122 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002123 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002124 *p++ = (char)(0xc0 | (ch >> 6));
2125 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002126 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002127 else {
Tim Peters602f7402002-04-27 18:03:26 +00002128 /* Encode UCS2 Unicode ordinals */
2129 if (ch < 0x10000) {
2130 /* Special case: check for high surrogate */
2131 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2132 Py_UCS4 ch2 = s[i];
2133 /* Check for low surrogate and combine the two to
2134 form a UCS4 value */
2135 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002136 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002137 i++;
2138 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002139 }
Tim Peters602f7402002-04-27 18:03:26 +00002140 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002141 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002142 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002143 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2144 *p++ = (char)(0x80 | (ch & 0x3f));
2145 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00002146 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002147 encodeUCS4:
Tim Peters602f7402002-04-27 18:03:26 +00002148 /* Encode UCS4 Unicode ordinals */
2149 *p++ = (char)(0xf0 | (ch >> 18));
2150 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2151 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2152 *p++ = (char)(0x80 | (ch & 0x3f));
2153 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002154 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002155
Tim Peters602f7402002-04-27 18:03:26 +00002156 if (v == NULL) {
2157 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002158 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002159 assert(nneeded <= nallocated);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002160 v = PyString_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002161 }
2162 else {
Benjamin Peterson857ce152009-01-31 16:29:18 +00002163 /* Cut back to size actually needed. */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002164 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002165 assert(nneeded <= nallocated);
Benjamin Petersonbea424a2010-04-03 00:57:33 +00002166 if (_PyString_Resize(&v, nneeded))
2167 return NULL;
Tim Peters602f7402002-04-27 18:03:26 +00002168 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002169 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002170
Tim Peters602f7402002-04-27 18:03:26 +00002171#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002172}
2173
Guido van Rossumd57fd912000-03-10 22:53:23 +00002174PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2175{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002176 if (!PyUnicode_Check(unicode)) {
2177 PyErr_BadArgument();
2178 return NULL;
2179 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002180 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002181 PyUnicode_GET_SIZE(unicode),
2182 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002183}
2184
Walter Dörwald6e390802007-08-17 16:41:28 +00002185/* --- UTF-32 Codec ------------------------------------------------------- */
2186
2187PyObject *
2188PyUnicode_DecodeUTF32(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002189 Py_ssize_t size,
2190 const char *errors,
2191 int *byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002192{
2193 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2194}
2195
2196PyObject *
2197PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002198 Py_ssize_t size,
2199 const char *errors,
2200 int *byteorder,
2201 Py_ssize_t *consumed)
Walter Dörwald6e390802007-08-17 16:41:28 +00002202{
2203 const char *starts = s;
2204 Py_ssize_t startinpos;
2205 Py_ssize_t endinpos;
2206 Py_ssize_t outpos;
2207 PyUnicodeObject *unicode;
2208 Py_UNICODE *p;
2209#ifndef Py_UNICODE_WIDE
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002210 int pairs = 0;
Benjamin Peterson8e5effa2010-06-12 17:47:06 +00002211 const unsigned char *qq;
Walter Dörwald6e390802007-08-17 16:41:28 +00002212#else
2213 const int pairs = 0;
2214#endif
Benjamin Peterson8e5effa2010-06-12 17:47:06 +00002215 const unsigned char *q, *e;
Walter Dörwald6e390802007-08-17 16:41:28 +00002216 int bo = 0; /* assume native ordering by default */
2217 const char *errmsg = "";
Walter Dörwald20b40d32007-08-17 16:52:50 +00002218 /* Offsets from q for retrieving bytes in the right order. */
2219#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2220 int iorder[] = {0, 1, 2, 3};
2221#else
2222 int iorder[] = {3, 2, 1, 0};
2223#endif
Walter Dörwald9ab80a92007-08-17 16:58:43 +00002224 PyObject *errorHandler = NULL;
2225 PyObject *exc = NULL;
Chris Jerdonekad4b0002012-10-07 20:37:54 -07002226
Walter Dörwald6e390802007-08-17 16:41:28 +00002227 q = (unsigned char *)s;
2228 e = q + size;
2229
2230 if (byteorder)
2231 bo = *byteorder;
2232
2233 /* Check for BOM marks (U+FEFF) in the input and adjust current
2234 byte order setting accordingly. In native mode, the leading BOM
2235 mark is skipped, in all other modes, it is copied to the output
2236 stream as-is (giving a ZWNBSP character). */
2237 if (bo == 0) {
2238 if (size >= 4) {
2239 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002240 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002241#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002242 if (bom == 0x0000FEFF) {
2243 q += 4;
2244 bo = -1;
2245 }
2246 else if (bom == 0xFFFE0000) {
2247 q += 4;
2248 bo = 1;
2249 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002250#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002251 if (bom == 0x0000FEFF) {
2252 q += 4;
2253 bo = 1;
2254 }
2255 else if (bom == 0xFFFE0000) {
2256 q += 4;
2257 bo = -1;
2258 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002259#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002260 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002261 }
2262
2263 if (bo == -1) {
2264 /* force LE */
2265 iorder[0] = 0;
2266 iorder[1] = 1;
2267 iorder[2] = 2;
2268 iorder[3] = 3;
2269 }
2270 else if (bo == 1) {
2271 /* force BE */
2272 iorder[0] = 3;
2273 iorder[1] = 2;
2274 iorder[2] = 1;
2275 iorder[3] = 0;
2276 }
2277
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002278 /* On narrow builds we split characters outside the BMP into two
2279 codepoints => count how much extra space we need. */
2280#ifndef Py_UNICODE_WIDE
Serhiy Storchakac9631a12013-01-08 22:43:18 +02002281 for (qq = q; e - qq >= 4; qq += 4)
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002282 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2283 pairs++;
2284#endif
2285
2286 /* This might be one to much, because of a BOM */
2287 unicode = _PyUnicode_New((size+3)/4+pairs);
2288 if (!unicode)
2289 return NULL;
2290 if (size == 0)
2291 return (PyObject *)unicode;
2292
2293 /* Unpack UTF-32 encoded data */
2294 p = unicode->str;
2295
Walter Dörwald6e390802007-08-17 16:41:28 +00002296 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002297 Py_UCS4 ch;
2298 /* remaining bytes at the end? (size should be divisible by 4) */
2299 if (e-q<4) {
2300 if (consumed)
2301 break;
2302 errmsg = "truncated data";
2303 startinpos = ((const char *)q)-starts;
2304 endinpos = ((const char *)e)-starts;
2305 goto utf32Error;
2306 /* The remaining input chars are ignored if the callback
2307 chooses to skip the input */
2308 }
2309 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2310 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002311
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002312 if (ch >= 0x110000)
2313 {
2314 errmsg = "codepoint not in range(0x110000)";
2315 startinpos = ((const char *)q)-starts;
2316 endinpos = startinpos+4;
2317 goto utf32Error;
2318 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002319#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002320 if (ch >= 0x10000)
2321 {
2322 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2323 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2324 }
2325 else
Walter Dörwald6e390802007-08-17 16:41:28 +00002326#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002327 *p++ = ch;
2328 q += 4;
2329 continue;
2330 utf32Error:
2331 outpos = p-PyUnicode_AS_UNICODE(unicode);
2332 if (unicode_decode_call_errorhandler(
2333 errors, &errorHandler,
2334 "utf32", errmsg,
Georg Brandle9741f32009-09-17 11:28:09 +00002335 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002336 &unicode, &outpos, &p))
2337 goto onError;
Walter Dörwald6e390802007-08-17 16:41:28 +00002338 }
2339
2340 if (byteorder)
2341 *byteorder = bo;
2342
2343 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002344 *consumed = (const char *)q-starts;
Walter Dörwald6e390802007-08-17 16:41:28 +00002345
2346 /* Adjust length */
2347 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2348 goto onError;
2349
2350 Py_XDECREF(errorHandler);
2351 Py_XDECREF(exc);
2352 return (PyObject *)unicode;
2353
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002354 onError:
Walter Dörwald6e390802007-08-17 16:41:28 +00002355 Py_DECREF(unicode);
2356 Py_XDECREF(errorHandler);
2357 Py_XDECREF(exc);
2358 return NULL;
2359}
2360
2361PyObject *
2362PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002363 Py_ssize_t size,
2364 const char *errors,
2365 int byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002366{
2367 PyObject *v;
2368 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002369 Py_ssize_t nsize, bytesize;
Walter Dörwald6e390802007-08-17 16:41:28 +00002370#ifndef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002371 Py_ssize_t i, pairs;
Walter Dörwald6e390802007-08-17 16:41:28 +00002372#else
2373 const int pairs = 0;
2374#endif
2375 /* Offsets from p for storing byte pairs in the right order. */
2376#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2377 int iorder[] = {0, 1, 2, 3};
2378#else
2379 int iorder[] = {3, 2, 1, 0};
2380#endif
2381
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002382#define STORECHAR(CH) \
2383 do { \
2384 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2385 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2386 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2387 p[iorder[0]] = (CH) & 0xff; \
2388 p += 4; \
Walter Dörwald6e390802007-08-17 16:41:28 +00002389 } while(0)
2390
2391 /* In narrow builds we can output surrogate pairs as one codepoint,
2392 so we need less space. */
2393#ifndef Py_UNICODE_WIDE
2394 for (i = pairs = 0; i < size-1; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002395 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2396 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2397 pairs++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002398#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002399 nsize = (size - pairs + (byteorder == 0));
2400 bytesize = nsize * 4;
2401 if (bytesize / 4 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002402 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002403 v = PyString_FromStringAndSize(NULL, bytesize);
Walter Dörwald6e390802007-08-17 16:41:28 +00002404 if (v == NULL)
2405 return NULL;
2406
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002407 p = (unsigned char *)PyString_AS_STRING(v);
Walter Dörwald6e390802007-08-17 16:41:28 +00002408 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002409 STORECHAR(0xFEFF);
Walter Dörwald6e390802007-08-17 16:41:28 +00002410 if (size == 0)
2411 return v;
2412
2413 if (byteorder == -1) {
2414 /* force LE */
2415 iorder[0] = 0;
2416 iorder[1] = 1;
2417 iorder[2] = 2;
2418 iorder[3] = 3;
2419 }
2420 else if (byteorder == 1) {
2421 /* force BE */
2422 iorder[0] = 3;
2423 iorder[1] = 2;
2424 iorder[2] = 1;
2425 iorder[3] = 0;
2426 }
2427
2428 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002429 Py_UCS4 ch = *s++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002430#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002431 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2432 Py_UCS4 ch2 = *s;
2433 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2434 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2435 s++;
2436 size--;
2437 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00002438 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002439#endif
2440 STORECHAR(ch);
2441 }
2442 return v;
2443#undef STORECHAR
2444}
2445
2446PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2447{
2448 if (!PyUnicode_Check(unicode)) {
2449 PyErr_BadArgument();
2450 return NULL;
2451 }
2452 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002453 PyUnicode_GET_SIZE(unicode),
2454 NULL,
2455 0);
Walter Dörwald6e390802007-08-17 16:41:28 +00002456}
2457
Guido van Rossumd57fd912000-03-10 22:53:23 +00002458/* --- UTF-16 Codec ------------------------------------------------------- */
2459
Tim Peters772747b2001-08-09 22:21:55 +00002460PyObject *
2461PyUnicode_DecodeUTF16(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002462 Py_ssize_t size,
2463 const char *errors,
2464 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002465{
Walter Dörwald69652032004-09-07 20:24:22 +00002466 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2467}
2468
2469PyObject *
2470PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002471 Py_ssize_t size,
2472 const char *errors,
2473 int *byteorder,
2474 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002475{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002476 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002477 Py_ssize_t startinpos;
2478 Py_ssize_t endinpos;
2479 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002480 PyUnicodeObject *unicode;
2481 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002482 const unsigned char *q, *e;
2483 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002484 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002485 /* Offsets from q for retrieving byte pairs in the right order. */
2486#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2487 int ihi = 1, ilo = 0;
2488#else
2489 int ihi = 0, ilo = 1;
2490#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002491 PyObject *errorHandler = NULL;
2492 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002493
2494 /* Note: size will always be longer than the resulting Unicode
2495 character count */
2496 unicode = _PyUnicode_New(size);
2497 if (!unicode)
2498 return NULL;
2499 if (size == 0)
2500 return (PyObject *)unicode;
2501
2502 /* Unpack UTF-16 encoded data */
2503 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002504 q = (unsigned char *)s;
2505 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002506
2507 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002508 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002509
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002510 /* Check for BOM marks (U+FEFF) in the input and adjust current
2511 byte order setting accordingly. In native mode, the leading BOM
2512 mark is skipped, in all other modes, it is copied to the output
2513 stream as-is (giving a ZWNBSP character). */
2514 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002515 if (size >= 2) {
2516 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002517#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002518 if (bom == 0xFEFF) {
2519 q += 2;
2520 bo = -1;
2521 }
2522 else if (bom == 0xFFFE) {
2523 q += 2;
2524 bo = 1;
2525 }
Tim Petersced69f82003-09-16 20:30:58 +00002526#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002527 if (bom == 0xFEFF) {
2528 q += 2;
2529 bo = 1;
2530 }
2531 else if (bom == 0xFFFE) {
2532 q += 2;
2533 bo = -1;
2534 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002535#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002536 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002537 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002538
Tim Peters772747b2001-08-09 22:21:55 +00002539 if (bo == -1) {
2540 /* force LE */
2541 ihi = 1;
2542 ilo = 0;
2543 }
2544 else if (bo == 1) {
2545 /* force BE */
2546 ihi = 0;
2547 ilo = 1;
2548 }
2549
2550 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002551 Py_UNICODE ch;
2552 /* remaining bytes at the end? (size should be even) */
2553 if (e-q<2) {
2554 if (consumed)
2555 break;
2556 errmsg = "truncated data";
2557 startinpos = ((const char *)q)-starts;
2558 endinpos = ((const char *)e)-starts;
2559 goto utf16Error;
2560 /* The remaining input chars are ignored if the callback
2561 chooses to skip the input */
2562 }
2563 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002564
Benjamin Peterson857ce152009-01-31 16:29:18 +00002565 q += 2;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002566
2567 if (ch < 0xD800 || ch > 0xDFFF) {
2568 *p++ = ch;
2569 continue;
2570 }
2571
2572 /* UTF-16 code pair: */
Antoine Pitrou715a63b2012-07-21 00:52:06 +02002573 if (e - q < 2) {
Serhiy Storchakac4b82c02013-01-08 23:12:00 +02002574 q -= 2;
2575 if (consumed)
2576 break;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002577 errmsg = "unexpected end of data";
Serhiy Storchakac4b82c02013-01-08 23:12:00 +02002578 startinpos = ((const char *)q)-starts;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002579 endinpos = ((const char *)e)-starts;
2580 goto utf16Error;
2581 }
2582 if (0xD800 <= ch && ch <= 0xDBFF) {
2583 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2584 q += 2;
2585 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002586#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002587 *p++ = ch;
2588 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002589#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002590 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002591#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002592 continue;
2593 }
2594 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002595 errmsg = "illegal UTF-16 surrogate";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002596 startinpos = (((const char *)q)-4)-starts;
2597 endinpos = startinpos+2;
2598 goto utf16Error;
2599 }
2600
Benjamin Peterson857ce152009-01-31 16:29:18 +00002601 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002602 errmsg = "illegal encoding";
2603 startinpos = (((const char *)q)-2)-starts;
2604 endinpos = startinpos+2;
2605 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002606
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002607 utf16Error:
2608 outpos = p-PyUnicode_AS_UNICODE(unicode);
2609 if (unicode_decode_call_errorhandler(
2610 errors, &errorHandler,
2611 "utf16", errmsg,
2612 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2613 &unicode, &outpos, &p))
2614 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002615 }
2616
2617 if (byteorder)
2618 *byteorder = bo;
2619
Walter Dörwald69652032004-09-07 20:24:22 +00002620 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002621 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00002622
Guido van Rossumd57fd912000-03-10 22:53:23 +00002623 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002624 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002625 goto onError;
2626
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002627 Py_XDECREF(errorHandler);
2628 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002629 return (PyObject *)unicode;
2630
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002631 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002632 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002633 Py_XDECREF(errorHandler);
2634 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002635 return NULL;
2636}
2637
Tim Peters772747b2001-08-09 22:21:55 +00002638PyObject *
2639PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002640 Py_ssize_t size,
2641 const char *errors,
2642 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002643{
2644 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002645 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002646 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002647#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002648 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002649#else
2650 const int pairs = 0;
2651#endif
Tim Peters772747b2001-08-09 22:21:55 +00002652 /* Offsets from p for storing byte pairs in the right order. */
2653#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2654 int ihi = 1, ilo = 0;
2655#else
2656 int ihi = 0, ilo = 1;
2657#endif
2658
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002659#define STORECHAR(CH) \
2660 do { \
2661 p[ihi] = ((CH) >> 8) & 0xff; \
2662 p[ilo] = (CH) & 0xff; \
2663 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00002664 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002665
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002666#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002667 for (i = pairs = 0; i < size; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002668 if (s[i] >= 0x10000)
2669 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002670#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002671 /* 2 * (size + pairs + (byteorder == 0)) */
2672 if (size > PY_SSIZE_T_MAX ||
2673 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002674 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002675 nsize = size + pairs + (byteorder == 0);
2676 bytesize = nsize * 2;
2677 if (bytesize / 2 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002678 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002679 v = PyString_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002680 if (v == NULL)
2681 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002682
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002683 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002684 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002685 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002686 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002687 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002688
2689 if (byteorder == -1) {
2690 /* force LE */
2691 ihi = 1;
2692 ilo = 0;
2693 }
2694 else if (byteorder == 1) {
2695 /* force BE */
2696 ihi = 0;
2697 ilo = 1;
2698 }
2699
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002700 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002701 Py_UNICODE ch = *s++;
2702 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002703#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002704 if (ch >= 0x10000) {
2705 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2706 ch = 0xD800 | ((ch-0x10000) >> 10);
2707 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002708#endif
Tim Peters772747b2001-08-09 22:21:55 +00002709 STORECHAR(ch);
2710 if (ch2)
2711 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002712 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002713 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002714#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002715}
2716
2717PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2718{
2719 if (!PyUnicode_Check(unicode)) {
2720 PyErr_BadArgument();
2721 return NULL;
2722 }
2723 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002724 PyUnicode_GET_SIZE(unicode),
2725 NULL,
2726 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002727}
2728
2729/* --- Unicode Escape Codec ----------------------------------------------- */
2730
Fredrik Lundh06d12682001-01-24 07:59:11 +00002731static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002732
Guido van Rossumd57fd912000-03-10 22:53:23 +00002733PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002734 Py_ssize_t size,
2735 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002736{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002737 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002738 Py_ssize_t startinpos;
2739 Py_ssize_t endinpos;
2740 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002741 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002742 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002743 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002744 char* message;
2745 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002746 PyObject *errorHandler = NULL;
2747 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002748
Guido van Rossumd57fd912000-03-10 22:53:23 +00002749 /* Escaped strings will always be longer than the resulting
2750 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002751 length after conversion to the true value.
2752 (but if the error callback returns a long replacement string
2753 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002754 v = _PyUnicode_New(size);
2755 if (v == NULL)
2756 goto onError;
2757 if (size == 0)
2758 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002759
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002760 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002761 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002762
Guido van Rossumd57fd912000-03-10 22:53:23 +00002763 while (s < end) {
2764 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002765 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002766 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002767
2768 /* Non-escape characters are interpreted as Unicode ordinals */
2769 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002770 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002771 continue;
2772 }
2773
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002774 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002775 /* \ - Escapes */
2776 s++;
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002777 c = *s++;
2778 if (s > end)
2779 c = '\0'; /* Invalid after \ */
2780 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002781
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002782 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002783 case '\n': break;
2784 case '\\': *p++ = '\\'; break;
2785 case '\'': *p++ = '\''; break;
2786 case '\"': *p++ = '\"'; break;
2787 case 'b': *p++ = '\b'; break;
2788 case 'f': *p++ = '\014'; break; /* FF */
2789 case 't': *p++ = '\t'; break;
2790 case 'n': *p++ = '\n'; break;
2791 case 'r': *p++ = '\r'; break;
2792 case 'v': *p++ = '\013'; break; /* VT */
2793 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2794
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002795 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002796 case '0': case '1': case '2': case '3':
2797 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002798 x = s[-1] - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002799 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002800 x = (x<<3) + *s++ - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002801 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002802 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002803 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002804 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002805 break;
2806
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002807 /* hex escapes */
2808 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002809 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002810 digits = 2;
2811 message = "truncated \\xXX escape";
2812 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002813
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002814 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002815 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002816 digits = 4;
2817 message = "truncated \\uXXXX escape";
2818 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002819
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002820 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002821 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002822 digits = 8;
2823 message = "truncated \\UXXXXXXXX escape";
2824 hexescape:
2825 chr = 0;
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002826 if (end - s < digits) {
2827 /* count only hex digits */
2828 for (; s < end; ++s) {
2829 c = (unsigned char)*s;
2830 if (!Py_ISXDIGIT(c))
2831 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002832 }
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002833 goto error;
2834 }
2835 for (; digits--; ++s) {
2836 c = (unsigned char)*s;
2837 if (!Py_ISXDIGIT(c))
2838 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002839 chr = (chr<<4) & ~0xF;
2840 if (c >= '0' && c <= '9')
2841 chr += c - '0';
2842 else if (c >= 'a' && c <= 'f')
2843 chr += 10 + c - 'a';
2844 else
2845 chr += 10 + c - 'A';
2846 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002847 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002848 /* _decoding_error will have already written into the
2849 target buffer. */
2850 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002851 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002852 /* when we get here, chr is a 32-bit unicode character */
2853 if (chr <= 0xffff)
2854 /* UCS-2 character */
2855 *p++ = (Py_UNICODE) chr;
2856 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002857 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002858 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002859#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002860 *p++ = chr;
2861#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002862 chr -= 0x10000L;
2863 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002864 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002865#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002866 } else {
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002867 message = "illegal Unicode character";
2868 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002869 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002870 break;
2871
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002872 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00002873 case 'N':
2874 message = "malformed \\N character escape";
2875 if (ucnhash_CAPI == NULL) {
2876 /* load the unicode data module */
Larry Hastings402b73f2010-03-25 00:54:54 +00002877 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002878 if (ucnhash_CAPI == NULL)
2879 goto ucnhashError;
2880 }
2881 if (*s == '{') {
2882 const char *start = s+1;
2883 /* look for the closing brace */
2884 while (*s != '}' && s < end)
2885 s++;
2886 if (s > start && s < end && *s == '}') {
2887 /* found a name. look it up in the unicode database */
2888 message = "unknown Unicode character name";
2889 s++;
Serhiy Storchaka1d3acd42013-01-21 11:48:24 +02002890 if (s - start - 1 <= INT_MAX &&
2891 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002892 goto store;
2893 }
2894 }
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002895 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002896
2897 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002898 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002899 message = "\\ at end of string";
2900 s--;
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002901 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00002902 }
2903 else {
2904 *p++ = '\\';
2905 *p++ = (unsigned char)s[-1];
2906 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002907 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002908 }
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002909 continue;
2910
2911 error:
2912 endinpos = s-starts;
2913 outpos = p-PyUnicode_AS_UNICODE(v);
2914 if (unicode_decode_call_errorhandler(
2915 errors, &errorHandler,
2916 "unicodeescape", message,
2917 starts, size, &startinpos, &endinpos, &exc, &s,
2918 &v, &outpos, &p))
2919 goto onError;
2920 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002921 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002922 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002923 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002924 Py_XDECREF(errorHandler);
2925 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002926 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002927
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002928 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002929 PyErr_SetString(
2930 PyExc_UnicodeError,
2931 "\\N escapes not supported (can't load unicodedata module)"
2932 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002933 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002934 Py_XDECREF(errorHandler);
2935 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002936 return NULL;
2937
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002938 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002939 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002940 Py_XDECREF(errorHandler);
2941 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002942 return NULL;
2943}
2944
2945/* Return a Unicode-Escape string version of the Unicode object.
2946
2947 If quotes is true, the string is enclosed in u"" or u'' quotes as
2948 appropriate.
2949
2950*/
2951
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00002952Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002953 Py_ssize_t size,
2954 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00002955{
2956 /* like wcschr, but doesn't stop at NULL characters */
2957
2958 while (size-- > 0) {
2959 if (*s == ch)
2960 return s;
2961 s++;
2962 }
2963
2964 return NULL;
2965}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002966
Guido van Rossumd57fd912000-03-10 22:53:23 +00002967static
2968PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002969 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002970 int quotes)
2971{
2972 PyObject *repr;
2973 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002974
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002975 static const char *hexdigit = "0123456789abcdef";
Neal Norwitze7d8be82008-07-31 17:17:14 +00002976#ifdef Py_UNICODE_WIDE
2977 const Py_ssize_t expandsize = 10;
2978#else
2979 const Py_ssize_t expandsize = 6;
2980#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002981
Neal Norwitz17753ec2006-08-21 22:21:19 +00002982 /* XXX(nnorwitz): rather than over-allocating, it would be
2983 better to choose a different scheme. Perhaps scan the
2984 first N-chars of the string and allocate based on that size.
2985 */
2986 /* Initial allocation is based on the longest-possible unichr
2987 escape.
2988
2989 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2990 unichr, so in this case it's the longest unichr escape. In
2991 narrow (UTF-16) builds this is five chars per source unichr
2992 since there are two unichrs in the surrogate pair, so in narrow
2993 (UTF-16) builds it's not the longest unichr escape.
2994
2995 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2996 so in the narrow (UTF-16) build case it's the longest unichr
2997 escape.
2998 */
2999
Neal Norwitze7d8be82008-07-31 17:17:14 +00003000 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003001 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00003002
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003003 repr = PyString_FromStringAndSize(NULL,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003004 2
3005 + expandsize*size
3006 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003007 if (repr == NULL)
3008 return NULL;
3009
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003010 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003011
3012 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003013 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00003014 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00003015 !findchar(s, size, '"')) ? '"' : '\'';
3016 }
3017 while (size-- > 0) {
3018 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003019
Hye-Shik Chang835b2432005-12-17 04:38:31 +00003020 /* Escape quotes and backslashes */
3021 if ((quotes &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003022 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003023 *p++ = '\\';
3024 *p++ = (char) ch;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003025 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003026 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003027
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003028#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003029 /* Map 21-bit characters to '\U00xxxxxx' */
3030 else if (ch >= 0x10000) {
3031 *p++ = '\\';
3032 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003033 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
3034 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
3035 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
3036 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
3037 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
3038 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
3039 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003040 *p++ = hexdigit[ch & 0x0000000F];
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003041 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003042 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003043#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003044 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3045 else if (ch >= 0xD800 && ch < 0xDC00) {
3046 Py_UNICODE ch2;
3047 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003048
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003049 ch2 = *s++;
3050 size--;
Georg Brandle27d0442010-08-01 20:54:30 +00003051 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003052 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3053 *p++ = '\\';
3054 *p++ = 'U';
3055 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
3056 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
3057 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
3058 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
3059 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
3060 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
3061 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
3062 *p++ = hexdigit[ucs & 0x0000000F];
3063 continue;
3064 }
3065 /* Fall through: isolated surrogates are copied as-is */
3066 s--;
3067 size++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003068 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003069#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003070
Guido van Rossumd57fd912000-03-10 22:53:23 +00003071 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003072 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003073 *p++ = '\\';
3074 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003075 *p++ = hexdigit[(ch >> 12) & 0x000F];
3076 *p++ = hexdigit[(ch >> 8) & 0x000F];
3077 *p++ = hexdigit[(ch >> 4) & 0x000F];
3078 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003079 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003080
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003081 /* Map special whitespace to '\t', \n', '\r' */
3082 else if (ch == '\t') {
3083 *p++ = '\\';
3084 *p++ = 't';
3085 }
3086 else if (ch == '\n') {
3087 *p++ = '\\';
3088 *p++ = 'n';
3089 }
3090 else if (ch == '\r') {
3091 *p++ = '\\';
3092 *p++ = 'r';
3093 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003094
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003095 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003096 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003097 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003098 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003099 *p++ = hexdigit[(ch >> 4) & 0x000F];
3100 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003101 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003102
Guido van Rossumd57fd912000-03-10 22:53:23 +00003103 /* Copy everything else as-is */
3104 else
3105 *p++ = (char) ch;
3106 }
3107 if (quotes)
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003108 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003109
3110 *p = '\0';
Benjamin Petersonbea424a2010-04-03 00:57:33 +00003111 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
3112 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003113 return repr;
3114}
3115
3116PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003117 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003118{
3119 return unicodeescape_string(s, size, 0);
3120}
3121
3122PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3123{
3124 if (!PyUnicode_Check(unicode)) {
3125 PyErr_BadArgument();
3126 return NULL;
3127 }
3128 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003129 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003130}
3131
3132/* --- Raw Unicode Escape Codec ------------------------------------------- */
3133
3134PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003135 Py_ssize_t size,
3136 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003137{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003138 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003139 Py_ssize_t startinpos;
3140 Py_ssize_t endinpos;
3141 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003142 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003143 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003144 const char *end;
3145 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003146 PyObject *errorHandler = NULL;
3147 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003148
Guido van Rossumd57fd912000-03-10 22:53:23 +00003149 /* Escaped strings will always be longer than the resulting
3150 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003151 length after conversion to the true value. (But decoding error
3152 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003153 v = _PyUnicode_New(size);
3154 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003155 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003156 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003157 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003158 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003159 end = s + size;
3160 while (s < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003161 unsigned char c;
3162 Py_UCS4 x;
3163 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003164 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003165
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003166 /* Non-escape characters are interpreted as Unicode ordinals */
3167 if (*s != '\\') {
3168 *p++ = (unsigned char)*s++;
3169 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003170 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003171 startinpos = s-starts;
3172
3173 /* \u-escapes are only interpreted iff the number of leading
3174 backslashes if odd */
3175 bs = s;
3176 for (;s < end;) {
3177 if (*s != '\\')
3178 break;
3179 *p++ = (unsigned char)*s++;
3180 }
3181 if (((s - bs) & 1) == 0 ||
3182 s >= end ||
3183 (*s != 'u' && *s != 'U')) {
3184 continue;
3185 }
3186 p--;
3187 count = *s=='u' ? 4 : 8;
3188 s++;
3189
3190 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3191 outpos = p-PyUnicode_AS_UNICODE(v);
3192 for (x = 0, i = 0; i < count; ++i, ++s) {
3193 c = (unsigned char)*s;
3194 if (!isxdigit(c)) {
3195 endinpos = s-starts;
3196 if (unicode_decode_call_errorhandler(
3197 errors, &errorHandler,
3198 "rawunicodeescape", "truncated \\uXXXX",
3199 starts, size, &startinpos, &endinpos, &exc, &s,
3200 &v, &outpos, &p))
3201 goto onError;
3202 goto nextByte;
3203 }
3204 x = (x<<4) & ~0xF;
3205 if (c >= '0' && c <= '9')
3206 x += c - '0';
3207 else if (c >= 'a' && c <= 'f')
3208 x += 10 + c - 'a';
3209 else
3210 x += 10 + c - 'A';
3211 }
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003212 if (x <= 0xffff)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003213 /* UCS-2 character */
3214 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003215 else if (x <= 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003216 /* UCS-4 character. Either store directly, or as
3217 surrogate pair. */
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003218#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003219 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003220#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003221 x -= 0x10000L;
3222 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3223 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003224#endif
3225 } else {
3226 endinpos = s-starts;
3227 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003228 if (unicode_decode_call_errorhandler(
3229 errors, &errorHandler,
3230 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003231 starts, size, &startinpos, &endinpos, &exc, &s,
3232 &v, &outpos, &p))
3233 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003234 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003235 nextByte:
3236 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003237 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00003238 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003239 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003240 Py_XDECREF(errorHandler);
3241 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003242 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003243
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003244 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003245 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003246 Py_XDECREF(errorHandler);
3247 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003248 return NULL;
3249}
3250
3251PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003252 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003253{
3254 PyObject *repr;
3255 char *p;
3256 char *q;
3257
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003258 static const char *hexdigit = "0123456789abcdef";
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003259#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00003260 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003261#else
Neal Norwitze7d8be82008-07-31 17:17:14 +00003262 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003263#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00003264
Neal Norwitze7d8be82008-07-31 17:17:14 +00003265 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003266 return PyErr_NoMemory();
Benjamin Peterson857ce152009-01-31 16:29:18 +00003267
Neal Norwitze7d8be82008-07-31 17:17:14 +00003268 repr = PyString_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003269 if (repr == NULL)
3270 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003271 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003272 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003273
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003274 p = q = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003275 while (size-- > 0) {
3276 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003277#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003278 /* Map 32-bit characters to '\Uxxxxxxxx' */
3279 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003280 *p++ = '\\';
3281 *p++ = 'U';
3282 *p++ = hexdigit[(ch >> 28) & 0xf];
3283 *p++ = hexdigit[(ch >> 24) & 0xf];
3284 *p++ = hexdigit[(ch >> 20) & 0xf];
3285 *p++ = hexdigit[(ch >> 16) & 0xf];
3286 *p++ = hexdigit[(ch >> 12) & 0xf];
3287 *p++ = hexdigit[(ch >> 8) & 0xf];
3288 *p++ = hexdigit[(ch >> 4) & 0xf];
3289 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003290 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003291 else
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003292#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003293 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3294 if (ch >= 0xD800 && ch < 0xDC00) {
3295 Py_UNICODE ch2;
3296 Py_UCS4 ucs;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003297
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003298 ch2 = *s++;
3299 size--;
Georg Brandle27d0442010-08-01 20:54:30 +00003300 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003301 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3302 *p++ = '\\';
3303 *p++ = 'U';
3304 *p++ = hexdigit[(ucs >> 28) & 0xf];
3305 *p++ = hexdigit[(ucs >> 24) & 0xf];
3306 *p++ = hexdigit[(ucs >> 20) & 0xf];
3307 *p++ = hexdigit[(ucs >> 16) & 0xf];
3308 *p++ = hexdigit[(ucs >> 12) & 0xf];
3309 *p++ = hexdigit[(ucs >> 8) & 0xf];
3310 *p++ = hexdigit[(ucs >> 4) & 0xf];
3311 *p++ = hexdigit[ucs & 0xf];
3312 continue;
3313 }
3314 /* Fall through: isolated surrogates are copied as-is */
3315 s--;
3316 size++;
3317 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003318#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003319 /* Map 16-bit characters to '\uxxxx' */
3320 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003321 *p++ = '\\';
3322 *p++ = 'u';
3323 *p++ = hexdigit[(ch >> 12) & 0xf];
3324 *p++ = hexdigit[(ch >> 8) & 0xf];
3325 *p++ = hexdigit[(ch >> 4) & 0xf];
3326 *p++ = hexdigit[ch & 15];
3327 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003328 /* Copy everything else as-is */
3329 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003330 *p++ = (char) ch;
3331 }
3332 *p = '\0';
Benjamin Petersonbea424a2010-04-03 00:57:33 +00003333 if (_PyString_Resize(&repr, p - q))
3334 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003335 return repr;
3336}
3337
3338PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3339{
3340 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003341 PyErr_BadArgument();
3342 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003343 }
3344 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003345 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003346}
3347
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003348/* --- Unicode Internal Codec ------------------------------------------- */
3349
3350PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003351 Py_ssize_t size,
3352 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003353{
3354 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003355 Py_ssize_t startinpos;
3356 Py_ssize_t endinpos;
3357 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003358 PyUnicodeObject *v;
3359 Py_UNICODE *p;
3360 const char *end;
3361 const char *reason;
3362 PyObject *errorHandler = NULL;
3363 PyObject *exc = NULL;
3364
Neal Norwitzd43069c2006-01-08 01:12:10 +00003365#ifdef Py_UNICODE_WIDE
3366 Py_UNICODE unimax = PyUnicode_GetMax();
3367#endif
3368
Armin Rigo7ccbca92006-10-04 12:17:45 +00003369 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003370 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3371 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003372 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003373 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003374 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003375 p = PyUnicode_AS_UNICODE(v);
3376 end = s + size;
3377
3378 while (s < end) {
Serhiy Storchakad5327d92013-02-07 16:23:11 +02003379 if (end-s < Py_UNICODE_SIZE) {
3380 endinpos = end-starts;
3381 reason = "truncated input";
3382 goto error;
3383 }
Neal Norwitz1004a532006-05-15 07:17:23 +00003384 memcpy(p, s, sizeof(Py_UNICODE));
Serhiy Storchakad5327d92013-02-07 16:23:11 +02003385#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003386 /* We have to sanity check the raw data, otherwise doom looms for
3387 some malformed UCS-4 data. */
Serhiy Storchakad5327d92013-02-07 16:23:11 +02003388 if (*p > unimax || *p < 0) {
3389 endinpos = s - starts + Py_UNICODE_SIZE;
3390 reason = "illegal code point (> 0x10FFFF)";
3391 goto error;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003392 }
Serhiy Storchakad5327d92013-02-07 16:23:11 +02003393#endif
3394 p++;
3395 s += Py_UNICODE_SIZE;
3396 continue;
3397
3398 error:
3399 startinpos = s - starts;
3400 outpos = p - PyUnicode_AS_UNICODE(v);
3401 if (unicode_decode_call_errorhandler(
3402 errors, &errorHandler,
3403 "unicode_internal", reason,
3404 starts, size, &startinpos, &endinpos, &exc, &s,
3405 &v, &outpos, &p)) {
3406 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003407 }
3408 }
3409
Martin v. Löwis412fb672006-04-13 06:34:32 +00003410 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003411 goto onError;
3412 Py_XDECREF(errorHandler);
3413 Py_XDECREF(exc);
3414 return (PyObject *)v;
3415
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003416 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003417 Py_XDECREF(v);
3418 Py_XDECREF(errorHandler);
3419 Py_XDECREF(exc);
3420 return NULL;
3421}
3422
Guido van Rossumd57fd912000-03-10 22:53:23 +00003423/* --- Latin-1 Codec ------------------------------------------------------ */
3424
3425PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003426 Py_ssize_t size,
3427 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003428{
3429 PyUnicodeObject *v;
3430 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003431
Guido van Rossumd57fd912000-03-10 22:53:23 +00003432 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003433 if (size == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003434 Py_UNICODE r = *(unsigned char*)s;
3435 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003436 }
3437
Guido van Rossumd57fd912000-03-10 22:53:23 +00003438 v = _PyUnicode_New(size);
3439 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003440 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003441 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003442 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003443 p = PyUnicode_AS_UNICODE(v);
3444 while (size-- > 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003445 *p++ = (unsigned char)*s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003446 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003447
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003448 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003449 Py_XDECREF(v);
3450 return NULL;
3451}
3452
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003453/* create or adjust a UnicodeEncodeError */
3454static void make_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003455 const char *encoding,
3456 const Py_UNICODE *unicode, Py_ssize_t size,
3457 Py_ssize_t startpos, Py_ssize_t endpos,
3458 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003459{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003460 if (*exceptionObject == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003461 *exceptionObject = PyUnicodeEncodeError_Create(
3462 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003463 }
3464 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003465 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3466 goto onError;
3467 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3468 goto onError;
3469 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3470 goto onError;
3471 return;
3472 onError:
3473 Py_DECREF(*exceptionObject);
3474 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003475 }
3476}
3477
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003478/* raises a UnicodeEncodeError */
3479static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003480 const char *encoding,
3481 const Py_UNICODE *unicode, Py_ssize_t size,
3482 Py_ssize_t startpos, Py_ssize_t endpos,
3483 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003484{
3485 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003486 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003487 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003488 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003489}
3490
3491/* error handling callback helper:
3492 build arguments, call the callback and check the arguments,
3493 put the result into newpos and return the replacement string, which
3494 has to be freed by the caller */
3495static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003496 PyObject **errorHandler,
3497 const char *encoding, const char *reason,
3498 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3499 Py_ssize_t startpos, Py_ssize_t endpos,
3500 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003501{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003502 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003503
3504 PyObject *restuple;
3505 PyObject *resunicode;
3506
3507 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003508 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003509 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003510 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003511 }
3512
3513 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003514 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003515 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003516 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003517
3518 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003519 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003520 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003521 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003522 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00003523 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003524 Py_DECREF(restuple);
3525 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003526 }
3527 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003528 &resunicode, newpos)) {
3529 Py_DECREF(restuple);
3530 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003531 }
3532 if (*newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003533 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003534 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003535 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3536 Py_DECREF(restuple);
3537 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003538 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003539 Py_INCREF(resunicode);
3540 Py_DECREF(restuple);
3541 return resunicode;
3542}
3543
3544static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003545 Py_ssize_t size,
3546 const char *errors,
3547 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003548{
3549 /* output object */
3550 PyObject *res;
3551 /* pointers to the beginning and end+1 of input */
3552 const Py_UNICODE *startp = p;
3553 const Py_UNICODE *endp = p + size;
3554 /* pointer to the beginning of the unencodable characters */
3555 /* const Py_UNICODE *badp = NULL; */
3556 /* pointer into the output */
3557 char *str;
3558 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003559 Py_ssize_t respos = 0;
3560 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00003561 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3562 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003563 PyObject *errorHandler = NULL;
3564 PyObject *exc = NULL;
3565 /* the following variable is used for caching string comparisons
3566 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3567 int known_errorHandler = -1;
3568
3569 /* allocate enough for a simple encoding without
3570 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003571 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003572 if (res == NULL)
3573 goto onError;
3574 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003575 return res;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003576 str = PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003577 ressize = size;
3578
3579 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003580 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003581
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003582 /* can we encode this? */
3583 if (c<limit) {
3584 /* no overflow check, because we know that the space is enough */
3585 *str++ = (char)c;
3586 ++p;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003587 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003588 else {
3589 Py_ssize_t unicodepos = p-startp;
3590 Py_ssize_t requiredsize;
3591 PyObject *repunicode;
3592 Py_ssize_t repsize;
3593 Py_ssize_t newpos;
3594 Py_ssize_t respos;
3595 Py_UNICODE *uni2;
3596 /* startpos for collecting unencodable chars */
3597 const Py_UNICODE *collstart = p;
3598 const Py_UNICODE *collend = p;
3599 /* find all unecodable characters */
3600 while ((collend < endp) && ((*collend)>=limit))
3601 ++collend;
3602 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3603 if (known_errorHandler==-1) {
3604 if ((errors==NULL) || (!strcmp(errors, "strict")))
3605 known_errorHandler = 1;
3606 else if (!strcmp(errors, "replace"))
3607 known_errorHandler = 2;
3608 else if (!strcmp(errors, "ignore"))
3609 known_errorHandler = 3;
3610 else if (!strcmp(errors, "xmlcharrefreplace"))
3611 known_errorHandler = 4;
3612 else
3613 known_errorHandler = 0;
3614 }
3615 switch (known_errorHandler) {
3616 case 1: /* strict */
3617 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3618 goto onError;
3619 case 2: /* replace */
3620 while (collstart++<collend)
3621 *str++ = '?'; /* fall through */
3622 case 3: /* ignore */
3623 p = collend;
3624 break;
3625 case 4: /* xmlcharrefreplace */
3626 respos = str-PyString_AS_STRING(res);
3627 /* determine replacement size (temporarily (mis)uses p) */
3628 for (p = collstart, repsize = 0; p < collend; ++p) {
3629 if (*p<10)
3630 repsize += 2+1+1;
3631 else if (*p<100)
3632 repsize += 2+2+1;
3633 else if (*p<1000)
3634 repsize += 2+3+1;
3635 else if (*p<10000)
3636 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003637#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003638 else
3639 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003640#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003641 else if (*p<100000)
3642 repsize += 2+5+1;
3643 else if (*p<1000000)
3644 repsize += 2+6+1;
3645 else
3646 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003647#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003648 }
3649 requiredsize = respos+repsize+(endp-collend);
3650 if (requiredsize > ressize) {
3651 if (requiredsize<2*ressize)
3652 requiredsize = 2*ressize;
3653 if (_PyString_Resize(&res, requiredsize))
3654 goto onError;
3655 str = PyString_AS_STRING(res) + respos;
3656 ressize = requiredsize;
3657 }
3658 /* generate replacement (temporarily (mis)uses p) */
3659 for (p = collstart; p < collend; ++p) {
3660 str += sprintf(str, "&#%d;", (int)*p);
3661 }
3662 p = collend;
3663 break;
3664 default:
3665 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3666 encoding, reason, startp, size, &exc,
3667 collstart-startp, collend-startp, &newpos);
3668 if (repunicode == NULL)
3669 goto onError;
3670 /* need more space? (at least enough for what we have+the
3671 replacement+the rest of the string, so we won't have to
3672 check space for encodable characters) */
3673 respos = str-PyString_AS_STRING(res);
3674 repsize = PyUnicode_GET_SIZE(repunicode);
3675 requiredsize = respos+repsize+(endp-collend);
3676 if (requiredsize > ressize) {
3677 if (requiredsize<2*ressize)
3678 requiredsize = 2*ressize;
3679 if (_PyString_Resize(&res, requiredsize)) {
3680 Py_DECREF(repunicode);
3681 goto onError;
3682 }
3683 str = PyString_AS_STRING(res) + respos;
3684 ressize = requiredsize;
3685 }
3686 /* check if there is anything unencodable in the replacement
3687 and copy it to the output */
3688 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3689 c = *uni2;
3690 if (c >= limit) {
3691 raise_encode_exception(&exc, encoding, startp, size,
3692 unicodepos, unicodepos+1, reason);
3693 Py_DECREF(repunicode);
3694 goto onError;
3695 }
3696 *str = (char)c;
3697 }
3698 p = startp + newpos;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003699 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00003700 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00003701 }
3702 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003703 /* Resize if we allocated too much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003704 respos = str-PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003705 if (respos<ressize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003706 /* If this fails res will be NULL */
3707 _PyString_Resize(&res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003708 Py_XDECREF(errorHandler);
3709 Py_XDECREF(exc);
3710 return res;
3711
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003712 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003713 Py_XDECREF(res);
3714 Py_XDECREF(errorHandler);
3715 Py_XDECREF(exc);
3716 return NULL;
3717}
3718
Guido van Rossumd57fd912000-03-10 22:53:23 +00003719PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003720 Py_ssize_t size,
3721 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003722{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003723 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003724}
3725
3726PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3727{
3728 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003729 PyErr_BadArgument();
3730 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003731 }
3732 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003733 PyUnicode_GET_SIZE(unicode),
3734 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003735}
3736
3737/* --- 7-bit ASCII Codec -------------------------------------------------- */
3738
Guido van Rossumd57fd912000-03-10 22:53:23 +00003739PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003740 Py_ssize_t size,
3741 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003742{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003743 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003744 PyUnicodeObject *v;
3745 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003746 Py_ssize_t startinpos;
3747 Py_ssize_t endinpos;
3748 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003749 const char *e;
3750 PyObject *errorHandler = NULL;
3751 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003752
Guido van Rossumd57fd912000-03-10 22:53:23 +00003753 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003754 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003755 Py_UNICODE r = *(unsigned char*)s;
3756 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003757 }
Tim Petersced69f82003-09-16 20:30:58 +00003758
Guido van Rossumd57fd912000-03-10 22:53:23 +00003759 v = _PyUnicode_New(size);
3760 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003761 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003762 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003763 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003764 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003765 e = s + size;
3766 while (s < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003767 register unsigned char c = (unsigned char)*s;
3768 if (c < 128) {
3769 *p++ = c;
3770 ++s;
3771 }
3772 else {
3773 startinpos = s-starts;
3774 endinpos = startinpos + 1;
3775 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3776 if (unicode_decode_call_errorhandler(
3777 errors, &errorHandler,
3778 "ascii", "ordinal not in range(128)",
3779 starts, size, &startinpos, &endinpos, &exc, &s,
3780 &v, &outpos, &p))
3781 goto onError;
3782 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003783 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003784 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003785 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3786 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003787 Py_XDECREF(errorHandler);
3788 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003789 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003790
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003791 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003792 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003793 Py_XDECREF(errorHandler);
3794 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003795 return NULL;
3796}
3797
Guido van Rossumd57fd912000-03-10 22:53:23 +00003798PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003799 Py_ssize_t size,
3800 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003801{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003802 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003803}
3804
3805PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3806{
3807 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003808 PyErr_BadArgument();
3809 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003810 }
3811 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003812 PyUnicode_GET_SIZE(unicode),
3813 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003814}
3815
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003816#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003817
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003818/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003819
Hirokazu Yamamoto52a34922009-03-21 10:32:52 +00003820#if SIZEOF_INT < SIZEOF_SIZE_T
Martin v. Löwisd8251432006-06-14 05:21:04 +00003821#define NEED_RETRY
3822#endif
3823
3824/* XXX This code is limited to "true" double-byte encodings, as
3825 a) it assumes an incomplete character consists of a single byte, and
3826 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003827 encodings, see IsDBCSLeadByteEx documentation. */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003828
/* Return non-zero when the byte at s[offset] is a DBCS lead byte that
   actually starts a character, as opposed to being the trail byte of the
   preceding character.  CharPrev() is used to find where the previous
   character begins. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *here = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*here))
        return 0;

    before = CharPrev(s, here);
    if (before == here)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return here - before == 2;
}
3839
3840/*
3841 * Decode MBCS string into unicode object. If 'final' is set, converts
3842 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3843 */
3844static int decode_mbcs(PyUnicodeObject **v,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003845 const char *s, /* MBCS string */
3846 int size, /* sizeof MBCS string */
3847 int final)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003848{
3849 Py_UNICODE *p;
3850 Py_ssize_t n = 0;
3851 int usize = 0;
3852
3853 assert(size >= 0);
3854
3855 /* Skip trailing lead-byte unless 'final' is set */
3856 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003857 --size;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003858
3859 /* First get the size of the result */
3860 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003861 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3862 if (usize == 0) {
3863 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3864 return -1;
3865 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003866 }
3867
3868 if (*v == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003869 /* Create unicode object */
3870 *v = _PyUnicode_New(usize);
3871 if (*v == NULL)
3872 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003873 }
3874 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003875 /* Extend unicode object */
3876 n = PyUnicode_GET_SIZE(*v);
3877 if (_PyUnicode_Resize(v, n + usize) < 0)
3878 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003879 }
3880
3881 /* Do the conversion */
3882 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003883 p = PyUnicode_AS_UNICODE(*v) + n;
3884 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3885 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3886 return -1;
3887 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003888 }
3889
3890 return size;
3891}
3892
3893PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003894 Py_ssize_t size,
3895 const char *errors,
3896 Py_ssize_t *consumed)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003897{
3898 PyUnicodeObject *v = NULL;
3899 int done;
3900
3901 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003902 *consumed = 0;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003903
3904#ifdef NEED_RETRY
3905 retry:
3906 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003907 done = decode_mbcs(&v, s, INT_MAX, 0);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003908 else
3909#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003910 done = decode_mbcs(&v, s, (int)size, !consumed);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003911
3912 if (done < 0) {
3913 Py_XDECREF(v);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003914 return NULL;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003915 }
3916
3917 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003918 *consumed += done;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003919
3920#ifdef NEED_RETRY
3921 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003922 s += done;
3923 size -= done;
3924 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003925 }
3926#endif
3927
3928 return (PyObject *)v;
3929}
3930
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003931PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003932 Py_ssize_t size,
3933 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003934{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003935 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3936}
3937
3938/*
3939 * Convert unicode into string object (MBCS).
3940 * Returns 0 if succeed, -1 otherwise.
3941 */
3942static int encode_mbcs(PyObject **repr,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003943 const Py_UNICODE *p, /* unicode */
3944 int size) /* size of unicode */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003945{
3946 int mbcssize = 0;
3947 Py_ssize_t n = 0;
3948
3949 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003950
3951 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003952 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003953 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3954 if (mbcssize == 0) {
3955 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3956 return -1;
3957 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003958 }
3959
Martin v. Löwisd8251432006-06-14 05:21:04 +00003960 if (*repr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003961 /* Create string object */
3962 *repr = PyString_FromStringAndSize(NULL, mbcssize);
3963 if (*repr == NULL)
3964 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003965 }
3966 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003967 /* Extend string object */
3968 n = PyString_Size(*repr);
3969 if (_PyString_Resize(repr, n + mbcssize) < 0)
3970 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003971 }
3972
3973 /* Do the conversion */
3974 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003975 char *s = PyString_AS_STRING(*repr) + n;
3976 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3977 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3978 return -1;
3979 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003980 }
3981
3982 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003983}
3984
3985PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003986 Py_ssize_t size,
3987 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003988{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003989 PyObject *repr = NULL;
3990 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003991
Martin v. Löwisd8251432006-06-14 05:21:04 +00003992#ifdef NEED_RETRY
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003993 retry:
Martin v. Löwisd8251432006-06-14 05:21:04 +00003994 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003995 ret = encode_mbcs(&repr, p, INT_MAX);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003996 else
3997#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003998 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003999
Martin v. Löwisd8251432006-06-14 05:21:04 +00004000 if (ret < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004001 Py_XDECREF(repr);
4002 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004003 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00004004
4005#ifdef NEED_RETRY
4006 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004007 p += INT_MAX;
4008 size -= INT_MAX;
4009 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004010 }
4011#endif
4012
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004013 return repr;
4014}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004015
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004016PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4017{
4018 if (!PyUnicode_Check(unicode)) {
4019 PyErr_BadArgument();
4020 return NULL;
4021 }
4022 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004023 PyUnicode_GET_SIZE(unicode),
4024 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004025}
4026
Martin v. Löwisd8251432006-06-14 05:21:04 +00004027#undef NEED_RETRY
4028
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004029#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004030
Guido van Rossumd57fd912000-03-10 22:53:23 +00004031/* --- Character Mapping Codec -------------------------------------------- */
4032
Guido van Rossumd57fd912000-03-10 22:53:23 +00004033PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004034 Py_ssize_t size,
4035 PyObject *mapping,
4036 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004037{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004038 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004039 Py_ssize_t startinpos;
4040 Py_ssize_t endinpos;
4041 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004042 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004043 PyUnicodeObject *v;
4044 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004045 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004046 PyObject *errorHandler = NULL;
4047 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004048 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004049 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004050
Guido van Rossumd57fd912000-03-10 22:53:23 +00004051 /* Default to Latin-1 */
4052 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004053 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004054
4055 v = _PyUnicode_New(size);
4056 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004057 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004058 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004059 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004060 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004061 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004062 if (PyUnicode_CheckExact(mapping)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004063 mapstring = PyUnicode_AS_UNICODE(mapping);
4064 maplen = PyUnicode_GET_SIZE(mapping);
4065 while (s < e) {
4066 unsigned char ch = *s;
4067 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004068
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004069 if (ch < maplen)
4070 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004071
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004072 if (x == 0xfffe) {
4073 /* undefined mapping */
4074 outpos = p-PyUnicode_AS_UNICODE(v);
4075 startinpos = s-starts;
4076 endinpos = startinpos+1;
4077 if (unicode_decode_call_errorhandler(
4078 errors, &errorHandler,
4079 "charmap", "character maps to <undefined>",
4080 starts, size, &startinpos, &endinpos, &exc, &s,
4081 &v, &outpos, &p)) {
4082 goto onError;
4083 }
4084 continue;
4085 }
4086 *p++ = x;
4087 ++s;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004088 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004089 }
4090 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004091 while (s < e) {
4092 unsigned char ch = *s;
4093 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004094
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004095 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4096 w = PyInt_FromLong((long)ch);
4097 if (w == NULL)
4098 goto onError;
4099 x = PyObject_GetItem(mapping, w);
4100 Py_DECREF(w);
4101 if (x == NULL) {
4102 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4103 /* No mapping found means: mapping is undefined. */
4104 PyErr_Clear();
Serhiy Storchaka95997452013-01-15 14:42:59 +02004105 goto Undefined;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004106 } else
4107 goto onError;
4108 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004109
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004110 /* Apply mapping */
Serhiy Storchaka95997452013-01-15 14:42:59 +02004111 if (x == Py_None)
4112 goto Undefined;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004113 if (PyInt_Check(x)) {
4114 long value = PyInt_AS_LONG(x);
Serhiy Storchaka95997452013-01-15 14:42:59 +02004115 if (value == 0xFFFE)
4116 goto Undefined;
Antoine Pitroue3ae3212012-11-17 21:14:58 +01004117 if (value < 0 || value > 0x10FFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004118 PyErr_SetString(PyExc_TypeError,
Antoine Pitroue3ae3212012-11-17 21:14:58 +01004119 "character mapping must be in range(0x110000)");
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004120 Py_DECREF(x);
4121 goto onError;
4122 }
Antoine Pitroue3ae3212012-11-17 21:14:58 +01004123
4124#ifndef Py_UNICODE_WIDE
4125 if (value > 0xFFFF) {
4126 /* see the code for 1-n mapping below */
4127 if (extrachars < 2) {
4128 /* resize first */
4129 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4130 Py_ssize_t needed = 10 - extrachars;
4131 extrachars += needed;
4132 /* XXX overflow detection missing */
4133 if (_PyUnicode_Resize(&v,
4134 PyUnicode_GET_SIZE(v) + needed) < 0) {
4135 Py_DECREF(x);
4136 goto onError;
4137 }
4138 p = PyUnicode_AS_UNICODE(v) + oldpos;
4139 }
4140 value -= 0x10000;
4141 *p++ = 0xD800 | (value >> 10);
4142 *p++ = 0xDC00 | (value & 0x3FF);
4143 extrachars -= 2;
4144 }
4145 else
4146#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004147 *p++ = (Py_UNICODE)value;
4148 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004149 else if (PyUnicode_Check(x)) {
4150 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004151
Serhiy Storchaka95997452013-01-15 14:42:59 +02004152 if (targetsize == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004153 /* 1-1 mapping */
Serhiy Storchaka95997452013-01-15 14:42:59 +02004154 Py_UNICODE value = *PyUnicode_AS_UNICODE(x);
4155 if (value == 0xFFFE)
4156 goto Undefined;
4157 *p++ = value;
4158 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004159 else if (targetsize > 1) {
4160 /* 1-n mapping */
4161 if (targetsize > extrachars) {
4162 /* resize first */
4163 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4164 Py_ssize_t needed = (targetsize - extrachars) + \
4165 (targetsize << 2);
4166 extrachars += needed;
4167 /* XXX overflow detection missing */
4168 if (_PyUnicode_Resize(&v,
4169 PyUnicode_GET_SIZE(v) + needed) < 0) {
4170 Py_DECREF(x);
4171 goto onError;
4172 }
4173 p = PyUnicode_AS_UNICODE(v) + oldpos;
4174 }
4175 Py_UNICODE_COPY(p,
4176 PyUnicode_AS_UNICODE(x),
4177 targetsize);
4178 p += targetsize;
4179 extrachars -= targetsize;
4180 }
4181 /* 1-0 mapping: skip the character */
4182 }
4183 else {
4184 /* wrong return value */
4185 PyErr_SetString(PyExc_TypeError,
4186 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004187 Py_DECREF(x);
4188 goto onError;
4189 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004190 Py_DECREF(x);
4191 ++s;
Serhiy Storchaka95997452013-01-15 14:42:59 +02004192 continue;
4193Undefined:
4194 /* undefined mapping */
4195 Py_XDECREF(x);
4196 outpos = p-PyUnicode_AS_UNICODE(v);
4197 startinpos = s-starts;
4198 endinpos = startinpos+1;
4199 if (unicode_decode_call_errorhandler(
4200 errors, &errorHandler,
4201 "charmap", "character maps to <undefined>",
4202 starts, size, &startinpos, &endinpos, &exc, &s,
4203 &v, &outpos, &p)) {
4204 goto onError;
4205 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004206 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004207 }
4208 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004209 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4210 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004211 Py_XDECREF(errorHandler);
4212 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004213 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004214
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004215 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004216 Py_XDECREF(errorHandler);
4217 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004218 Py_XDECREF(v);
4219 return NULL;
4220}
4221
Martin v. Löwis3f767792006-06-04 19:36:28 +00004222/* Charmap encoding: the lookup table */
4223
4224struct encoding_map{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004225 PyObject_HEAD
4226 unsigned char level1[32];
4227 int count2, count3;
4228 unsigned char level23[1];
Martin v. Löwis3f767792006-06-04 19:36:28 +00004229};
4230
4231static PyObject*
4232encoding_map_size(PyObject *obj, PyObject* args)
4233{
4234 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004235 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Martin v. Löwis3f767792006-06-04 19:36:28 +00004236 128*map->count3);
4237}
4238
4239static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004240 {"size", encoding_map_size, METH_NOARGS,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004241 PyDoc_STR("Return the size (in bytes) of this object") },
4242 { 0 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004243};
4244
4245static void
4246encoding_map_dealloc(PyObject* o)
4247{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004248 PyObject_FREE(o);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004249}
4250
4251static PyTypeObject EncodingMapType = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004252 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004253 "EncodingMap", /*tp_name*/
4254 sizeof(struct encoding_map), /*tp_basicsize*/
4255 0, /*tp_itemsize*/
4256 /* methods */
4257 encoding_map_dealloc, /*tp_dealloc*/
4258 0, /*tp_print*/
4259 0, /*tp_getattr*/
4260 0, /*tp_setattr*/
4261 0, /*tp_compare*/
4262 0, /*tp_repr*/
4263 0, /*tp_as_number*/
4264 0, /*tp_as_sequence*/
4265 0, /*tp_as_mapping*/
4266 0, /*tp_hash*/
4267 0, /*tp_call*/
4268 0, /*tp_str*/
4269 0, /*tp_getattro*/
4270 0, /*tp_setattro*/
4271 0, /*tp_as_buffer*/
4272 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4273 0, /*tp_doc*/
4274 0, /*tp_traverse*/
4275 0, /*tp_clear*/
4276 0, /*tp_richcompare*/
4277 0, /*tp_weaklistoffset*/
4278 0, /*tp_iter*/
4279 0, /*tp_iternext*/
4280 encoding_map_methods, /*tp_methods*/
4281 0, /*tp_members*/
4282 0, /*tp_getset*/
4283 0, /*tp_base*/
4284 0, /*tp_dict*/
4285 0, /*tp_descr_get*/
4286 0, /*tp_descr_set*/
4287 0, /*tp_dictoffset*/
4288 0, /*tp_init*/
4289 0, /*tp_alloc*/
4290 0, /*tp_new*/
4291 0, /*tp_free*/
4292 0, /*tp_is_gc*/
Martin v. Löwis3f767792006-06-04 19:36:28 +00004293};
4294
4295PyObject*
4296PyUnicode_BuildEncodingMap(PyObject* string)
4297{
4298 Py_UNICODE *decode;
4299 PyObject *result;
4300 struct encoding_map *mresult;
4301 int i;
4302 int need_dict = 0;
4303 unsigned char level1[32];
4304 unsigned char level2[512];
4305 unsigned char *mlevel1, *mlevel2, *mlevel3;
4306 int count2 = 0, count3 = 0;
4307
4308 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4309 PyErr_BadArgument();
4310 return NULL;
4311 }
4312 decode = PyUnicode_AS_UNICODE(string);
4313 memset(level1, 0xFF, sizeof level1);
4314 memset(level2, 0xFF, sizeof level2);
4315
4316 /* If there isn't a one-to-one mapping of NULL to \0,
4317 or if there are non-BMP characters, we need to use
4318 a mapping dictionary. */
4319 if (decode[0] != 0)
4320 need_dict = 1;
4321 for (i = 1; i < 256; i++) {
4322 int l1, l2;
4323 if (decode[i] == 0
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004324#ifdef Py_UNICODE_WIDE
Martin v. Löwis3f767792006-06-04 19:36:28 +00004325 || decode[i] > 0xFFFF
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004326#endif
4327 ) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004328 need_dict = 1;
4329 break;
4330 }
4331 if (decode[i] == 0xFFFE)
4332 /* unmapped character */
4333 continue;
4334 l1 = decode[i] >> 11;
4335 l2 = decode[i] >> 7;
4336 if (level1[l1] == 0xFF)
4337 level1[l1] = count2++;
4338 if (level2[l2] == 0xFF)
Benjamin Peterson857ce152009-01-31 16:29:18 +00004339 level2[l2] = count3++;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004340 }
4341
4342 if (count2 >= 0xFF || count3 >= 0xFF)
4343 need_dict = 1;
4344
4345 if (need_dict) {
4346 PyObject *result = PyDict_New();
4347 PyObject *key, *value;
4348 if (!result)
4349 return NULL;
4350 for (i = 0; i < 256; i++) {
Brett Cannona7f13ee2010-05-04 01:16:51 +00004351 value = NULL;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004352 key = PyInt_FromLong(decode[i]);
4353 value = PyInt_FromLong(i);
4354 if (!key || !value)
4355 goto failed1;
4356 if (PyDict_SetItem(result, key, value) == -1)
4357 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00004358 Py_DECREF(key);
4359 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004360 }
4361 return result;
4362 failed1:
4363 Py_XDECREF(key);
4364 Py_XDECREF(value);
4365 Py_DECREF(result);
4366 return NULL;
4367 }
4368
4369 /* Create a three-level trie */
4370 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4371 16*count2 + 128*count3 - 1);
4372 if (!result)
4373 return PyErr_NoMemory();
4374 PyObject_Init(result, &EncodingMapType);
4375 mresult = (struct encoding_map*)result;
4376 mresult->count2 = count2;
4377 mresult->count3 = count3;
4378 mlevel1 = mresult->level1;
4379 mlevel2 = mresult->level23;
4380 mlevel3 = mresult->level23 + 16*count2;
4381 memcpy(mlevel1, level1, 32);
4382 memset(mlevel2, 0xFF, 16*count2);
4383 memset(mlevel3, 0, 128*count3);
4384 count3 = 0;
4385 for (i = 1; i < 256; i++) {
4386 int o1, o2, o3, i2, i3;
4387 if (decode[i] == 0xFFFE)
4388 /* unmapped character */
4389 continue;
4390 o1 = decode[i]>>11;
4391 o2 = (decode[i]>>7) & 0xF;
4392 i2 = 16*mlevel1[o1] + o2;
4393 if (mlevel2[i2] == 0xFF)
4394 mlevel2[i2] = count3++;
4395 o3 = decode[i] & 0x7F;
4396 i3 = 128*mlevel2[i2] + o3;
4397 mlevel3[i3] = i;
4398 }
4399 return result;
4400}
4401
4402static int
4403encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4404{
4405 struct encoding_map *map = (struct encoding_map*)mapping;
4406 int l1 = c>>11;
4407 int l2 = (c>>7) & 0xF;
4408 int l3 = c & 0x7F;
4409 int i;
4410
4411#ifdef Py_UNICODE_WIDE
4412 if (c > 0xFFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004413 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004414 }
4415#endif
4416 if (c == 0)
4417 return 0;
4418 /* level 1*/
4419 i = map->level1[l1];
4420 if (i == 0xFF) {
4421 return -1;
4422 }
4423 /* level 2*/
4424 i = map->level23[16*i+l2];
4425 if (i == 0xFF) {
4426 return -1;
4427 }
4428 /* level 3 */
4429 i = map->level23[16*map->count2 + 128*i + l3];
4430 if (i == 0) {
4431 return -1;
4432 }
4433 return i;
4434}
4435
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004436/* Lookup the character ch in the mapping. If the character
4437 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004438 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004439static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004440{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004441 PyObject *w = PyInt_FromLong((long)c);
4442 PyObject *x;
4443
4444 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004445 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004446 x = PyObject_GetItem(mapping, w);
4447 Py_DECREF(w);
4448 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004449 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4450 /* No mapping found means: mapping is undefined. */
4451 PyErr_Clear();
4452 x = Py_None;
4453 Py_INCREF(x);
4454 return x;
4455 } else
4456 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004457 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004458 else if (x == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004459 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004460 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004461 long value = PyInt_AS_LONG(x);
4462 if (value < 0 || value > 255) {
4463 PyErr_SetString(PyExc_TypeError,
4464 "character mapping must be in range(256)");
4465 Py_DECREF(x);
4466 return NULL;
4467 }
4468 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004469 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004470 else if (PyString_Check(x))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004471 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004472 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004473 /* wrong return value */
4474 PyErr_SetString(PyExc_TypeError,
4475 "character mapping must return integer, None or str");
4476 Py_DECREF(x);
4477 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004478 }
4479}
4480
Martin v. Löwis3f767792006-06-04 19:36:28 +00004481static int
4482charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4483{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004484 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4485 /* exponentially overallocate to minimize reallocations */
4486 if (requiredsize < 2*outsize)
4487 requiredsize = 2*outsize;
4488 if (_PyString_Resize(outobj, requiredsize)) {
4489 return 0;
4490 }
4491 return 1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004492}
4493
/* Outcome of trying to encode a single character with a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the encoded character was written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred; a Python exception is set */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004497/* lookup the character, put the result in the output string and adjust
4498 various state variables. Reallocate the output string if not enough
4499 space is available. Return a new reference to the object that
4500 was put in the output buffer, or Py_None, if the mapping was undefined
4501 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004502 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004503static
Martin v. Löwis3f767792006-06-04 19:36:28 +00004504charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004505 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004506{
Martin v. Löwis3f767792006-06-04 19:36:28 +00004507 PyObject *rep;
4508 char *outstart;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004509 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004510
Christian Heimese93237d2007-12-19 02:37:44 +00004511 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004512 int res = encoding_map_lookup(c, mapping);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004513 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004514 if (res == -1)
4515 return enc_FAILED;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004516 if (outsize<requiredsize)
4517 if (!charmapencode_resize(outobj, outpos, requiredsize))
4518 return enc_EXCEPTION;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004519 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004520 outstart[(*outpos)++] = (char)res;
4521 return enc_SUCCESS;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004522 }
4523
4524 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004525 if (rep==NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004526 return enc_EXCEPTION;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004527 else if (rep==Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004528 Py_DECREF(rep);
4529 return enc_FAILED;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004530 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004531 if (PyInt_Check(rep)) {
4532 Py_ssize_t requiredsize = *outpos+1;
4533 if (outsize<requiredsize)
4534 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4535 Py_DECREF(rep);
4536 return enc_EXCEPTION;
4537 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004538 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004539 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004540 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004541 else {
4542 const char *repchars = PyString_AS_STRING(rep);
4543 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4544 Py_ssize_t requiredsize = *outpos+repsize;
4545 if (outsize<requiredsize)
4546 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4547 Py_DECREF(rep);
4548 return enc_EXCEPTION;
4549 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004550 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004551 memcpy(outstart + *outpos, repchars, repsize);
4552 *outpos += repsize;
4553 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004554 }
Georg Brandl9f167602006-06-04 21:46:16 +00004555 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004556 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004557}
4558
4559/* handle an error in PyUnicode_EncodeCharmap
4560 Return 0 on success, -1 on error */
4561static
4562int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004563 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004564 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004565 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004566 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004567{
4568 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004569 Py_ssize_t repsize;
4570 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004571 Py_UNICODE *uni2;
4572 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004573 Py_ssize_t collstartpos = *inpos;
4574 Py_ssize_t collendpos = *inpos+1;
4575 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004576 char *encoding = "charmap";
4577 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00004578 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004579
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004580 /* find all unencodable characters */
4581 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004582 PyObject *rep;
Christian Heimese93237d2007-12-19 02:37:44 +00004583 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004584 int res = encoding_map_lookup(p[collendpos], mapping);
4585 if (res != -1)
4586 break;
4587 ++collendpos;
4588 continue;
4589 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004590
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004591 rep = charmapencode_lookup(p[collendpos], mapping);
4592 if (rep==NULL)
4593 return -1;
4594 else if (rep!=Py_None) {
4595 Py_DECREF(rep);
4596 break;
4597 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004598 Py_DECREF(rep);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004599 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004600 }
4601 /* cache callback name lookup
4602 * (if not done yet, i.e. it's the first error) */
4603 if (*known_errorHandler==-1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004604 if ((errors==NULL) || (!strcmp(errors, "strict")))
4605 *known_errorHandler = 1;
4606 else if (!strcmp(errors, "replace"))
4607 *known_errorHandler = 2;
4608 else if (!strcmp(errors, "ignore"))
4609 *known_errorHandler = 3;
4610 else if (!strcmp(errors, "xmlcharrefreplace"))
4611 *known_errorHandler = 4;
4612 else
4613 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004614 }
4615 switch (*known_errorHandler) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004616 case 1: /* strict */
4617 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4618 return -1;
4619 case 2: /* replace */
4620 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004621 x = charmapencode_output('?', mapping, res, respos);
4622 if (x==enc_EXCEPTION) {
4623 return -1;
4624 }
4625 else if (x==enc_FAILED) {
4626 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4627 return -1;
4628 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004629 }
4630 /* fall through */
4631 case 3: /* ignore */
4632 *inpos = collendpos;
4633 break;
4634 case 4: /* xmlcharrefreplace */
4635 /* generate replacement (temporarily (mis)uses p) */
4636 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004637 char buffer[2+29+1+1];
4638 char *cp;
4639 sprintf(buffer, "&#%d;", (int)p[collpos]);
4640 for (cp = buffer; *cp; ++cp) {
4641 x = charmapencode_output(*cp, mapping, res, respos);
4642 if (x==enc_EXCEPTION)
4643 return -1;
4644 else if (x==enc_FAILED) {
4645 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4646 return -1;
4647 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004648 }
4649 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004650 *inpos = collendpos;
4651 break;
4652 default:
4653 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004654 encoding, reason, p, size, exceptionObject,
4655 collstartpos, collendpos, &newpos);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004656 if (repunicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004657 return -1;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004658 /* generate replacement */
4659 repsize = PyUnicode_GET_SIZE(repunicode);
4660 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004661 x = charmapencode_output(*uni2, mapping, res, respos);
4662 if (x==enc_EXCEPTION) {
4663 return -1;
4664 }
4665 else if (x==enc_FAILED) {
4666 Py_DECREF(repunicode);
4667 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4668 return -1;
4669 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004670 }
4671 *inpos = newpos;
4672 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004673 }
4674 return 0;
4675}
4676
Guido van Rossumd57fd912000-03-10 22:53:23 +00004677PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004678 Py_ssize_t size,
4679 PyObject *mapping,
4680 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004681{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004682 /* output object */
4683 PyObject *res = NULL;
4684 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004685 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004686 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004687 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004688 PyObject *errorHandler = NULL;
4689 PyObject *exc = NULL;
4690 /* the following variable is used for caching string comparisons
4691 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4692 * 3=ignore, 4=xmlcharrefreplace */
4693 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004694
4695 /* Default to Latin-1 */
4696 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004697 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004698
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004699 /* allocate enough for a simple encoding without
4700 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004701 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004702 if (res == NULL)
4703 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004704 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004705 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004706
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004707 while (inpos<size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004708 /* try to encode it */
4709 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4710 if (x==enc_EXCEPTION) /* error */
4711 goto onError;
4712 if (x==enc_FAILED) { /* unencodable character */
4713 if (charmap_encoding_error(p, size, &inpos, mapping,
4714 &exc,
4715 &known_errorHandler, &errorHandler, errors,
4716 &res, &respos)) {
4717 goto onError;
4718 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004719 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004720 else
4721 /* done with this character => adjust input position */
4722 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004723 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004724
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004725 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004726 if (respos<PyString_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004727 if (_PyString_Resize(&res, respos))
4728 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004729 }
4730 Py_XDECREF(exc);
4731 Py_XDECREF(errorHandler);
4732 return res;
4733
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004734 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004735 Py_XDECREF(res);
4736 Py_XDECREF(exc);
4737 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004738 return NULL;
4739}
4740
4741PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004742 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004743{
4744 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004745 PyErr_BadArgument();
4746 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004747 }
4748 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004749 PyUnicode_GET_SIZE(unicode),
4750 mapping,
4751 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004752}
4753
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004754/* create or adjust a UnicodeTranslateError */
4755static void make_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004756 const Py_UNICODE *unicode, Py_ssize_t size,
4757 Py_ssize_t startpos, Py_ssize_t endpos,
4758 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004759{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004760 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004761 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004762 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004763 }
4764 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004765 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4766 goto onError;
4767 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4768 goto onError;
4769 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4770 goto onError;
4771 return;
4772 onError:
4773 Py_DECREF(*exceptionObject);
4774 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004775 }
4776}
4777
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004778/* raises a UnicodeTranslateError */
4779static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004780 const Py_UNICODE *unicode, Py_ssize_t size,
4781 Py_ssize_t startpos, Py_ssize_t endpos,
4782 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004783{
4784 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004785 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004786 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004787 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004788}
4789
4790/* error handling callback helper:
4791 build arguments, call the callback and check the arguments,
4792 put the result into newpos and return the replacement string, which
4793 has to be freed by the caller */
4794static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004795 PyObject **errorHandler,
4796 const char *reason,
4797 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4798 Py_ssize_t startpos, Py_ssize_t endpos,
4799 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004800{
Martin v. Löwis412fb672006-04-13 06:34:32 +00004801 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004802
Martin v. Löwis412fb672006-04-13 06:34:32 +00004803 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004804 PyObject *restuple;
4805 PyObject *resunicode;
4806
4807 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004808 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004809 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004810 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004811 }
4812
4813 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004814 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004815 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004816 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004817
4818 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004819 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004820 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004821 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004822 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00004823 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004824 Py_DECREF(restuple);
4825 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004826 }
4827 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004828 &resunicode, &i_newpos)) {
4829 Py_DECREF(restuple);
4830 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004831 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004832 if (i_newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004833 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004834 else
4835 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004836 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004837 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4838 Py_DECREF(restuple);
4839 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004840 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004841 Py_INCREF(resunicode);
4842 Py_DECREF(restuple);
4843 return resunicode;
4844}
4845
4846/* Lookup the character ch in the mapping and put the result in result,
4847 which must be decrefed by the caller.
4848 Return 0 on success, -1 on error */
4849static
4850int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4851{
4852 PyObject *w = PyInt_FromLong((long)c);
4853 PyObject *x;
4854
4855 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004856 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004857 x = PyObject_GetItem(mapping, w);
4858 Py_DECREF(w);
4859 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004860 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4861 /* No mapping found means: use 1:1 mapping. */
4862 PyErr_Clear();
4863 *result = NULL;
4864 return 0;
4865 } else
4866 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004867 }
4868 else if (x == Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004869 *result = x;
4870 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004871 }
4872 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004873 long value = PyInt_AS_LONG(x);
4874 long max = PyUnicode_GetMax();
4875 if (value < 0 || value > max) {
4876 PyErr_Format(PyExc_TypeError,
4877 "character mapping must be in range(0x%lx)", max+1);
4878 Py_DECREF(x);
4879 return -1;
4880 }
4881 *result = x;
4882 return 0;
4883 }
4884 else if (PyUnicode_Check(x)) {
4885 *result = x;
4886 return 0;
4887 }
4888 else {
4889 /* wrong return value */
4890 PyErr_SetString(PyExc_TypeError,
4891 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004892 Py_DECREF(x);
4893 return -1;
4894 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004895}
4896/* ensure that *outobj is at least requiredsize characters long,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004897 if not reallocate and adjust various state variables.
4898 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004899static
Walter Dörwald4894c302003-10-24 14:25:28 +00004900int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004901 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004902{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004903 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004904 if (requiredsize > oldsize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004905 /* remember old output position */
4906 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4907 /* exponentially overallocate to minimize reallocations */
4908 if (requiredsize < 2 * oldsize)
4909 requiredsize = 2 * oldsize;
4910 if (PyUnicode_Resize(outobj, requiredsize) < 0)
4911 return -1;
4912 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004913 }
4914 return 0;
4915}
4916/* lookup the character, put the result in the output string and adjust
4917 various state variables. Return a new reference to the object that
4918 was put in the output buffer in *result, or Py_None, if the mapping was
4919 undefined (in which case no character was written).
4920 The called must decref result.
4921 Return 0 on success, -1 on error. */
4922static
Walter Dörwald4894c302003-10-24 14:25:28 +00004923int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004924 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4925 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004926{
Walter Dörwald4894c302003-10-24 14:25:28 +00004927 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004928 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004929 if (*res==NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004930 /* not found => default to 1:1 mapping */
4931 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004932 }
4933 else if (*res==Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004934 ;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004935 else if (PyInt_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004936 /* no overflow check, because we know that the space is enough */
4937 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004938 }
4939 else if (PyUnicode_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004940 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4941 if (repsize==1) {
4942 /* no overflow check, because we know that the space is enough */
4943 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4944 }
4945 else if (repsize!=0) {
4946 /* more than one character */
4947 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
4948 (insize - (curinp-startinp)) +
4949 repsize - 1;
4950 if (charmaptranslate_makespace(outobj, outp, requiredsize))
4951 return -1;
4952 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4953 *outp += repsize;
4954 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004955 }
4956 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004957 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004958 return 0;
4959}
4960
4961PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004962 Py_ssize_t size,
4963 PyObject *mapping,
4964 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004965{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004966 /* output object */
4967 PyObject *res = NULL;
4968 /* pointers to the beginning and end+1 of input */
4969 const Py_UNICODE *startp = p;
4970 const Py_UNICODE *endp = p + size;
4971 /* pointer into the output */
4972 Py_UNICODE *str;
4973 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004974 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004975 char *reason = "character maps to <undefined>";
4976 PyObject *errorHandler = NULL;
4977 PyObject *exc = NULL;
4978 /* the following variable is used for caching string comparisons
4979 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4980 * 3=ignore, 4=xmlcharrefreplace */
4981 int known_errorHandler = -1;
4982
Guido van Rossumd57fd912000-03-10 22:53:23 +00004983 if (mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004984 PyErr_BadArgument();
4985 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004986 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004987
4988 /* allocate enough for a simple 1:1 translation without
4989 replacements, if we need more, we'll resize */
4990 res = PyUnicode_FromUnicode(NULL, size);
4991 if (res == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004992 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004993 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004994 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004995 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004996
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004997 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004998 /* try to encode it */
4999 PyObject *x = NULL;
5000 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5001 Py_XDECREF(x);
5002 goto onError;
5003 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005004 Py_XDECREF(x);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005005 if (x!=Py_None) /* it worked => adjust input pointer */
5006 ++p;
5007 else { /* untranslatable character */
5008 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5009 Py_ssize_t repsize;
5010 Py_ssize_t newpos;
5011 Py_UNICODE *uni2;
5012 /* startpos for collecting untranslatable chars */
5013 const Py_UNICODE *collstart = p;
5014 const Py_UNICODE *collend = p+1;
5015 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005016
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005017 /* find all untranslatable characters */
5018 while (collend < endp) {
5019 if (charmaptranslate_lookup(*collend, mapping, &x))
5020 goto onError;
5021 Py_XDECREF(x);
5022 if (x!=Py_None)
5023 break;
5024 ++collend;
5025 }
5026 /* cache callback name lookup
5027 * (if not done yet, i.e. it's the first error) */
5028 if (known_errorHandler==-1) {
5029 if ((errors==NULL) || (!strcmp(errors, "strict")))
5030 known_errorHandler = 1;
5031 else if (!strcmp(errors, "replace"))
5032 known_errorHandler = 2;
5033 else if (!strcmp(errors, "ignore"))
5034 known_errorHandler = 3;
5035 else if (!strcmp(errors, "xmlcharrefreplace"))
5036 known_errorHandler = 4;
5037 else
5038 known_errorHandler = 0;
5039 }
5040 switch (known_errorHandler) {
5041 case 1: /* strict */
5042 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson857ce152009-01-31 16:29:18 +00005043 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005044 case 2: /* replace */
5045 /* No need to check for space, this is a 1:1 replacement */
5046 for (coll = collstart; coll<collend; ++coll)
5047 *str++ = '?';
5048 /* fall through */
5049 case 3: /* ignore */
5050 p = collend;
5051 break;
5052 case 4: /* xmlcharrefreplace */
5053 /* generate replacement (temporarily (mis)uses p) */
5054 for (p = collstart; p < collend; ++p) {
5055 char buffer[2+29+1+1];
5056 char *cp;
5057 sprintf(buffer, "&#%d;", (int)*p);
5058 if (charmaptranslate_makespace(&res, &str,
5059 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5060 goto onError;
5061 for (cp = buffer; *cp; ++cp)
5062 *str++ = *cp;
5063 }
5064 p = collend;
5065 break;
5066 default:
5067 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5068 reason, startp, size, &exc,
5069 collstart-startp, collend-startp, &newpos);
5070 if (repunicode == NULL)
5071 goto onError;
5072 /* generate replacement */
5073 repsize = PyUnicode_GET_SIZE(repunicode);
5074 if (charmaptranslate_makespace(&res, &str,
5075 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5076 Py_DECREF(repunicode);
5077 goto onError;
5078 }
5079 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5080 *str++ = *uni2;
5081 p = startp + newpos;
5082 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00005083 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005084 }
5085 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005086 /* Resize if we allocated to much */
5087 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005088 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005089 if (PyUnicode_Resize(&res, respos) < 0)
5090 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005091 }
5092 Py_XDECREF(exc);
5093 Py_XDECREF(errorHandler);
5094 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005095
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005096 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005097 Py_XDECREF(res);
5098 Py_XDECREF(exc);
5099 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005100 return NULL;
5101}
5102
5103PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005104 PyObject *mapping,
5105 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005106{
5107 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005108
Guido van Rossumd57fd912000-03-10 22:53:23 +00005109 str = PyUnicode_FromObject(str);
5110 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005111 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005112 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005113 PyUnicode_GET_SIZE(str),
5114 mapping,
5115 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005116 Py_DECREF(str);
5117 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005118
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005119 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005120 Py_XDECREF(str);
5121 return NULL;
5122}
Tim Petersced69f82003-09-16 20:30:58 +00005123
Guido van Rossum9e896b32000-04-05 20:11:21 +00005124/* --- Decimal Encoder ---------------------------------------------------- */
5125
5126int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005127 Py_ssize_t length,
5128 char *output,
5129 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005130{
5131 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005132 PyObject *errorHandler = NULL;
5133 PyObject *exc = NULL;
5134 const char *encoding = "decimal";
5135 const char *reason = "invalid decimal Unicode string";
5136 /* the following variable is used for caching string comparisons
5137 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5138 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005139
5140 if (output == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005141 PyErr_BadArgument();
5142 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005143 }
5144
5145 p = s;
5146 end = s + length;
5147 while (p < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005148 register Py_UNICODE ch = *p;
5149 int decimal;
5150 PyObject *repunicode;
5151 Py_ssize_t repsize;
5152 Py_ssize_t newpos;
5153 Py_UNICODE *uni2;
5154 Py_UNICODE *collstart;
5155 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005156
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005157 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005158 *output++ = ' ';
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005159 ++p;
5160 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005161 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005162 decimal = Py_UNICODE_TODECIMAL(ch);
5163 if (decimal >= 0) {
5164 *output++ = '0' + decimal;
5165 ++p;
5166 continue;
5167 }
5168 if (0 < ch && ch < 256) {
5169 *output++ = (char)ch;
5170 ++p;
5171 continue;
5172 }
5173 /* All other characters are considered unencodable */
5174 collstart = p;
Victor Stinner975134e2011-11-22 01:54:19 +01005175 for (collend = p+1; collend < end; collend++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005176 if ((0 < *collend && *collend < 256) ||
Victor Stinner975134e2011-11-22 01:54:19 +01005177 Py_UNICODE_ISSPACE(*collend) ||
5178 0 <= Py_UNICODE_TODECIMAL(*collend))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005179 break;
5180 }
5181 /* cache callback name lookup
5182 * (if not done yet, i.e. it's the first error) */
5183 if (known_errorHandler==-1) {
5184 if ((errors==NULL) || (!strcmp(errors, "strict")))
5185 known_errorHandler = 1;
5186 else if (!strcmp(errors, "replace"))
5187 known_errorHandler = 2;
5188 else if (!strcmp(errors, "ignore"))
5189 known_errorHandler = 3;
5190 else if (!strcmp(errors, "xmlcharrefreplace"))
5191 known_errorHandler = 4;
5192 else
5193 known_errorHandler = 0;
5194 }
5195 switch (known_errorHandler) {
5196 case 1: /* strict */
5197 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5198 goto onError;
5199 case 2: /* replace */
5200 for (p = collstart; p < collend; ++p)
5201 *output++ = '?';
5202 /* fall through */
5203 case 3: /* ignore */
5204 p = collend;
5205 break;
5206 case 4: /* xmlcharrefreplace */
5207 /* generate replacement (temporarily (mis)uses p) */
5208 for (p = collstart; p < collend; ++p)
5209 output += sprintf(output, "&#%d;", (int)*p);
5210 p = collend;
5211 break;
5212 default:
5213 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5214 encoding, reason, s, length, &exc,
5215 collstart-s, collend-s, &newpos);
5216 if (repunicode == NULL)
5217 goto onError;
5218 /* generate replacement */
5219 repsize = PyUnicode_GET_SIZE(repunicode);
5220 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5221 Py_UNICODE ch = *uni2;
5222 if (Py_UNICODE_ISSPACE(ch))
5223 *output++ = ' ';
5224 else {
5225 decimal = Py_UNICODE_TODECIMAL(ch);
5226 if (decimal >= 0)
5227 *output++ = '0' + decimal;
5228 else if (0 < ch && ch < 256)
5229 *output++ = (char)ch;
5230 else {
5231 Py_DECREF(repunicode);
5232 raise_encode_exception(&exc, encoding,
5233 s, length, collstart-s, collend-s, reason);
5234 goto onError;
5235 }
5236 }
5237 }
5238 p = s + newpos;
5239 Py_DECREF(repunicode);
5240 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005241 }
5242 /* 0-terminate the output string */
5243 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005244 Py_XDECREF(exc);
5245 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005246 return 0;
5247
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005248 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005249 Py_XDECREF(exc);
5250 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005251 return -1;
5252}
5253
Guido van Rossumd57fd912000-03-10 22:53:23 +00005254/* --- Helpers ------------------------------------------------------------ */
5255
Eric Smitha9f7d622008-02-17 19:46:49 +00005256#include "stringlib/unicodedefs.h"
Fredrik Lundha50d2012006-05-26 17:04:58 +00005257#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005258
5259#include "stringlib/count.h"
5260#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005261#include "stringlib/partition.h"
Antoine Pitrou64672132010-01-13 07:55:48 +00005262#include "stringlib/split.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005263
/* Helper macro to fix up start/end slice values in place.

   `end` is clipped to `len`; a negative `end` or `start` is taken relative
   to `len` and then clamped at 0.  `start` and `end` must be modifiable
   lvalues and are evaluated more than once.

   The body is wrapped in do { ... } while (0) so the macro expands to a
   single statement and is safe under an unbraced if/else; use it with a
   trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Fredrik Lundhc8162812006-05-26 19:33:03 +00005278
Martin v. Löwis18e16552006-02-15 17:27:45 +00005279Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005280 PyObject *substr,
5281 Py_ssize_t start,
5282 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005283{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005284 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005285 PyUnicodeObject* str_obj;
5286 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005287
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005288 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5289 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005290 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005291 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5292 if (!sub_obj) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005293 Py_DECREF(str_obj);
5294 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005295 }
Tim Petersced69f82003-09-16 20:30:58 +00005296
Antoine Pitrou64672132010-01-13 07:55:48 +00005297 ADJUST_INDICES(start, end, str_obj->length);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005298 result = stringlib_count(
Antoine Pitrou64672132010-01-13 07:55:48 +00005299 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
5300 PY_SSIZE_T_MAX
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005301 );
5302
5303 Py_DECREF(sub_obj);
5304 Py_DECREF(str_obj);
5305
Guido van Rossumd57fd912000-03-10 22:53:23 +00005306 return result;
5307}
5308
Martin v. Löwis18e16552006-02-15 17:27:45 +00005309Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005310 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005311 Py_ssize_t start,
5312 Py_ssize_t end,
5313 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005314{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005315 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005316
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005317 str = PyUnicode_FromObject(str);
5318 if (!str)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005319 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005320 sub = PyUnicode_FromObject(sub);
5321 if (!sub) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005322 Py_DECREF(str);
5323 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005324 }
Tim Petersced69f82003-09-16 20:30:58 +00005325
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005326 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005327 result = stringlib_find_slice(
5328 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5329 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5330 start, end
5331 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005332 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005333 result = stringlib_rfind_slice(
5334 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5335 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5336 start, end
5337 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005338
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005339 Py_DECREF(str);
5340 Py_DECREF(sub);
5341
Guido van Rossumd57fd912000-03-10 22:53:23 +00005342 return result;
5343}
5344
Tim Petersced69f82003-09-16 20:30:58 +00005345static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005346int tailmatch(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005347 PyUnicodeObject *substring,
5348 Py_ssize_t start,
5349 Py_ssize_t end,
5350 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005351{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005352 if (substring->length == 0)
5353 return 1;
5354
Antoine Pitrou64672132010-01-13 07:55:48 +00005355 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005356 end -= substring->length;
5357 if (end < start)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005358 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005359
5360 if (direction > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005361 if (Py_UNICODE_MATCH(self, end, substring))
5362 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005363 } else {
5364 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005365 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005366 }
5367
5368 return 0;
5369}
5370
Martin v. Löwis18e16552006-02-15 17:27:45 +00005371Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005372 PyObject *substr,
5373 Py_ssize_t start,
5374 Py_ssize_t end,
5375 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005376{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005377 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005378
Guido van Rossumd57fd912000-03-10 22:53:23 +00005379 str = PyUnicode_FromObject(str);
5380 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005381 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005382 substr = PyUnicode_FromObject(substr);
5383 if (substr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005384 Py_DECREF(str);
5385 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005386 }
Tim Petersced69f82003-09-16 20:30:58 +00005387
Guido van Rossumd57fd912000-03-10 22:53:23 +00005388 result = tailmatch((PyUnicodeObject *)str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005389 (PyUnicodeObject *)substr,
5390 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005391 Py_DECREF(str);
5392 Py_DECREF(substr);
5393 return result;
5394}
5395
Guido van Rossumd57fd912000-03-10 22:53:23 +00005396/* Apply fixfct filter to the Unicode object self and return a
5397 reference to the modified object */
5398
Tim Petersced69f82003-09-16 20:30:58 +00005399static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005400PyObject *fixup(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005401 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005402{
5403
5404 PyUnicodeObject *u;
5405
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005406 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005407 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005408 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005409
5410 Py_UNICODE_COPY(u->str, self->str, self->length);
5411
Tim Peters7a29bd52001-09-12 03:03:31 +00005412 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005413 /* fixfct should return TRUE if it modified the buffer. If
5414 FALSE, return a reference to the original buffer instead
5415 (to save space, not time) */
5416 Py_INCREF(self);
5417 Py_DECREF(u);
5418 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005419 }
5420 return (PyObject*) u;
5421}
5422
Tim Petersced69f82003-09-16 20:30:58 +00005423static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005424int fixupper(PyUnicodeObject *self)
5425{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005426 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005427 Py_UNICODE *s = self->str;
5428 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005429
Guido van Rossumd57fd912000-03-10 22:53:23 +00005430 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005431 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005432
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005433 ch = Py_UNICODE_TOUPPER(*s);
5434 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005435 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005436 *s = ch;
5437 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005438 s++;
5439 }
5440
5441 return status;
5442}
5443
Tim Petersced69f82003-09-16 20:30:58 +00005444static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005445int fixlower(PyUnicodeObject *self)
5446{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005447 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005448 Py_UNICODE *s = self->str;
5449 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005450
Guido van Rossumd57fd912000-03-10 22:53:23 +00005451 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005452 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005453
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005454 ch = Py_UNICODE_TOLOWER(*s);
5455 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005456 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005457 *s = ch;
5458 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005459 s++;
5460 }
5461
5462 return status;
5463}
5464
Tim Petersced69f82003-09-16 20:30:58 +00005465static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005466int fixswapcase(PyUnicodeObject *self)
5467{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005468 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005469 Py_UNICODE *s = self->str;
5470 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005471
Guido van Rossumd57fd912000-03-10 22:53:23 +00005472 while (len-- > 0) {
5473 if (Py_UNICODE_ISUPPER(*s)) {
5474 *s = Py_UNICODE_TOLOWER(*s);
5475 status = 1;
5476 } else if (Py_UNICODE_ISLOWER(*s)) {
5477 *s = Py_UNICODE_TOUPPER(*s);
5478 status = 1;
5479 }
5480 s++;
5481 }
5482
5483 return status;
5484}
5485
Tim Petersced69f82003-09-16 20:30:58 +00005486static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005487int fixcapitalize(PyUnicodeObject *self)
5488{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005489 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005490 Py_UNICODE *s = self->str;
5491 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005492
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005493 if (len == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005494 return 0;
Ezio Melotti15d6b652011-08-15 09:22:24 +03005495 if (!Py_UNICODE_ISUPPER(*s)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005496 *s = Py_UNICODE_TOUPPER(*s);
5497 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005498 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005499 s++;
5500 while (--len > 0) {
Ezio Melotti15d6b652011-08-15 09:22:24 +03005501 if (!Py_UNICODE_ISLOWER(*s)) {
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005502 *s = Py_UNICODE_TOLOWER(*s);
5503 status = 1;
5504 }
5505 s++;
5506 }
5507 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005508}
5509
5510static
5511int fixtitle(PyUnicodeObject *self)
5512{
5513 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5514 register Py_UNICODE *e;
5515 int previous_is_cased;
5516
5517 /* Shortcut for single character strings */
5518 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005519 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5520 if (*p != ch) {
5521 *p = ch;
5522 return 1;
5523 }
5524 else
5525 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005526 }
Tim Petersced69f82003-09-16 20:30:58 +00005527
Guido van Rossumd57fd912000-03-10 22:53:23 +00005528 e = p + PyUnicode_GET_SIZE(self);
5529 previous_is_cased = 0;
5530 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005531 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005532
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005533 if (previous_is_cased)
5534 *p = Py_UNICODE_TOLOWER(ch);
5535 else
5536 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005537
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005538 if (Py_UNICODE_ISLOWER(ch) ||
5539 Py_UNICODE_ISUPPER(ch) ||
5540 Py_UNICODE_ISTITLE(ch))
5541 previous_is_cased = 1;
5542 else
5543 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005544 }
5545 return 1;
5546}
5547
Tim Peters8ce9f162004-08-27 01:49:32 +00005548PyObject *
5549PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005550{
Tim Peters8ce9f162004-08-27 01:49:32 +00005551 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005552 const Py_UNICODE blank = ' ';
5553 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00005554 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005555 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00005556 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5557 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005558 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5559 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005560 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005561 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005562 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005563
Tim Peters05eba1f2004-08-27 21:32:02 +00005564 fseq = PySequence_Fast(seq, "");
5565 if (fseq == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005566 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005567 }
5568
Tim Peters91879ab2004-08-27 22:35:44 +00005569 /* Grrrr. A codec may be invoked to convert str objects to
5570 * Unicode, and so it's possible to call back into Python code
5571 * during PyUnicode_FromObject(), and so it's possible for a sick
5572 * codec to change the size of fseq (if seq is a list). Therefore
5573 * we have to keep refetching the size -- can't assume seqlen
5574 * is invariant.
5575 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005576 seqlen = PySequence_Fast_GET_SIZE(fseq);
5577 /* If empty sequence, return u"". */
5578 if (seqlen == 0) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005579 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5580 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00005581 }
5582 /* If singleton sequence with an exact Unicode, return that. */
5583 if (seqlen == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005584 item = PySequence_Fast_GET_ITEM(fseq, 0);
5585 if (PyUnicode_CheckExact(item)) {
5586 Py_INCREF(item);
5587 res = (PyUnicodeObject *)item;
5588 goto Done;
5589 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005590 }
5591
Tim Peters05eba1f2004-08-27 21:32:02 +00005592 /* At least two items to join, or one that isn't exact Unicode. */
5593 if (seqlen > 1) {
5594 /* Set up sep and seplen -- they're needed. */
Benjamin Peterson857ce152009-01-31 16:29:18 +00005595 if (separator == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005596 sep = &blank;
5597 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005598 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005599 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005600 internal_separator = PyUnicode_FromObject(separator);
5601 if (internal_separator == NULL)
5602 goto onError;
5603 sep = PyUnicode_AS_UNICODE(internal_separator);
5604 seplen = PyUnicode_GET_SIZE(internal_separator);
5605 /* In case PyUnicode_FromObject() mutated seq. */
5606 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005607 }
5608 }
5609
5610 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005611 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005612 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005613 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005614 res_p = PyUnicode_AS_UNICODE(res);
5615 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005616
Tim Peters05eba1f2004-08-27 21:32:02 +00005617 for (i = 0; i < seqlen; ++i) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005618 Py_ssize_t itemlen;
5619 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005620
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005621 item = PySequence_Fast_GET_ITEM(fseq, i);
5622 /* Convert item to Unicode. */
5623 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5624 PyErr_Format(PyExc_TypeError,
5625 "sequence item %zd: expected string or Unicode,"
5626 " %.80s found",
5627 i, Py_TYPE(item)->tp_name);
5628 goto onError;
5629 }
5630 item = PyUnicode_FromObject(item);
5631 if (item == NULL)
5632 goto onError;
5633 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005634
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005635 /* In case PyUnicode_FromObject() mutated seq. */
5636 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters91879ab2004-08-27 22:35:44 +00005637
Tim Peters8ce9f162004-08-27 01:49:32 +00005638 /* Make sure we have enough space for the separator and the item. */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005639 itemlen = PyUnicode_GET_SIZE(item);
5640 new_res_used = res_used + itemlen;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005641 if (new_res_used < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005642 goto Overflow;
5643 if (i < seqlen - 1) {
5644 new_res_used += seplen;
5645 if (new_res_used < 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00005646 goto Overflow;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005647 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005648 if (new_res_used > res_alloc) {
5649 /* double allocated size until it's big enough */
5650 do {
5651 res_alloc += res_alloc;
5652 if (res_alloc <= 0)
5653 goto Overflow;
5654 } while (new_res_used > res_alloc);
5655 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5656 Py_DECREF(item);
5657 goto onError;
5658 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005659 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005660 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005661
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005662 /* Copy item, and maybe the separator. */
5663 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5664 res_p += itemlen;
5665 if (i < seqlen - 1) {
5666 Py_UNICODE_COPY(res_p, sep, seplen);
5667 res_p += seplen;
5668 }
5669 Py_DECREF(item);
5670 res_used = new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005671 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005672
Tim Peters05eba1f2004-08-27 21:32:02 +00005673 /* Shrink res to match the used area; this probably can't fail,
5674 * but it's cheap to check.
5675 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005676 if (_PyUnicode_Resize(&res, res_used) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005677 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005678
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005679 Done:
Tim Peters8ce9f162004-08-27 01:49:32 +00005680 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005681 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005682 return (PyObject *)res;
5683
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005684 Overflow:
Tim Peters8ce9f162004-08-27 01:49:32 +00005685 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00005686 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005687 Py_DECREF(item);
5688 /* fall through */
5689
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005690 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005691 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005692 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005693 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694 return NULL;
5695}
5696
Tim Petersced69f82003-09-16 20:30:58 +00005697static
5698PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005699 Py_ssize_t left,
5700 Py_ssize_t right,
5701 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005702{
5703 PyUnicodeObject *u;
5704
5705 if (left < 0)
5706 left = 0;
5707 if (right < 0)
5708 right = 0;
5709
Tim Peters7a29bd52001-09-12 03:03:31 +00005710 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005711 Py_INCREF(self);
5712 return self;
5713 }
5714
Neal Norwitze7d8be82008-07-31 17:17:14 +00005715 if (left > PY_SSIZE_T_MAX - self->length ||
5716 right > PY_SSIZE_T_MAX - (left + self->length)) {
5717 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5718 return NULL;
5719 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005720 u = _PyUnicode_New(left + self->length + right);
5721 if (u) {
5722 if (left)
5723 Py_UNICODE_FILL(u->str, fill, left);
5724 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5725 if (right)
5726 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5727 }
5728
5729 return u;
5730}
5731
Antoine Pitrou64672132010-01-13 07:55:48 +00005732PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005733{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005734 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005735
5736 string = PyUnicode_FromObject(string);
5737 if (string == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005738 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005739
Antoine Pitrou64672132010-01-13 07:55:48 +00005740 list = stringlib_splitlines(
5741 (PyObject*) string, PyUnicode_AS_UNICODE(string),
5742 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005743
5744 Py_DECREF(string);
5745 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005746}
5747
Tim Petersced69f82003-09-16 20:30:58 +00005748static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005749PyObject *split(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005750 PyUnicodeObject *substring,
5751 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005752{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005753 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005754 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005755
Guido van Rossumd57fd912000-03-10 22:53:23 +00005756 if (substring == NULL)
Antoine Pitrou64672132010-01-13 07:55:48 +00005757 return stringlib_split_whitespace(
5758 (PyObject*) self, self->str, self->length, maxcount
5759 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005760
Antoine Pitrou64672132010-01-13 07:55:48 +00005761 return stringlib_split(
5762 (PyObject*) self, self->str, self->length,
5763 substring->str, substring->length,
5764 maxcount
5765 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005766}
5767
Tim Petersced69f82003-09-16 20:30:58 +00005768static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005769PyObject *rsplit(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005770 PyUnicodeObject *substring,
5771 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005772{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005773 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005774 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005775
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005776 if (substring == NULL)
Antoine Pitrou64672132010-01-13 07:55:48 +00005777 return stringlib_rsplit_whitespace(
5778 (PyObject*) self, self->str, self->length, maxcount
5779 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005780
Antoine Pitrou64672132010-01-13 07:55:48 +00005781 return stringlib_rsplit(
5782 (PyObject*) self, self->str, self->length,
5783 substring->str, substring->length,
5784 maxcount
5785 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005786}
5787
5788static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005789PyObject *replace(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005790 PyUnicodeObject *str1,
5791 PyUnicodeObject *str2,
5792 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005793{
5794 PyUnicodeObject *u;
5795
5796 if (maxcount < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005797 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrou64672132010-01-13 07:55:48 +00005798 else if (maxcount == 0 || self->length == 0)
5799 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005800
Fredrik Lundh347ee272006-05-24 16:35:18 +00005801 if (str1->length == str2->length) {
Antoine Pitrou5c767c22010-01-13 08:55:20 +00005802 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005803 /* same length */
Antoine Pitrou64672132010-01-13 07:55:48 +00005804 if (str1->length == 0)
5805 goto nothing;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005806 if (str1->length == 1) {
5807 /* replace characters */
5808 Py_UNICODE u1, u2;
5809 if (!findchar(self->str, self->length, str1->str[0]))
5810 goto nothing;
5811 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5812 if (!u)
5813 return NULL;
5814 Py_UNICODE_COPY(u->str, self->str, self->length);
5815 u1 = str1->str[0];
5816 u2 = str2->str[0];
5817 for (i = 0; i < u->length; i++)
5818 if (u->str[i] == u1) {
5819 if (--maxcount < 0)
5820 break;
5821 u->str[i] = u2;
5822 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005823 } else {
Antoine Pitrou64672132010-01-13 07:55:48 +00005824 i = stringlib_find(
5825 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00005826 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005827 if (i < 0)
5828 goto nothing;
5829 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5830 if (!u)
5831 return NULL;
5832 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrou64672132010-01-13 07:55:48 +00005833
5834 /* change everything in-place, starting with this one */
5835 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5836 i += str1->length;
5837
5838 while ( --maxcount > 0) {
5839 i = stringlib_find(self->str+i, self->length-i,
5840 str1->str, str1->length,
5841 i);
5842 if (i == -1)
5843 break;
5844 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5845 i += str1->length;
5846 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005847 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005848 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005849
Brett Cannona7f13ee2010-05-04 01:16:51 +00005850 Py_ssize_t n, i, j;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005851 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005852 Py_UNICODE *p;
5853
5854 /* replace strings */
Antoine Pitrou64672132010-01-13 07:55:48 +00005855 n = stringlib_count(self->str, self->length, str1->str, str1->length,
5856 maxcount);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005857 if (n == 0)
5858 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005859 /* new_size = self->length + n * (str2->length - str1->length)); */
5860 delta = (str2->length - str1->length);
5861 if (delta == 0) {
5862 new_size = self->length;
5863 } else {
5864 product = n * (str2->length - str1->length);
5865 if ((product / (str2->length - str1->length)) != n) {
5866 PyErr_SetString(PyExc_OverflowError,
5867 "replace string is too long");
5868 return NULL;
5869 }
5870 new_size = self->length + product;
5871 if (new_size < 0) {
5872 PyErr_SetString(PyExc_OverflowError,
5873 "replace string is too long");
5874 return NULL;
5875 }
5876 }
5877 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005878 if (!u)
5879 return NULL;
5880 i = 0;
5881 p = u->str;
5882 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005883 while (n-- > 0) {
5884 /* look for next match */
Antoine Pitrou64672132010-01-13 07:55:48 +00005885 j = stringlib_find(self->str+i, self->length-i,
5886 str1->str, str1->length,
5887 i);
5888 if (j == -1)
5889 break;
5890 else if (j > i) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005891 /* copy unchanged part [i:j] */
5892 Py_UNICODE_COPY(p, self->str+i, j-i);
5893 p += j - i;
5894 }
5895 /* copy substitution string */
5896 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005897 Py_UNICODE_COPY(p, str2->str, str2->length);
5898 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005899 }
5900 i = j + str1->length;
5901 }
5902 if (i < self->length)
5903 /* copy tail [i:] */
5904 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005905 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005906 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00005907 while (n > 0) {
5908 Py_UNICODE_COPY(p, str2->str, str2->length);
5909 p += str2->length;
5910 if (--n <= 0)
5911 break;
5912 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005913 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00005914 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005915 }
5916 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005917 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005918
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005919 nothing:
Fredrik Lundh347ee272006-05-24 16:35:18 +00005920 /* nothing to replace; return original string (when possible) */
5921 if (PyUnicode_CheckExact(self)) {
5922 Py_INCREF(self);
5923 return (PyObject *) self;
5924 }
5925 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005926}
5927
5928/* --- Unicode Object Methods --------------------------------------------- */
5929
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005930PyDoc_STRVAR(title__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005931 "S.title() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005932\n\
5933Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005934characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005935
5936static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005937unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005938{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005939 return fixup(self, fixtitle);
5940}
5941
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005942PyDoc_STRVAR(capitalize__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005943 "S.capitalize() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005944\n\
5945Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumaran5261b102010-07-05 12:04:07 +00005946have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005947
5948static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005949unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005950{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005951 return fixup(self, fixcapitalize);
5952}
5953
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split self into a list, run
   every list item through fixup(..., fixcapitalize), then join the list
   again with PyUnicode_Join().  Returns NULL if any step fails. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list slot */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *capped = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, pos),
                                 fixcapitalize);
        if (capped == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* drop the old item before the slot is overwritten */
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, capped);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
5991
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005992/* Argument converter. Coerces to a single unicode character */
5993
5994static int
5995convert_uc(PyObject *obj, void *addr)
5996{
Benjamin Peterson857ce152009-01-31 16:29:18 +00005997 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5998 PyObject *uniobj;
5999 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006000
Benjamin Peterson857ce152009-01-31 16:29:18 +00006001 uniobj = PyUnicode_FromObject(obj);
6002 if (uniobj == NULL) {
6003 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006004 "The fill character cannot be converted to Unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00006005 return 0;
6006 }
6007 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6008 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006009 "The fill character must be exactly one character long");
Benjamin Peterson857ce152009-01-31 16:29:18 +00006010 Py_DECREF(uniobj);
6011 return 0;
6012 }
6013 unistr = PyUnicode_AS_UNICODE(uniobj);
6014 *fillcharloc = unistr[0];
6015 Py_DECREF(uniobj);
6016 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006017}
6018
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006019PyDoc_STRVAR(center__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006020 "S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006021\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006022Return S centered in a Unicode string of length width. Padding is\n\
6023done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006024
6025static PyObject *
6026unicode_center(PyUnicodeObject *self, PyObject *args)
6027{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006028 Py_ssize_t marg, left;
6029 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006030 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006031
Thomas Woutersde017742006-02-16 19:34:37 +00006032 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006033 return NULL;
6034
Tim Peters7a29bd52001-09-12 03:03:31 +00006035 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006036 Py_INCREF(self);
6037 return (PyObject*) self;
6038 }
6039
6040 marg = width - self->length;
6041 left = marg / 2 + (marg & width & 1);
6042
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006043 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006044}
6045
Marc-André Lemburge5034372000-08-08 08:04:29 +00006046#if 0
6047
6048/* This code should go into some future Unicode collation support
6049 module. The basic comparison should compare ordinals on a naive
Georg Brandl18187e22009-06-06 18:21:58 +00006050 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006051
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006052/* speedy UTF-16 code point order comparison */
6053/* gleaned from: */
6054/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6055
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006056static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006057{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006058 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006059 0, 0, 0, 0, 0, 0, 0, 0,
6060 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006061 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006062};
6063
Guido van Rossumd57fd912000-03-10 22:53:23 +00006064static int
6065unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6066{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006067 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006068
Guido van Rossumd57fd912000-03-10 22:53:23 +00006069 Py_UNICODE *s1 = str1->str;
6070 Py_UNICODE *s2 = str2->str;
6071
6072 len1 = str1->length;
6073 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006074
Guido van Rossumd57fd912000-03-10 22:53:23 +00006075 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006076 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006077
6078 c1 = *s1++;
6079 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006080
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006081 if (c1 > (1<<11) * 26)
6082 c1 += utf16Fixup[c1>>11];
6083 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006084 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006085 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006086
6087 if (c1 != c2)
6088 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006089
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006090 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006091 }
6092
6093 return (len1 < len2) ? -1 : (len1 != len2);
6094}
6095
Marc-André Lemburge5034372000-08-08 08:04:29 +00006096#else
6097
6098static int
6099unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6100{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006101 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006102
6103 Py_UNICODE *s1 = str1->str;
6104 Py_UNICODE *s2 = str2->str;
6105
6106 len1 = str1->length;
6107 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006108
Marc-André Lemburge5034372000-08-08 08:04:29 +00006109 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006110 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006111
Fredrik Lundh45714e92001-06-26 16:39:36 +00006112 c1 = *s1++;
6113 c2 = *s2++;
6114
6115 if (c1 != c2)
6116 return (c1 < c2) ? -1 : 1;
6117
Marc-André Lemburge5034372000-08-08 08:04:29 +00006118 len1--; len2--;
6119 }
6120
6121 return (len1 < len2) ? -1 : (len1 != len2);
6122}
6123
6124#endif
6125
Guido van Rossumd57fd912000-03-10 22:53:23 +00006126int PyUnicode_Compare(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006127 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006128{
6129 PyUnicodeObject *u = NULL, *v = NULL;
6130 int result;
6131
6132 /* Coerce the two arguments */
6133 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6134 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006135 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006136 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6137 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006138 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006139
Thomas Wouters7e474022000-07-16 12:04:32 +00006140 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006141 if (v == u) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006142 Py_DECREF(u);
6143 Py_DECREF(v);
6144 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006145 }
6146
6147 result = unicode_compare(u, v);
6148
6149 Py_DECREF(u);
6150 Py_DECREF(v);
6151 return result;
6152
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006153 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006154 Py_XDECREF(u);
6155 Py_XDECREF(v);
6156 return -1;
6157}
6158
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006159PyObject *PyUnicode_RichCompare(PyObject *left,
6160 PyObject *right,
6161 int op)
6162{
6163 int result;
6164
6165 result = PyUnicode_Compare(left, right);
6166 if (result == -1 && PyErr_Occurred())
6167 goto onError;
6168
6169 /* Convert the return value to a Boolean */
6170 switch (op) {
6171 case Py_EQ:
6172 result = (result == 0);
6173 break;
6174 case Py_NE:
6175 result = (result != 0);
6176 break;
6177 case Py_LE:
6178 result = (result <= 0);
6179 break;
6180 case Py_GE:
6181 result = (result >= 0);
6182 break;
6183 case Py_LT:
6184 result = (result == -1);
6185 break;
6186 case Py_GT:
6187 result = (result == 1);
6188 break;
6189 }
6190 return PyBool_FromLong(result);
6191
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006192 onError:
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006193
6194 /* Standard case
6195
6196 Type errors mean that PyUnicode_FromObject() could not convert
6197 one of the arguments (usually the right hand side) to Unicode,
6198 ie. we can't handle the comparison request. However, it is
6199 possible that the other object knows a comparison method, which
6200 is why we return Py_NotImplemented to give the other object a
6201 chance.
6202
6203 */
6204 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6205 PyErr_Clear();
6206 Py_INCREF(Py_NotImplemented);
6207 return Py_NotImplemented;
6208 }
6209 if (op != Py_EQ && op != Py_NE)
6210 return NULL;
6211
6212 /* Equality comparison.
6213
6214 This is a special case: we silence any PyExc_UnicodeDecodeError
6215 and instead turn it into a PyErr_UnicodeWarning.
6216
6217 */
6218 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6219 return NULL;
6220 PyErr_Clear();
Benjamin Peterson857ce152009-01-31 16:29:18 +00006221 if (PyErr_Warn(PyExc_UnicodeWarning,
6222 (op == Py_EQ) ?
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006223 "Unicode equal comparison "
6224 "failed to convert both arguments to Unicode - "
6225 "interpreting them as being unequal" :
6226 "Unicode unequal comparison "
6227 "failed to convert both arguments to Unicode - "
6228 "interpreting them as being unequal"
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006229 ) < 0)
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006230 return NULL;
6231 result = (op == Py_NE);
6232 return PyBool_FromLong(result);
6233}
6234
Guido van Rossum403d68b2000-03-13 15:55:09 +00006235int PyUnicode_Contains(PyObject *container,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006236 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006237{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006238 PyObject *str, *sub;
6239 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006240
6241 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006242 sub = PyUnicode_FromObject(element);
6243 if (!sub) {
Fredrik Lundh833bf942006-05-23 10:12:21 +00006244 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006245 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00006246
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006247 str = PyUnicode_FromObject(container);
6248 if (!str) {
6249 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00006250 return -1;
6251 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00006252
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006253 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00006254
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006255 Py_DECREF(str);
6256 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00006257
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006258 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006259}
6260
Guido van Rossumd57fd912000-03-10 22:53:23 +00006261/* Concat to string or Unicode object giving a new Unicode object. */
6262
6263PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006264 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006265{
6266 PyUnicodeObject *u = NULL, *v = NULL, *w;
6267
6268 /* Coerce the two arguments */
6269 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6270 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006271 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006272 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6273 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006274 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006275
6276 /* Shortcuts */
6277 if (v == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006278 Py_DECREF(v);
6279 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006280 }
6281 if (u == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006282 Py_DECREF(u);
6283 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006284 }
6285
6286 /* Concat the two Unicode strings */
6287 w = _PyUnicode_New(u->length + v->length);
6288 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006289 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006290 Py_UNICODE_COPY(w->str, u->str, u->length);
6291 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6292
6293 Py_DECREF(u);
6294 Py_DECREF(v);
6295 return (PyObject *)w;
6296
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006297 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006298 Py_XDECREF(u);
6299 Py_XDECREF(v);
6300 return NULL;
6301}
6302
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006303PyDoc_STRVAR(count__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006304 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006305\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00006306Return the number of non-overlapping occurrences of substring sub in\n\
6307Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006308interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006309
6310static PyObject *
6311unicode_count(PyUnicodeObject *self, PyObject *args)
6312{
6313 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006314 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006315 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006316 PyObject *result;
6317
Jesus Cea44e81682011-04-20 16:39:15 +02006318 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
6319 &start, &end))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006320 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006321
Antoine Pitrou64672132010-01-13 07:55:48 +00006322 ADJUST_INDICES(start, end, self->length);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006323 result = PyInt_FromSsize_t(
6324 stringlib_count(self->str + start, end - start,
Antoine Pitrou64672132010-01-13 07:55:48 +00006325 substring->str, substring->length,
6326 PY_SSIZE_T_MAX)
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006327 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006328
6329 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006330
Guido van Rossumd57fd912000-03-10 22:53:23 +00006331 return result;
6332}
6333
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006334PyDoc_STRVAR(encode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006335 "S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006336\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006337Encodes S using the codec registered for encoding. encoding defaults\n\
6338to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006339handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006340a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6341'xmlcharrefreplace' as well as any other name registered with\n\
6342codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006343
6344static PyObject *
Benjamin Peterson332d7212009-09-18 21:14:55 +00006345unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006346{
Benjamin Peterson332d7212009-09-18 21:14:55 +00006347 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00006348 char *encoding = NULL;
6349 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006350 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006351
Benjamin Peterson332d7212009-09-18 21:14:55 +00006352 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
6353 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006354 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006355 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006356 if (v == NULL)
6357 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006358 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006359 PyErr_Format(PyExc_TypeError,
6360 "encoder did not return a string/unicode object "
6361 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006362 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006363 Py_DECREF(v);
6364 return NULL;
6365 }
6366 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006367
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006368 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006369 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006370}
6371
6372PyDoc_STRVAR(decode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006373 "S.decode([encoding[,errors]]) -> string or unicode\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006374\n\
6375Decodes S using the codec registered for encoding. encoding defaults\n\
6376to the default encoding. errors may be given to set a different error\n\
6377handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6378a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
Éric Araujoa4c81b02012-02-20 02:07:31 +01006379as well as any other name registered with codecs.register_error that is\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006380able to handle UnicodeDecodeErrors.");
6381
6382static PyObject *
Benjamin Peterson332d7212009-09-18 21:14:55 +00006383unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006384{
Benjamin Peterson332d7212009-09-18 21:14:55 +00006385 static char *kwlist[] = {"encoding", "errors", 0};
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006386 char *encoding = NULL;
6387 char *errors = NULL;
6388 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006389
Benjamin Peterson332d7212009-09-18 21:14:55 +00006390 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",
6391 kwlist, &encoding, &errors))
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006392 return NULL;
6393 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006394 if (v == NULL)
6395 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006396 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006397 PyErr_Format(PyExc_TypeError,
6398 "decoder did not return a string/unicode object "
6399 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006400 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006401 Py_DECREF(v);
6402 return NULL;
6403 }
6404 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006405
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006406 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006407 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006408}
6409
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006410PyDoc_STRVAR(expandtabs__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006411 "S.expandtabs([tabsize]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006412\n\
6413Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006414If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006415
6416static PyObject*
6417unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6418{
6419 Py_UNICODE *e;
6420 Py_UNICODE *p;
6421 Py_UNICODE *q;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006422 Py_UNICODE *qe;
6423 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006424 PyUnicodeObject *u;
6425 int tabsize = 8;
6426
6427 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006428 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006429
Thomas Wouters7e474022000-07-16 12:04:32 +00006430 /* First pass: determine size of output string */
Guido van Rossum5bdff602008-03-11 21:18:06 +00006431 i = 0; /* chars up to and including most recent \n or \r */
6432 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6433 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006434 for (p = self->str; p < e; p++)
6435 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006436 if (tabsize > 0) {
6437 incr = tabsize - (j % tabsize); /* cannot overflow */
6438 if (j > PY_SSIZE_T_MAX - incr)
6439 goto overflow1;
6440 j += incr;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006441 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006442 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006443 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006444 if (j > PY_SSIZE_T_MAX - 1)
6445 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006446 j++;
6447 if (*p == '\n' || *p == '\r') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006448 if (i > PY_SSIZE_T_MAX - j)
6449 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006450 i += j;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006451 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006452 }
6453 }
6454
Guido van Rossum5bdff602008-03-11 21:18:06 +00006455 if (i > PY_SSIZE_T_MAX - j)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006456 goto overflow1;
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006457
Guido van Rossumd57fd912000-03-10 22:53:23 +00006458 /* Second pass: create output string and fill it */
6459 u = _PyUnicode_New(i + j);
6460 if (!u)
6461 return NULL;
6462
Guido van Rossum5bdff602008-03-11 21:18:06 +00006463 j = 0; /* same as in first pass */
6464 q = u->str; /* next output char */
6465 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006466
6467 for (p = self->str; p < e; p++)
6468 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006469 if (tabsize > 0) {
6470 i = tabsize - (j % tabsize);
6471 j += i;
6472 while (i--) {
6473 if (q >= qe)
6474 goto overflow2;
6475 *q++ = ' ';
Guido van Rossum5bdff602008-03-11 21:18:06 +00006476 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006477 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00006478 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006479 else {
6480 if (q >= qe)
6481 goto overflow2;
6482 *q++ = *p;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006483 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006484 if (*p == '\n' || *p == '\r')
6485 j = 0;
6486 }
6487
6488 return (PyObject*) u;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006489
6490 overflow2:
6491 Py_DECREF(u);
6492 overflow1:
6493 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6494 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006495}
6496
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006497PyDoc_STRVAR(find__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006498 "S.find(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006499\n\
6500Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran5e3a19d2011-07-27 23:36:51 +08006501such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006502arguments start and end are interpreted as in slice notation.\n\
6503\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006504Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006505
6506static PyObject *
6507unicode_find(PyUnicodeObject *self, PyObject *args)
6508{
Jesus Cea44e81682011-04-20 16:39:15 +02006509 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006510 Py_ssize_t start;
6511 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006512 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006513
Jesus Cea44e81682011-04-20 16:39:15 +02006514 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
6515 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006516 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006517
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006518 result = stringlib_find_slice(
6519 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6520 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6521 start, end
6522 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006523
6524 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006525
6526 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006527}
6528
6529static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006530unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006531{
6532 if (index < 0 || index >= self->length) {
6533 PyErr_SetString(PyExc_IndexError, "string index out of range");
6534 return NULL;
6535 }
6536
6537 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6538}
6539
6540static long
6541unicode_hash(PyUnicodeObject *self)
6542{
Fredrik Lundhdde61642000-07-10 18:27:47 +00006543 /* Since Unicode objects compare equal to their ASCII string
6544 counterparts, they should use the individual character values
6545 as basis for their hash value. This is needed to assure that
6546 strings and Unicode objects behave in the same way as
6547 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006548
Martin v. Löwis18e16552006-02-15 17:27:45 +00006549 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006550 register Py_UNICODE *p;
6551 register long x;
6552
Benjamin Petersonf51c3842012-04-09 14:53:07 -04006553#ifdef Py_DEBUG
Benjamin Peterson26da9202012-02-21 11:08:50 -05006554 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf51c3842012-04-09 14:53:07 -04006555#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006556 if (self->hash != -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006557 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006558 len = PyUnicode_GET_SIZE(self);
Barry Warsaw1e13eb02012-02-20 20:42:21 -05006559 /*
6560 We make the hash of the empty string be 0, rather than using
6561 (prefix ^ suffix), since this slightly obfuscates the hash secret
6562 */
6563 if (len == 0) {
6564 self->hash = 0;
6565 return 0;
6566 }
Fredrik Lundhdde61642000-07-10 18:27:47 +00006567 p = PyUnicode_AS_UNICODE(self);
Barry Warsaw1e13eb02012-02-20 20:42:21 -05006568 x = _Py_HashSecret.prefix;
6569 x ^= *p << 7;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006570 while (--len >= 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006571 x = (1000003*x) ^ *p++;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006572 x ^= PyUnicode_GET_SIZE(self);
Barry Warsaw1e13eb02012-02-20 20:42:21 -05006573 x ^= _Py_HashSecret.suffix;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006574 if (x == -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006575 x = -2;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006576 self->hash = x;
6577 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006578}
6579
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006580PyDoc_STRVAR(index__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006581 "S.index(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006582\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006583Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006584
6585static PyObject *
6586unicode_index(PyUnicodeObject *self, PyObject *args)
6587{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006588 Py_ssize_t result;
Jesus Cea44e81682011-04-20 16:39:15 +02006589 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006590 Py_ssize_t start;
6591 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006592
Jesus Cea44e81682011-04-20 16:39:15 +02006593 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
6594 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006595 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006596
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006597 result = stringlib_find_slice(
6598 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6599 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6600 start, end
6601 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006602
6603 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006604
Guido van Rossumd57fd912000-03-10 22:53:23 +00006605 if (result < 0) {
6606 PyErr_SetString(PyExc_ValueError, "substring not found");
6607 return NULL;
6608 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006609
Martin v. Löwis18e16552006-02-15 17:27:45 +00006610 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006611}
6612
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006613PyDoc_STRVAR(islower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006614 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006615\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006616Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006617at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006618
6619static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006620unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006621{
6622 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6623 register const Py_UNICODE *e;
6624 int cased;
6625
Guido van Rossumd57fd912000-03-10 22:53:23 +00006626 /* Shortcut for single character strings */
6627 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006628 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006629
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006630 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006631 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006632 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006633
Guido van Rossumd57fd912000-03-10 22:53:23 +00006634 e = p + PyUnicode_GET_SIZE(self);
6635 cased = 0;
6636 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006637 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006638
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006639 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6640 return PyBool_FromLong(0);
6641 else if (!cased && Py_UNICODE_ISLOWER(ch))
6642 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006643 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006644 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006645}
6646
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006647PyDoc_STRVAR(isupper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006648 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006649\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006650Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006651at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006652
6653static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006654unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006655{
6656 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6657 register const Py_UNICODE *e;
6658 int cased;
6659
Guido van Rossumd57fd912000-03-10 22:53:23 +00006660 /* Shortcut for single character strings */
6661 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006662 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006663
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006664 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006665 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006666 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006667
Guido van Rossumd57fd912000-03-10 22:53:23 +00006668 e = p + PyUnicode_GET_SIZE(self);
6669 cased = 0;
6670 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006671 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006672
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006673 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6674 return PyBool_FromLong(0);
6675 else if (!cased && Py_UNICODE_ISUPPER(ch))
6676 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006677 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006678 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006679}
6680
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006681PyDoc_STRVAR(istitle__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006682 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006683\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006684Return True if S is a titlecased string and there is at least one\n\
6685character in S, i.e. upper- and titlecase characters may only\n\
6686follow uncased characters and lowercase characters only cased ones.\n\
6687Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006688
6689static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006690unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006691{
6692 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6693 register const Py_UNICODE *e;
6694 int cased, previous_is_cased;
6695
Guido van Rossumd57fd912000-03-10 22:53:23 +00006696 /* Shortcut for single character strings */
6697 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006698 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6699 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006700
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006701 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006702 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006703 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006704
Guido van Rossumd57fd912000-03-10 22:53:23 +00006705 e = p + PyUnicode_GET_SIZE(self);
6706 cased = 0;
6707 previous_is_cased = 0;
6708 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006709 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006710
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006711 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6712 if (previous_is_cased)
6713 return PyBool_FromLong(0);
6714 previous_is_cased = 1;
6715 cased = 1;
6716 }
6717 else if (Py_UNICODE_ISLOWER(ch)) {
6718 if (!previous_is_cased)
6719 return PyBool_FromLong(0);
6720 previous_is_cased = 1;
6721 cased = 1;
6722 }
6723 else
6724 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006725 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006726 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006727}
6728
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006729PyDoc_STRVAR(isspace__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006730 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006731\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006732Return True if all characters in S are whitespace\n\
6733and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006734
6735static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006736unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006737{
6738 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6739 register const Py_UNICODE *e;
6740
Guido van Rossumd57fd912000-03-10 22:53:23 +00006741 /* Shortcut for single character strings */
6742 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006743 Py_UNICODE_ISSPACE(*p))
6744 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006745
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006746 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006747 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006748 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006749
Guido van Rossumd57fd912000-03-10 22:53:23 +00006750 e = p + PyUnicode_GET_SIZE(self);
6751 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006752 if (!Py_UNICODE_ISSPACE(*p))
6753 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006754 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006755 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006756}
6757
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006758PyDoc_STRVAR(isalpha__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006759 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006760\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006761Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006762and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006763
6764static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006765unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006766{
6767 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6768 register const Py_UNICODE *e;
6769
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006770 /* Shortcut for single character strings */
6771 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006772 Py_UNICODE_ISALPHA(*p))
6773 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006774
6775 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006776 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006777 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006778
6779 e = p + PyUnicode_GET_SIZE(self);
6780 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006781 if (!Py_UNICODE_ISALPHA(*p))
6782 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006783 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006784 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006785}
6786
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006787PyDoc_STRVAR(isalnum__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006788 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006789\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006790Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006791and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006792
6793static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006794unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006795{
6796 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6797 register const Py_UNICODE *e;
6798
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006799 /* Shortcut for single character strings */
6800 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006801 Py_UNICODE_ISALNUM(*p))
6802 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006803
6804 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006805 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006806 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006807
6808 e = p + PyUnicode_GET_SIZE(self);
6809 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006810 if (!Py_UNICODE_ISALNUM(*p))
6811 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006812 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006813 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006814}
6815
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006816PyDoc_STRVAR(isdecimal__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006817 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006818\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006819Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006820False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006821
6822static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006823unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006824{
6825 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6826 register const Py_UNICODE *e;
6827
Guido van Rossumd57fd912000-03-10 22:53:23 +00006828 /* Shortcut for single character strings */
6829 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006830 Py_UNICODE_ISDECIMAL(*p))
6831 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006832
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006833 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006834 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006835 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006836
Guido van Rossumd57fd912000-03-10 22:53:23 +00006837 e = p + PyUnicode_GET_SIZE(self);
6838 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006839 if (!Py_UNICODE_ISDECIMAL(*p))
6840 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006841 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006842 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006843}
6844
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006845PyDoc_STRVAR(isdigit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006846 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006847\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006848Return True if all characters in S are digits\n\
6849and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006850
6851static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006852unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006853{
6854 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6855 register const Py_UNICODE *e;
6856
Guido van Rossumd57fd912000-03-10 22:53:23 +00006857 /* Shortcut for single character strings */
6858 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006859 Py_UNICODE_ISDIGIT(*p))
6860 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006861
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006862 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006863 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006864 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006865
Guido van Rossumd57fd912000-03-10 22:53:23 +00006866 e = p + PyUnicode_GET_SIZE(self);
6867 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006868 if (!Py_UNICODE_ISDIGIT(*p))
6869 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006870 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006871 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006872}
6873
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006874PyDoc_STRVAR(isnumeric__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006875 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006876\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006877Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006878False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006879
6880static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006881unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006882{
6883 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6884 register const Py_UNICODE *e;
6885
Guido van Rossumd57fd912000-03-10 22:53:23 +00006886 /* Shortcut for single character strings */
6887 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006888 Py_UNICODE_ISNUMERIC(*p))
6889 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006890
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006891 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006892 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006893 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006894
Guido van Rossumd57fd912000-03-10 22:53:23 +00006895 e = p + PyUnicode_GET_SIZE(self);
6896 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006897 if (!Py_UNICODE_ISNUMERIC(*p))
6898 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006899 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006900 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006901}
6902
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006903PyDoc_STRVAR(join__doc__,
Georg Brandl9b4e5822009-10-14 18:48:32 +00006904 "S.join(iterable) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006905\n\
6906Return a string which is the concatenation of the strings in the\n\
Georg Brandl9b4e5822009-10-14 18:48:32 +00006907iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006908
6909static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006910unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006911{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006912 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006913}
6914
Martin v. Löwis18e16552006-02-15 17:27:45 +00006915static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006916unicode_length(PyUnicodeObject *self)
6917{
6918 return self->length;
6919}
6920
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006921PyDoc_STRVAR(ljust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006922 "S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006923\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00006924Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006925done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006926
6927static PyObject *
6928unicode_ljust(PyUnicodeObject *self, PyObject *args)
6929{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006930 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006931 Py_UNICODE fillchar = ' ';
6932
Martin v. Löwis412fb672006-04-13 06:34:32 +00006933 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006934 return NULL;
6935
Tim Peters7a29bd52001-09-12 03:03:31 +00006936 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006937 Py_INCREF(self);
6938 return (PyObject*) self;
6939 }
6940
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006941 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006942}
6943
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006944PyDoc_STRVAR(lower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006945 "S.lower() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006946\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006947Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006948
6949static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006950unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006951{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006952 return fixup(self, fixlower);
6953}
6954
/* Strip modes: which end(s) of the string get stripped.  The values
   double as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by strip mode */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Bare method name for a strip mode: the format string without its
   three-character "|O:" prefix */
#define STRIPNAME(i) (&stripformat[i][3])
6963
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006964/* externally visible for str.strip(unicode) */
6965PyObject *
6966_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6967{
Benjamin Peterson857ce152009-01-31 16:29:18 +00006968 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6969 Py_ssize_t len = PyUnicode_GET_SIZE(self);
6970 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
6971 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6972 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006973
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006974 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006975
Benjamin Peterson857ce152009-01-31 16:29:18 +00006976 i = 0;
6977 if (striptype != RIGHTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006978 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6979 i++;
6980 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00006981 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006982
Benjamin Peterson857ce152009-01-31 16:29:18 +00006983 j = len;
6984 if (striptype != LEFTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006985 do {
6986 j--;
6987 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6988 j++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006989 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006990
Benjamin Peterson857ce152009-01-31 16:29:18 +00006991 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006992 Py_INCREF(self);
6993 return (PyObject*)self;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006994 }
6995 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006996 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006997}
6998
Guido van Rossumd57fd912000-03-10 22:53:23 +00006999
7000static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007001do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007002{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007003 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7004 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007005
Benjamin Peterson857ce152009-01-31 16:29:18 +00007006 i = 0;
7007 if (striptype != RIGHTSTRIP) {
7008 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7009 i++;
7010 }
7011 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007012
Benjamin Peterson857ce152009-01-31 16:29:18 +00007013 j = len;
7014 if (striptype != LEFTSTRIP) {
7015 do {
7016 j--;
7017 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7018 j++;
7019 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007020
Benjamin Peterson857ce152009-01-31 16:29:18 +00007021 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7022 Py_INCREF(self);
7023 return (PyObject*)self;
7024 }
7025 else
7026 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007027}
7028
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007029
7030static PyObject *
7031do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7032{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007033 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007034
Benjamin Peterson857ce152009-01-31 16:29:18 +00007035 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7036 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007037
Benjamin Peterson857ce152009-01-31 16:29:18 +00007038 if (sep != NULL && sep != Py_None) {
7039 if (PyUnicode_Check(sep))
7040 return _PyUnicode_XStrip(self, striptype, sep);
7041 else if (PyString_Check(sep)) {
7042 PyObject *res;
7043 sep = PyUnicode_FromObject(sep);
7044 if (sep==NULL)
7045 return NULL;
7046 res = _PyUnicode_XStrip(self, striptype, sep);
7047 Py_DECREF(sep);
7048 return res;
7049 }
7050 else {
7051 PyErr_Format(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007052 "%s arg must be None, unicode or str",
7053 STRIPNAME(striptype));
Benjamin Peterson857ce152009-01-31 16:29:18 +00007054 return NULL;
7055 }
7056 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007057
Benjamin Peterson857ce152009-01-31 16:29:18 +00007058 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007059}
7060
7061
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007062PyDoc_STRVAR(strip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007063 "S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007064\n\
7065Return a copy of the string S with leading and trailing\n\
7066whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007067If chars is given and not None, remove characters in chars instead.\n\
7068If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007069
7070static PyObject *
7071unicode_strip(PyUnicodeObject *self, PyObject *args)
7072{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007073 if (PyTuple_GET_SIZE(args) == 0)
7074 return do_strip(self, BOTHSTRIP); /* Common case */
7075 else
7076 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007077}
7078
7079
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007080PyDoc_STRVAR(lstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007081 "S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007082\n\
7083Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007084If chars is given and not None, remove characters in chars instead.\n\
7085If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007086
7087static PyObject *
7088unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7089{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007090 if (PyTuple_GET_SIZE(args) == 0)
7091 return do_strip(self, LEFTSTRIP); /* Common case */
7092 else
7093 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007094}
7095
7096
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007097PyDoc_STRVAR(rstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007098 "S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007099\n\
7100Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007101If chars is given and not None, remove characters in chars instead.\n\
7102If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007103
7104static PyObject *
7105unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7106{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007107 if (PyTuple_GET_SIZE(args) == 0)
7108 return do_strip(self, RIGHTSTRIP); /* Common case */
7109 else
7110 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007111}
7112
7113
Guido van Rossumd57fd912000-03-10 22:53:23 +00007114static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007115unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007116{
7117 PyUnicodeObject *u;
7118 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007119 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007120 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007121
7122 if (len < 0)
7123 len = 0;
7124
Tim Peters7a29bd52001-09-12 03:03:31 +00007125 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007126 /* no repeat, return original string */
7127 Py_INCREF(str);
7128 return (PyObject*) str;
7129 }
Tim Peters8f422462000-09-09 06:13:41 +00007130
7131 /* ensure # of chars needed doesn't overflow int and # of bytes
7132 * needed doesn't overflow size_t
7133 */
7134 nchars = len * str->length;
7135 if (len && nchars / len != str->length) {
7136 PyErr_SetString(PyExc_OverflowError,
7137 "repeated string is too long");
7138 return NULL;
7139 }
7140 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7141 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7142 PyErr_SetString(PyExc_OverflowError,
7143 "repeated string is too long");
7144 return NULL;
7145 }
7146 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007147 if (!u)
7148 return NULL;
7149
7150 p = u->str;
7151
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007152 if (str->length == 1 && len > 0) {
7153 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007154 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007155 Py_ssize_t done = 0; /* number of characters copied this far */
7156 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007157 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007158 done = str->length;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007159 }
7160 while (done < nchars) {
Neal Norwitz4677fbf72008-03-25 04:18:18 +00007161 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007162 Py_UNICODE_COPY(p+done, p, n);
7163 done += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007164 }
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007165 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007166
7167 return (PyObject*) u;
7168}
7169
7170PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007171 PyObject *subobj,
7172 PyObject *replobj,
7173 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007174{
7175 PyObject *self;
7176 PyObject *str1;
7177 PyObject *str2;
7178 PyObject *result;
7179
7180 self = PyUnicode_FromObject(obj);
7181 if (self == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007182 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007183 str1 = PyUnicode_FromObject(subobj);
7184 if (str1 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007185 Py_DECREF(self);
7186 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007187 }
7188 str2 = PyUnicode_FromObject(replobj);
7189 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007190 Py_DECREF(self);
7191 Py_DECREF(str1);
7192 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007193 }
Tim Petersced69f82003-09-16 20:30:58 +00007194 result = replace((PyUnicodeObject *)self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007195 (PyUnicodeObject *)str1,
7196 (PyUnicodeObject *)str2,
7197 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007198 Py_DECREF(self);
7199 Py_DECREF(str1);
7200 Py_DECREF(str2);
7201 return result;
7202}
7203
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007204PyDoc_STRVAR(replace__doc__,
Ezio Melotti2f06b782010-06-26 18:44:42 +00007205 "S.replace(old, new[, count]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007206\n\
7207Return a copy of S with all occurrences of substring\n\
Georg Brandl30fadc12008-05-30 07:54:16 +00007208old replaced by new. If the optional argument count is\n\
7209given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007210
7211static PyObject*
7212unicode_replace(PyUnicodeObject *self, PyObject *args)
7213{
7214 PyUnicodeObject *str1;
7215 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007216 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007217 PyObject *result;
7218
Martin v. Löwis18e16552006-02-15 17:27:45 +00007219 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007220 return NULL;
7221 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7222 if (str1 == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007223 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007224 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007225 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007226 Py_DECREF(str1);
7227 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007228 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007229
7230 result = replace(self, str1, str2, maxcount);
7231
7232 Py_DECREF(str1);
7233 Py_DECREF(str2);
7234 return result;
7235}
7236
7237static
7238PyObject *unicode_repr(PyObject *unicode)
7239{
7240 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007241 PyUnicode_GET_SIZE(unicode),
7242 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007243}
7244
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007245PyDoc_STRVAR(rfind__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007246 "S.rfind(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007247\n\
7248Return the highest index in S where substring sub is found,\n\
Senthil Kumaran5e3a19d2011-07-27 23:36:51 +08007249such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007250arguments start and end are interpreted as in slice notation.\n\
7251\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007252Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007253
7254static PyObject *
7255unicode_rfind(PyUnicodeObject *self, PyObject *args)
7256{
Jesus Cea44e81682011-04-20 16:39:15 +02007257 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007258 Py_ssize_t start;
7259 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007260 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007261
Jesus Cea44e81682011-04-20 16:39:15 +02007262 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
7263 &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007264 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007265
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007266 result = stringlib_rfind_slice(
7267 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7268 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7269 start, end
7270 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007271
7272 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007273
7274 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007275}
7276
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007277PyDoc_STRVAR(rindex__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007278 "S.rindex(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007279\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007280Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007281
7282static PyObject *
7283unicode_rindex(PyUnicodeObject *self, PyObject *args)
7284{
Jesus Cea44e81682011-04-20 16:39:15 +02007285 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007286 Py_ssize_t start;
7287 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007288 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007289
Jesus Cea44e81682011-04-20 16:39:15 +02007290 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
7291 &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007292 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007293
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007294 result = stringlib_rfind_slice(
7295 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7296 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7297 start, end
7298 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007299
7300 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007301
Guido van Rossumd57fd912000-03-10 22:53:23 +00007302 if (result < 0) {
7303 PyErr_SetString(PyExc_ValueError, "substring not found");
7304 return NULL;
7305 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007306 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007307}
7308
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007309PyDoc_STRVAR(rjust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007310 "S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007311\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007312Return S right-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007313done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007314
7315static PyObject *
7316unicode_rjust(PyUnicodeObject *self, PyObject *args)
7317{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007318 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007319 Py_UNICODE fillchar = ' ';
7320
Martin v. Löwis412fb672006-04-13 06:34:32 +00007321 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007322 return NULL;
7323
Tim Peters7a29bd52001-09-12 03:03:31 +00007324 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007325 Py_INCREF(self);
7326 return (PyObject*) self;
7327 }
7328
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007329 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007330}
7331
Guido van Rossumd57fd912000-03-10 22:53:23 +00007332static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007333unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007334{
7335 /* standard clamping */
7336 if (start < 0)
7337 start = 0;
7338 if (end < 0)
7339 end = 0;
7340 if (end > self->length)
7341 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007342 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007343 /* full slice, return original string */
7344 Py_INCREF(self);
7345 return (PyObject*) self;
7346 }
7347 if (start > end)
7348 start = end;
7349 /* copy slice */
7350 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007351 end - start);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007352}
7353
7354PyObject *PyUnicode_Split(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007355 PyObject *sep,
7356 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007357{
7358 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007359
Guido van Rossumd57fd912000-03-10 22:53:23 +00007360 s = PyUnicode_FromObject(s);
7361 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007362 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007363 if (sep != NULL) {
7364 sep = PyUnicode_FromObject(sep);
7365 if (sep == NULL) {
7366 Py_DECREF(s);
7367 return NULL;
7368 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007369 }
7370
7371 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7372
7373 Py_DECREF(s);
7374 Py_XDECREF(sep);
7375 return result;
7376}
7377
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007378PyDoc_STRVAR(split__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007379 "S.split([sep [,maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007380\n\
7381Return a list of the words in S, using sep as the\n\
7382delimiter string. If maxsplit is given, at most maxsplit\n\
Georg Brandldfb77db2008-05-11 09:11:40 +00007383splits are done. If sep is not specified or is None, any\n\
Georg Brandlecbbd942008-05-11 20:53:55 +00007384whitespace string is a separator and empty strings are\n\
7385removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007386
7387static PyObject*
7388unicode_split(PyUnicodeObject *self, PyObject *args)
7389{
7390 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007391 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007392
Martin v. Löwis18e16552006-02-15 17:27:45 +00007393 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007394 return NULL;
7395
7396 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007397 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007398 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007399 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007400 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007401 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007402}
7403
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007404PyObject *
7405PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7406{
7407 PyObject* str_obj;
7408 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007409 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00007410
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007411 str_obj = PyUnicode_FromObject(str_in);
7412 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007413 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007414 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00007415 if (!sep_obj) {
7416 Py_DECREF(str_obj);
7417 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007418 }
7419
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007420 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00007421 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7422 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7423 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007424
Fredrik Lundhb9479482006-05-26 17:22:38 +00007425 Py_DECREF(sep_obj);
7426 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007427
7428 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007429}
7430
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007431
7432PyObject *
7433PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7434{
7435 PyObject* str_obj;
7436 PyObject* sep_obj;
7437 PyObject* out;
7438
7439 str_obj = PyUnicode_FromObject(str_in);
7440 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007441 return NULL;
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007442 sep_obj = PyUnicode_FromObject(sep_in);
7443 if (!sep_obj) {
7444 Py_DECREF(str_obj);
7445 return NULL;
7446 }
7447
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007448 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007449 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7450 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7451 );
7452
7453 Py_DECREF(sep_obj);
7454 Py_DECREF(str_obj);
7455
7456 return out;
7457}
7458
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007459PyDoc_STRVAR(partition__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007460 "S.partition(sep) -> (head, sep, tail)\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007461\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007462Search for the separator sep in S, and return the part before it,\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007463the separator itself, and the part after it. If the separator is not\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007464found, return S and two empty strings.");
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007465
7466static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00007467unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007468{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007469 return PyUnicode_Partition((PyObject *)self, separator);
7470}
7471
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007472PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti1fafaab2010-01-25 11:24:37 +00007473 "S.rpartition(sep) -> (head, sep, tail)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007474\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007475Search for the separator sep in S, starting at the end of S, and return\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007476the part before it, the separator itself, and the part after it. If the\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007477separator is not found, return two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007478
7479static PyObject*
7480unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7481{
7482 return PyUnicode_RPartition((PyObject *)self, separator);
7483}
7484
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007485PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007486 PyObject *sep,
7487 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007488{
7489 PyObject *result;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007490
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007491 s = PyUnicode_FromObject(s);
7492 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007493 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007494 if (sep != NULL) {
7495 sep = PyUnicode_FromObject(sep);
7496 if (sep == NULL) {
7497 Py_DECREF(s);
7498 return NULL;
7499 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007500 }
7501
7502 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7503
7504 Py_DECREF(s);
7505 Py_XDECREF(sep);
7506 return result;
7507}
7508
7509PyDoc_STRVAR(rsplit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007510 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007511\n\
7512Return a list of the words in S, using sep as the\n\
7513delimiter string, starting at the end of the string and\n\
7514working to the front. If maxsplit is given, at most maxsplit\n\
7515splits are done. If sep is not specified, any whitespace string\n\
7516is a separator.");
7517
7518static PyObject*
7519unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7520{
7521 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007522 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007523
Martin v. Löwis18e16552006-02-15 17:27:45 +00007524 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007525 return NULL;
7526
7527 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007528 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007529 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007530 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007531 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007532 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007533}
7534
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007535PyDoc_STRVAR(splitlines__doc__,
Raymond Hettingeraad5b022012-06-02 01:42:58 -04007536 "S.splitlines(keepends=False) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007537\n\
7538Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007539Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007540is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007541
7542static PyObject*
7543unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7544{
Guido van Rossum86662912000-04-11 15:38:46 +00007545 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007546
Guido van Rossum86662912000-04-11 15:38:46 +00007547 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007548 return NULL;
7549
Guido van Rossum86662912000-04-11 15:38:46 +00007550 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007551}
7552
7553static
7554PyObject *unicode_str(PyUnicodeObject *self)
7555{
Fred Drakee4315f52000-05-09 19:53:39 +00007556 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007557}
7558
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007559PyDoc_STRVAR(swapcase__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007560 "S.swapcase() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007561\n\
7562Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007563and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007564
7565static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007566unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007567{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007568 return fixup(self, fixswapcase);
7569}
7570
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007571PyDoc_STRVAR(translate__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007572 "S.translate(table) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007573\n\
7574Return a copy of the string S, where all characters have been mapped\n\
7575through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007576Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7577Unmapped characters are left untouched. Characters mapped to None\n\
7578are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007579
7580static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007581unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007582{
Tim Petersced69f82003-09-16 20:30:58 +00007583 return PyUnicode_TranslateCharmap(self->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007584 self->length,
7585 table,
7586 "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007587}
7588
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007589PyDoc_STRVAR(upper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007590 "S.upper() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007591\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007592Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007593
7594static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007595unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007596{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007597 return fixup(self, fixupper);
7598}
7599
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007600PyDoc_STRVAR(zfill__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007601 "S.zfill(width) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007602\n\
Georg Brandl98064072008-09-09 19:26:00 +00007603Pad a numeric string S with zeros on the left, to fill a field\n\
7604of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007605
7606static PyObject *
7607unicode_zfill(PyUnicodeObject *self, PyObject *args)
7608{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007609 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007610 PyUnicodeObject *u;
7611
Martin v. Löwis18e16552006-02-15 17:27:45 +00007612 Py_ssize_t width;
7613 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007614 return NULL;
7615
7616 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007617 if (PyUnicode_CheckExact(self)) {
7618 Py_INCREF(self);
7619 return (PyObject*) self;
7620 }
7621 else
7622 return PyUnicode_FromUnicode(
7623 PyUnicode_AS_UNICODE(self),
7624 PyUnicode_GET_SIZE(self)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007625 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007626 }
7627
7628 fill = width - self->length;
7629
7630 u = pad(self, fill, 0, '0');
7631
Walter Dörwald068325e2002-04-15 13:36:47 +00007632 if (u == NULL)
7633 return NULL;
7634
Guido van Rossumd57fd912000-03-10 22:53:23 +00007635 if (u->str[fill] == '+' || u->str[fill] == '-') {
7636 /* move sign to beginning of string */
7637 u->str[0] = u->str[fill];
7638 u->str[fill] = '0';
7639 }
7640
7641 return (PyObject*) u;
7642}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007643
#if 0
/* Compiled out.  Returns the value of numfree (defined elsewhere in this
   file) as a Python int. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyInt_FromLong(count);
}
#endif
7651
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007652PyDoc_STRVAR(startswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007653 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007654\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007655Return True if S starts with the specified prefix, False otherwise.\n\
7656With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007657With optional end, stop comparing S at that position.\n\
7658prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007659
7660static PyObject *
7661unicode_startswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007662 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007663{
Georg Brandl24250812006-06-09 18:45:48 +00007664 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007665 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007666 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007667 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007668 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007669
Jesus Cea44e81682011-04-20 16:39:15 +02007670 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007671 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007672 if (PyTuple_Check(subobj)) {
7673 Py_ssize_t i;
7674 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7675 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007676 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007677 if (substring == NULL)
7678 return NULL;
7679 result = tailmatch(self, substring, start, end, -1);
7680 Py_DECREF(substring);
7681 if (result) {
7682 Py_RETURN_TRUE;
7683 }
7684 }
7685 /* nothing matched */
7686 Py_RETURN_FALSE;
7687 }
7688 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottie3685f62011-04-26 05:12:51 +03007689 if (substring == NULL) {
7690 if (PyErr_ExceptionMatches(PyExc_TypeError))
7691 PyErr_Format(PyExc_TypeError, "startswith first arg must be str, "
7692 "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007693 return NULL;
Ezio Melottie3685f62011-04-26 05:12:51 +03007694 }
Georg Brandl24250812006-06-09 18:45:48 +00007695 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007696 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007697 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007698}
7699
7700
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007701PyDoc_STRVAR(endswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007702 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007703\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007704Return True if S ends with the specified suffix, False otherwise.\n\
7705With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007706With optional end, stop comparing S at that position.\n\
7707suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007708
7709static PyObject *
7710unicode_endswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007711 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007712{
Georg Brandl24250812006-06-09 18:45:48 +00007713 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007714 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007715 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007716 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007717 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007718
Jesus Cea44e81682011-04-20 16:39:15 +02007719 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007720 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007721 if (PyTuple_Check(subobj)) {
7722 Py_ssize_t i;
7723 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7724 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007725 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007726 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007727 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007728 result = tailmatch(self, substring, start, end, +1);
7729 Py_DECREF(substring);
7730 if (result) {
7731 Py_RETURN_TRUE;
7732 }
7733 }
7734 Py_RETURN_FALSE;
7735 }
7736 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottie3685f62011-04-26 05:12:51 +03007737 if (substring == NULL) {
7738 if (PyErr_ExceptionMatches(PyExc_TypeError))
7739 PyErr_Format(PyExc_TypeError, "endswith first arg must be str, "
7740 "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007741 return NULL;
Ezio Melottie3685f62011-04-26 05:12:51 +03007742 }
Georg Brandl24250812006-06-09 18:45:48 +00007743 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007744 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007745 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007746}
7747
7748
Eric Smitha9f7d622008-02-17 19:46:49 +00007749/* Implements do_string_format, which is unicode because of stringlib */
7750#include "stringlib/string_format.h"
7751
7752PyDoc_STRVAR(format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007753 "S.format(*args, **kwargs) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007754\n\
Eric Smith6c840852010-11-06 19:43:44 +00007755Return a formatted version of S, using substitutions from args and kwargs.\n\
7756The substitutions are identified by braces ('{' and '}').");
Eric Smitha9f7d622008-02-17 19:46:49 +00007757
Eric Smithdc13b792008-05-30 18:10:04 +00007758static PyObject *
7759unicode__format__(PyObject *self, PyObject *args)
7760{
7761 PyObject *format_spec;
7762 PyObject *result = NULL;
7763 PyObject *tmp = NULL;
7764
7765 /* If 2.x, convert format_spec to the same type as value */
7766 /* This is to allow things like u''.format('') */
7767 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7768 goto done;
7769 if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7770 PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007771 "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
Eric Smithdc13b792008-05-30 18:10:04 +00007772 goto done;
7773 }
7774 tmp = PyObject_Unicode(format_spec);
7775 if (tmp == NULL)
7776 goto done;
7777 format_spec = tmp;
7778
7779 result = _PyUnicode_FormatAdvanced(self,
7780 PyUnicode_AS_UNICODE(format_spec),
7781 PyUnicode_GET_SIZE(format_spec));
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007782 done:
Eric Smithdc13b792008-05-30 18:10:04 +00007783 Py_XDECREF(tmp);
7784 return result;
7785}
7786
Eric Smitha9f7d622008-02-17 19:46:49 +00007787PyDoc_STRVAR(p_format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007788 "S.__format__(format_spec) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007789\n\
Eric Smith6c840852010-11-06 19:43:44 +00007790Return a formatted version of S as described by format_spec.");
Eric Smitha9f7d622008-02-17 19:46:49 +00007791
Robert Schuppenies901c9972008-06-10 10:10:31 +00007792static PyObject *
7793unicode__sizeof__(PyUnicodeObject *v)
7794{
Robert Schuppenies9be2ec12008-07-10 15:24:04 +00007795 return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7796 sizeof(Py_UNICODE) * (v->length + 1));
Robert Schuppenies901c9972008-06-10 10:10:31 +00007797}
7798
7799PyDoc_STRVAR(sizeof__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007800 "S.__sizeof__() -> size of S in memory, in bytes\n\
Robert Schuppenies901c9972008-06-10 10:10:31 +00007801\n\
7802");
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007803
7804static PyObject *
7805unicode_getnewargs(PyUnicodeObject *v)
7806{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007807 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007808}
7809
7810
Guido van Rossumd57fd912000-03-10 22:53:23 +00007811static PyMethodDef unicode_methods[] = {
Benjamin Peterson332d7212009-09-18 21:14:55 +00007812 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007813 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7814 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007815 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007816 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7817 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7818 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7819 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7820 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7821 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7822 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00007823 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007824 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7825 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7826 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007827 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Benjamin Peterson332d7212009-09-18 21:14:55 +00007828 {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007829/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7830 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7831 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7832 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007833 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007834 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007835 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007836 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007837 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7838 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7839 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7840 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7841 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7842 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7843 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7844 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7845 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7846 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7847 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7848 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7849 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7850 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007851 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smitha9f7d622008-02-17 19:46:49 +00007852 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
7853 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
7854 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
7855 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Robert Schuppenies901c9972008-06-10 10:10:31 +00007856 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007857#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007858 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007859#endif
7860
7861#if 0
7862 /* This one is just used for debugging the implementation. */
Christian Heimes5b970ad2008-02-06 13:33:44 +00007863 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007864#endif
7865
Benjamin Peterson857ce152009-01-31 16:29:18 +00007866 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007867 {NULL, NULL}
7868};
7869
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007870static PyObject *
7871unicode_mod(PyObject *v, PyObject *w)
7872{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007873 if (!PyUnicode_Check(v)) {
7874 Py_INCREF(Py_NotImplemented);
7875 return Py_NotImplemented;
7876 }
7877 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007878}
7879
7880static PyNumberMethods unicode_as_number = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007881 0, /*nb_add*/
7882 0, /*nb_subtract*/
7883 0, /*nb_multiply*/
7884 0, /*nb_divide*/
7885 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007886};
7887
Guido van Rossumd57fd912000-03-10 22:53:23 +00007888static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007889 (lenfunc) unicode_length, /* sq_length */
7890 PyUnicode_Concat, /* sq_concat */
7891 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7892 (ssizeargfunc) unicode_getitem, /* sq_item */
7893 (ssizessizeargfunc) unicode_slice, /* sq_slice */
7894 0, /* sq_ass_item */
7895 0, /* sq_ass_slice */
7896 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007897};
7898
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007899static PyObject*
7900unicode_subscript(PyUnicodeObject* self, PyObject* item)
7901{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00007902 if (PyIndex_Check(item)) {
7903 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007904 if (i == -1 && PyErr_Occurred())
7905 return NULL;
7906 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007907 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007908 return unicode_getitem(self, i);
7909 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007910 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007911 Py_UNICODE* source_buf;
7912 Py_UNICODE* result_buf;
7913 PyObject* result;
7914
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007915 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007916 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007917 return NULL;
7918 }
7919
7920 if (slicelength <= 0) {
7921 return PyUnicode_FromUnicode(NULL, 0);
Thomas Wouters3ccec682007-08-28 15:28:19 +00007922 } else if (start == 0 && step == 1 && slicelength == self->length &&
7923 PyUnicode_CheckExact(self)) {
7924 Py_INCREF(self);
7925 return (PyObject *)self;
7926 } else if (step == 1) {
7927 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007928 } else {
7929 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Neal Norwitz419fd492008-03-17 20:22:43 +00007930 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
7931 sizeof(Py_UNICODE));
Benjamin Peterson857ce152009-01-31 16:29:18 +00007932
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007933 if (result_buf == NULL)
7934 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007935
7936 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7937 result_buf[i] = source_buf[cur];
7938 }
Tim Petersced69f82003-09-16 20:30:58 +00007939
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007940 result = PyUnicode_FromUnicode(result_buf, slicelength);
Neal Norwitz419fd492008-03-17 20:22:43 +00007941 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007942 return result;
7943 }
7944 } else {
7945 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7946 return NULL;
7947 }
7948}
7949
7950static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007951 (lenfunc)unicode_length, /* mp_length */
7952 (binaryfunc)unicode_subscript, /* mp_subscript */
7953 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007954};
7955
Martin v. Löwis18e16552006-02-15 17:27:45 +00007956static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007957unicode_buffer_getreadbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007958 Py_ssize_t index,
7959 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007960{
7961 if (index != 0) {
7962 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007963 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007964 return -1;
7965 }
7966 *ptr = (void *) self->str;
7967 return PyUnicode_GET_DATA_SIZE(self);
7968}
7969
Martin v. Löwis18e16552006-02-15 17:27:45 +00007970static Py_ssize_t
7971unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007972 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007973{
7974 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007975 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007976 return -1;
7977}
7978
7979static int
7980unicode_buffer_getsegcount(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007981 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007982{
7983 if (lenp)
7984 *lenp = PyUnicode_GET_DATA_SIZE(self);
7985 return 1;
7986}
7987
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007988static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007989unicode_buffer_getcharbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007990 Py_ssize_t index,
7991 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007992{
7993 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007994
Guido van Rossumd57fd912000-03-10 22:53:23 +00007995 if (index != 0) {
7996 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007997 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007998 return -1;
7999 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008000 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008001 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008002 return -1;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008003 *ptr = (void *) PyString_AS_STRING(str);
8004 return PyString_GET_SIZE(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008005}
8006
8007/* Helpers for PyUnicode_Format() */
8008
8009static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008010getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008011{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008012 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008013 if (argidx < arglen) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008014 (*p_argidx)++;
8015 if (arglen < 0)
8016 return args;
8017 else
8018 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008019 }
8020 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008021 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008022 return NULL;
8023}
8024
/* Bit flags collected while parsing a %-format specifier; the character
   noted beside each is the one that sets it in PyUnicode_Format(). */
#define F_LJUST (1 << 0)    /* '-' */
#define F_SIGN  (1 << 1)    /* '+' */
#define F_BLANK (1 << 2)    /* ' ' */
#define F_ALT   (1 << 3)    /* '#' */
#define F_ZERO  (1 << 4)    /* '0' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008030
Martin v. Löwis18e16552006-02-15 17:27:45 +00008031static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008032strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008033{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008034 register Py_ssize_t i;
8035 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008036 for (i = len - 1; i >= 0; i--)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008037 buffer[i] = (Py_UNICODE) charbuffer[i];
Guido van Rossumd57fd912000-03-10 22:53:23 +00008038
Guido van Rossumd57fd912000-03-10 22:53:23 +00008039 return len;
8040}
8041
Neal Norwitzfc76d632006-01-10 06:03:13 +00008042static int
Neal Norwitzfc76d632006-01-10 06:03:13 +00008043longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8044{
Tim Peters15231542006-02-16 01:08:01 +00008045 Py_ssize_t result;
8046
Neal Norwitzfc76d632006-01-10 06:03:13 +00008047 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008048 result = strtounicode(buffer, (char *)buffer);
8049 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008050}
8051
Guido van Rossum078151d2002-08-11 04:24:12 +00008052/* XXX To save some code duplication, formatfloat/long/int could have been
8053 shared with stringobject.c, converting from 8-bit to Unicode after the
8054 formatting is done. */
8055
Mark Dickinson18cfada2009-11-23 18:46:41 +00008056/* Returns a new reference to a PyUnicode object, or NULL on failure. */
8057
8058static PyObject *
8059formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008060{
Mark Dickinson18cfada2009-11-23 18:46:41 +00008061 char *p;
8062 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008063 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008064
Guido van Rossumd57fd912000-03-10 22:53:23 +00008065 x = PyFloat_AsDouble(v);
8066 if (x == -1.0 && PyErr_Occurred())
Mark Dickinson18cfada2009-11-23 18:46:41 +00008067 return NULL;
8068
Guido van Rossumd57fd912000-03-10 22:53:23 +00008069 if (prec < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008070 prec = 6;
Mark Dickinsond4814bf2009-03-29 16:24:29 +00008071
Mark Dickinson18cfada2009-11-23 18:46:41 +00008072 p = PyOS_double_to_string(x, type, prec,
8073 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
8074 if (p == NULL)
8075 return NULL;
8076 result = PyUnicode_FromStringAndSize(p, strlen(p));
8077 PyMem_Free(p);
8078 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008079}
8080
Tim Peters38fd5b62000-09-21 05:43:11 +00008081static PyObject*
8082formatlong(PyObject *val, int flags, int prec, int type)
8083{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008084 char *buf;
8085 int i, len;
8086 PyObject *str; /* temporary string object. */
8087 PyUnicodeObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008088
Benjamin Peterson857ce152009-01-31 16:29:18 +00008089 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8090 if (!str)
8091 return NULL;
8092 result = _PyUnicode_New(len);
8093 if (!result) {
8094 Py_DECREF(str);
8095 return NULL;
8096 }
8097 for (i = 0; i < len; i++)
8098 result->str[i] = buf[i];
8099 result->str[len] = 0;
8100 Py_DECREF(str);
8101 return (PyObject*)result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008102}
8103
Guido van Rossumd57fd912000-03-10 22:53:23 +00008104static int
8105formatint(Py_UNICODE *buf,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008106 size_t buflen,
8107 int flags,
8108 int prec,
8109 int type,
8110 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008111{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008112 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008113 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8114 * + 1 + 1
8115 * = 24
8116 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008117 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008118 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008119 long x;
8120
8121 x = PyInt_AsLong(v);
8122 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008123 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008124 if (x < 0 && type == 'u') {
8125 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008126 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008127 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8128 sign = "-";
8129 else
8130 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008131 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008132 prec = 1;
8133
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008134 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8135 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008136 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008137 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008138 PyErr_SetString(PyExc_OverflowError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008139 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008140 return -1;
8141 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008142
8143 if ((flags & F_ALT) &&
8144 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00008145 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008146 * of issues that cause pain:
8147 * - when 0 is being converted, the C standard leaves off
8148 * the '0x' or '0X', which is inconsistent with other
8149 * %#x/%#X conversions and inconsistent with Python's
8150 * hex() function
8151 * - there are platforms that violate the standard and
8152 * convert 0 with the '0x' or '0X'
8153 * (Metrowerks, Compaq Tru64)
8154 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008155 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008156 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008157 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008158 * We can achieve the desired consistency by inserting our
8159 * own '0x' or '0X' prefix, and substituting %x/%X in place
8160 * of %#x/%#X.
8161 *
8162 * Note that this is the same approach as used in
8163 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008164 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008165 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8166 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008167 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008168 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008169 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8170 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008171 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008172 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008173 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008174 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008175 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008176 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008177}
8178
8179static int
8180formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008181 size_t buflen,
8182 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008183{
Ezio Melotti32125152010-02-25 17:36:04 +00008184 PyObject *unistr;
8185 char *str;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008186 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008187 if (PyUnicode_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008188 if (PyUnicode_GET_SIZE(v) != 1)
8189 goto onError;
8190 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008191 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008192
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008193 else if (PyString_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008194 if (PyString_GET_SIZE(v) != 1)
8195 goto onError;
Ezio Melotti32125152010-02-25 17:36:04 +00008196 /* #7649: "u'%c' % char" should behave like "u'%s' % char" and fail
8197 with a UnicodeDecodeError if 'char' is not decodable with the
8198 default encoding (usually ASCII, but it might be something else) */
8199 str = PyString_AS_STRING(v);
8200 if ((unsigned char)str[0] > 0x7F) {
8201 /* the char is not ASCII; try to decode the string using the
8202 default encoding and return -1 to let the UnicodeDecodeError
8203 be raised if the string can't be decoded */
8204 unistr = PyUnicode_Decode(str, 1, NULL, "strict");
8205 if (unistr == NULL)
8206 return -1;
8207 buf[0] = PyUnicode_AS_UNICODE(unistr)[0];
8208 Py_DECREF(unistr);
8209 }
8210 else
8211 buf[0] = (Py_UNICODE)str[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008212 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008213
8214 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008215 /* Integer input truncated to a character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008216 long x;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008217 x = PyInt_AsLong(v);
8218 if (x == -1 && PyErr_Occurred())
8219 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008220#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008221 if (x < 0 || x > 0x10ffff) {
8222 PyErr_SetString(PyExc_OverflowError,
8223 "%c arg not in range(0x110000) "
8224 "(wide Python build)");
8225 return -1;
8226 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008227#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008228 if (x < 0 || x > 0xffff) {
8229 PyErr_SetString(PyExc_OverflowError,
8230 "%c arg not in range(0x10000) "
8231 "(narrow Python build)");
8232 return -1;
8233 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008234#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008235 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008236 }
8237 buf[1] = '\0';
8238 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008239
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008240 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008241 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008242 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008243 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008244}
8245
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the ints &
   chars are formatted. XXX This is a magic number. Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format. For now, the current solution is sufficient.

   The replacement list is fully parenthesized so the macro behaves as a
   single primary expression wherever it is used (for example as the
   operand of sizeof).
*/
#define FORMATBUFLEN ((size_t)120)
8255
Guido van Rossumd57fd912000-03-10 22:53:23 +00008256PyObject *PyUnicode_Format(PyObject *format,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008257 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008258{
8259 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008260 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008261 int args_owned = 0;
8262 PyUnicodeObject *result = NULL;
8263 PyObject *dict = NULL;
8264 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008265
Guido van Rossumd57fd912000-03-10 22:53:23 +00008266 if (format == NULL || args == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008267 PyErr_BadInternalCall();
8268 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008269 }
8270 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008271 if (uformat == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008272 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008273 fmt = PyUnicode_AS_UNICODE(uformat);
8274 fmtcnt = PyUnicode_GET_SIZE(uformat);
8275
8276 reslen = rescnt = fmtcnt + 100;
8277 result = _PyUnicode_New(reslen);
8278 if (result == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008279 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008280 res = PyUnicode_AS_UNICODE(result);
8281
8282 if (PyTuple_Check(args)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008283 arglen = PyTuple_Size(args);
8284 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008285 }
8286 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008287 arglen = -1;
8288 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008289 }
Benjamin Peterson23d49d32012-08-28 17:55:35 -04008290 if (PyMapping_Check(args) && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008291 !PyObject_TypeCheck(args, &PyBaseString_Type))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008292 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008293
8294 while (--fmtcnt >= 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008295 if (*fmt != '%') {
8296 if (--rescnt < 0) {
8297 rescnt = fmtcnt + 100;
8298 reslen += rescnt;
8299 if (_PyUnicode_Resize(&result, reslen) < 0)
8300 goto onError;
8301 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8302 --rescnt;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008303 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008304 *res++ = *fmt++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008305 }
8306 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008307 /* Got a format specifier */
8308 int flags = 0;
8309 Py_ssize_t width = -1;
8310 int prec = -1;
8311 Py_UNICODE c = '\0';
8312 Py_UNICODE fill;
8313 int isnumok;
8314 PyObject *v = NULL;
8315 PyObject *temp = NULL;
8316 Py_UNICODE *pbuf;
8317 Py_UNICODE sign;
8318 Py_ssize_t len;
Mark Dickinson18cfada2009-11-23 18:46:41 +00008319 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008320
8321 fmt++;
8322 if (*fmt == '(') {
8323 Py_UNICODE *keystart;
8324 Py_ssize_t keylen;
8325 PyObject *key;
8326 int pcount = 1;
8327
8328 if (dict == NULL) {
8329 PyErr_SetString(PyExc_TypeError,
8330 "format requires a mapping");
8331 goto onError;
8332 }
8333 ++fmt;
8334 --fmtcnt;
8335 keystart = fmt;
8336 /* Skip over balanced parentheses */
8337 while (pcount > 0 && --fmtcnt >= 0) {
8338 if (*fmt == ')')
8339 --pcount;
8340 else if (*fmt == '(')
8341 ++pcount;
8342 fmt++;
8343 }
8344 keylen = fmt - keystart - 1;
8345 if (fmtcnt < 0 || pcount > 0) {
8346 PyErr_SetString(PyExc_ValueError,
8347 "incomplete format key");
8348 goto onError;
8349 }
8350#if 0
8351 /* keys are converted to strings using UTF-8 and
8352 then looked up since Python uses strings to hold
8353 variables names etc. in its namespaces and we
8354 wouldn't want to break common idioms. */
8355 key = PyUnicode_EncodeUTF8(keystart,
8356 keylen,
8357 NULL);
8358#else
8359 key = PyUnicode_FromUnicode(keystart, keylen);
8360#endif
8361 if (key == NULL)
8362 goto onError;
8363 if (args_owned) {
8364 Py_DECREF(args);
8365 args_owned = 0;
8366 }
8367 args = PyObject_GetItem(dict, key);
8368 Py_DECREF(key);
8369 if (args == NULL) {
8370 goto onError;
8371 }
8372 args_owned = 1;
8373 arglen = -1;
8374 argidx = -2;
8375 }
8376 while (--fmtcnt >= 0) {
8377 switch (c = *fmt++) {
8378 case '-': flags |= F_LJUST; continue;
8379 case '+': flags |= F_SIGN; continue;
8380 case ' ': flags |= F_BLANK; continue;
8381 case '#': flags |= F_ALT; continue;
8382 case '0': flags |= F_ZERO; continue;
8383 }
8384 break;
8385 }
8386 if (c == '*') {
8387 v = getnextarg(args, arglen, &argidx);
8388 if (v == NULL)
8389 goto onError;
8390 if (!PyInt_Check(v)) {
8391 PyErr_SetString(PyExc_TypeError,
8392 "* wants int");
8393 goto onError;
8394 }
Serhiy Storchaka74f49ab2013-01-19 12:55:39 +02008395 width = PyInt_AsSsize_t(v);
8396 if (width == -1 && PyErr_Occurred())
8397 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008398 if (width < 0) {
8399 flags |= F_LJUST;
8400 width = -width;
8401 }
8402 if (--fmtcnt >= 0)
8403 c = *fmt++;
8404 }
8405 else if (c >= '0' && c <= '9') {
8406 width = c - '0';
8407 while (--fmtcnt >= 0) {
8408 c = *fmt++;
8409 if (c < '0' || c > '9')
8410 break;
Mark Dickinson75d36002012-10-28 10:00:46 +00008411 if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008412 PyErr_SetString(PyExc_ValueError,
8413 "width too big");
8414 goto onError;
8415 }
8416 width = width*10 + (c - '0');
8417 }
8418 }
8419 if (c == '.') {
8420 prec = 0;
8421 if (--fmtcnt >= 0)
8422 c = *fmt++;
8423 if (c == '*') {
8424 v = getnextarg(args, arglen, &argidx);
8425 if (v == NULL)
8426 goto onError;
8427 if (!PyInt_Check(v)) {
8428 PyErr_SetString(PyExc_TypeError,
8429 "* wants int");
8430 goto onError;
8431 }
Serhiy Storchaka74f49ab2013-01-19 12:55:39 +02008432 prec = _PyInt_AsInt(v);
8433 if (prec == -1 && PyErr_Occurred())
8434 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008435 if (prec < 0)
8436 prec = 0;
8437 if (--fmtcnt >= 0)
8438 c = *fmt++;
8439 }
8440 else if (c >= '0' && c <= '9') {
8441 prec = c - '0';
8442 while (--fmtcnt >= 0) {
Stefan Krah0b9201f2010-07-19 18:06:46 +00008443 c = *fmt++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008444 if (c < '0' || c > '9')
8445 break;
Mark Dickinson75d36002012-10-28 10:00:46 +00008446 if (prec > (INT_MAX - ((int)c - '0')) / 10) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008447 PyErr_SetString(PyExc_ValueError,
8448 "prec too big");
8449 goto onError;
8450 }
8451 prec = prec*10 + (c - '0');
8452 }
8453 }
8454 } /* prec */
8455 if (fmtcnt >= 0) {
8456 if (c == 'h' || c == 'l' || c == 'L') {
8457 if (--fmtcnt >= 0)
8458 c = *fmt++;
8459 }
8460 }
8461 if (fmtcnt < 0) {
8462 PyErr_SetString(PyExc_ValueError,
8463 "incomplete format");
8464 goto onError;
8465 }
8466 if (c != '%') {
8467 v = getnextarg(args, arglen, &argidx);
8468 if (v == NULL)
8469 goto onError;
8470 }
8471 sign = 0;
8472 fill = ' ';
8473 switch (c) {
8474
8475 case '%':
8476 pbuf = formatbuf;
8477 /* presume that buffer length is at least 1 */
8478 pbuf[0] = '%';
8479 len = 1;
8480 break;
8481
8482 case 's':
8483 case 'r':
Victor Stinner95affc42010-03-22 12:24:37 +00008484 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008485 temp = v;
8486 Py_INCREF(temp);
8487 }
8488 else {
8489 PyObject *unicode;
8490 if (c == 's')
8491 temp = PyObject_Unicode(v);
8492 else
8493 temp = PyObject_Repr(v);
8494 if (temp == NULL)
8495 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008496 if (PyUnicode_Check(temp))
8497 /* nothing to do */;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008498 else if (PyString_Check(temp)) {
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008499 /* convert to string to Unicode */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008500 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8501 PyString_GET_SIZE(temp),
8502 NULL,
8503 "strict");
8504 Py_DECREF(temp);
8505 temp = unicode;
8506 if (temp == NULL)
8507 goto onError;
8508 }
8509 else {
8510 Py_DECREF(temp);
8511 PyErr_SetString(PyExc_TypeError,
8512 "%s argument has non-string str()");
8513 goto onError;
8514 }
8515 }
8516 pbuf = PyUnicode_AS_UNICODE(temp);
8517 len = PyUnicode_GET_SIZE(temp);
8518 if (prec >= 0 && len > prec)
8519 len = prec;
8520 break;
8521
8522 case 'i':
8523 case 'd':
8524 case 'u':
8525 case 'o':
8526 case 'x':
8527 case 'X':
8528 if (c == 'i')
8529 c = 'd';
8530 isnumok = 0;
8531 if (PyNumber_Check(v)) {
8532 PyObject *iobj=NULL;
8533
8534 if (PyInt_Check(v) || (PyLong_Check(v))) {
8535 iobj = v;
8536 Py_INCREF(iobj);
8537 }
8538 else {
8539 iobj = PyNumber_Int(v);
8540 if (iobj==NULL) iobj = PyNumber_Long(v);
8541 }
8542 if (iobj!=NULL) {
8543 if (PyInt_Check(iobj)) {
8544 isnumok = 1;
8545 pbuf = formatbuf;
8546 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8547 flags, prec, c, iobj);
8548 Py_DECREF(iobj);
8549 if (len < 0)
8550 goto onError;
8551 sign = 1;
8552 }
8553 else if (PyLong_Check(iobj)) {
8554 isnumok = 1;
8555 temp = formatlong(iobj, flags, prec, c);
8556 Py_DECREF(iobj);
8557 if (!temp)
8558 goto onError;
8559 pbuf = PyUnicode_AS_UNICODE(temp);
8560 len = PyUnicode_GET_SIZE(temp);
8561 sign = 1;
8562 }
8563 else {
8564 Py_DECREF(iobj);
8565 }
8566 }
8567 }
8568 if (!isnumok) {
8569 PyErr_Format(PyExc_TypeError,
8570 "%%%c format: a number is required, "
8571 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8572 goto onError;
8573 }
8574 if (flags & F_ZERO)
8575 fill = '0';
8576 break;
8577
8578 case 'e':
8579 case 'E':
8580 case 'f':
8581 case 'F':
8582 case 'g':
8583 case 'G':
Mark Dickinson18cfada2009-11-23 18:46:41 +00008584 temp = formatfloat(v, flags, prec, c);
8585 if (temp == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008586 goto onError;
Mark Dickinson18cfada2009-11-23 18:46:41 +00008587 pbuf = PyUnicode_AS_UNICODE(temp);
8588 len = PyUnicode_GET_SIZE(temp);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008589 sign = 1;
8590 if (flags & F_ZERO)
8591 fill = '0';
8592 break;
8593
8594 case 'c':
8595 pbuf = formatbuf;
8596 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8597 if (len < 0)
8598 goto onError;
8599 break;
8600
8601 default:
8602 PyErr_Format(PyExc_ValueError,
8603 "unsupported format character '%c' (0x%x) "
8604 "at index %zd",
8605 (31<=c && c<=126) ? (char)c : '?',
8606 (int)c,
8607 (Py_ssize_t)(fmt - 1 -
8608 PyUnicode_AS_UNICODE(uformat)));
Benjamin Peterson857ce152009-01-31 16:29:18 +00008609 goto onError;
8610 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008611 if (sign) {
8612 if (*pbuf == '-' || *pbuf == '+') {
8613 sign = *pbuf++;
8614 len--;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008615 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008616 else if (flags & F_SIGN)
8617 sign = '+';
8618 else if (flags & F_BLANK)
8619 sign = ' ';
8620 else
8621 sign = 0;
8622 }
8623 if (width < len)
8624 width = len;
8625 if (rescnt - (sign != 0) < width) {
8626 reslen -= rescnt;
8627 rescnt = width + fmtcnt + 100;
8628 reslen += rescnt;
8629 if (reslen < 0) {
8630 Py_XDECREF(temp);
8631 PyErr_NoMemory();
8632 goto onError;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008633 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008634 if (_PyUnicode_Resize(&result, reslen) < 0) {
8635 Py_XDECREF(temp);
8636 goto onError;
8637 }
8638 res = PyUnicode_AS_UNICODE(result)
8639 + reslen - rescnt;
8640 }
8641 if (sign) {
8642 if (fill != ' ')
8643 *res++ = sign;
8644 rescnt--;
8645 if (width > len)
8646 width--;
8647 }
8648 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8649 assert(pbuf[0] == '0');
8650 assert(pbuf[1] == c);
8651 if (fill != ' ') {
8652 *res++ = *pbuf++;
8653 *res++ = *pbuf++;
8654 }
8655 rescnt -= 2;
8656 width -= 2;
8657 if (width < 0)
8658 width = 0;
8659 len -= 2;
8660 }
8661 if (width > len && !(flags & F_LJUST)) {
8662 do {
8663 --rescnt;
8664 *res++ = fill;
8665 } while (--width > len);
8666 }
8667 if (fill == ' ') {
8668 if (sign)
8669 *res++ = sign;
8670 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8671 assert(pbuf[0] == '0');
8672 assert(pbuf[1] == c);
8673 *res++ = *pbuf++;
8674 *res++ = *pbuf++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008675 }
8676 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008677 Py_UNICODE_COPY(res, pbuf, len);
8678 res += len;
8679 rescnt -= len;
8680 while (--width >= len) {
8681 --rescnt;
8682 *res++ = ' ';
8683 }
8684 if (dict && (argidx < arglen) && c != '%') {
8685 PyErr_SetString(PyExc_TypeError,
8686 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008687 Py_XDECREF(temp);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008688 goto onError;
8689 }
8690 Py_XDECREF(temp);
8691 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008692 } /* until end */
8693 if (argidx < arglen && !dict) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008694 PyErr_SetString(PyExc_TypeError,
8695 "not all arguments converted during string formatting");
8696 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008697 }
8698
Thomas Woutersa96affe2006-03-12 00:29:36 +00008699 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008700 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008701 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008702 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008703 }
8704 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008705 return (PyObject *)result;
8706
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008707 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008708 Py_XDECREF(result);
8709 Py_DECREF(uformat);
8710 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008711 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008712 }
8713 return NULL;
8714}
8715
8716static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008717 (readbufferproc) unicode_buffer_getreadbuf,
8718 (writebufferproc) unicode_buffer_getwritebuf,
8719 (segcountproc) unicode_buffer_getsegcount,
8720 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008721};
8722
Jeremy Hylton938ace62002-07-17 16:30:39 +00008723static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008724unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8725
Tim Peters6d6c1a32001-08-02 04:15:00 +00008726static PyObject *
8727unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8728{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008729 PyObject *x = NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008730 static char *kwlist[] = {"string", "encoding", "errors", 0};
8731 char *encoding = NULL;
8732 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00008733
Benjamin Peterson857ce152009-01-31 16:29:18 +00008734 if (type != &PyUnicode_Type)
8735 return unicode_subtype_new(type, args, kwds);
8736 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008737 kwlist, &x, &encoding, &errors))
Benjamin Peterson857ce152009-01-31 16:29:18 +00008738 return NULL;
8739 if (x == NULL)
8740 return (PyObject *)_PyUnicode_New(0);
8741 if (encoding == NULL && errors == NULL)
8742 return PyObject_Unicode(x);
8743 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008744 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008745}
8746
Guido van Rossume023fe02001-08-30 03:12:59 +00008747static PyObject *
8748unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8749{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008750 PyUnicodeObject *tmp, *pnew;
8751 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008752
Benjamin Peterson857ce152009-01-31 16:29:18 +00008753 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8754 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8755 if (tmp == NULL)
8756 return NULL;
8757 assert(PyUnicode_Check(tmp));
8758 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8759 if (pnew == NULL) {
8760 Py_DECREF(tmp);
8761 return NULL;
8762 }
8763 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8764 if (pnew->str == NULL) {
8765 _Py_ForgetReference((PyObject *)pnew);
8766 PyObject_Del(pnew);
8767 Py_DECREF(tmp);
8768 return PyErr_NoMemory();
8769 }
8770 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8771 pnew->length = n;
8772 pnew->hash = tmp->hash;
8773 Py_DECREF(tmp);
8774 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008775}
8776
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008777PyDoc_STRVAR(unicode_doc,
Chris Jerdonekad4b0002012-10-07 20:37:54 -07008778 "unicode(object='') -> unicode object\n\
8779unicode(string[, encoding[, errors]]) -> unicode object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00008780\n\
8781Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008782encoding defaults to the current default string encoding.\n\
8783errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008784
Guido van Rossumd57fd912000-03-10 22:53:23 +00008785PyTypeObject PyUnicode_Type = {
Martin v. Löwis68192102007-07-21 06:55:02 +00008786 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00008787 "unicode", /* tp_name */
8788 sizeof(PyUnicodeObject), /* tp_size */
8789 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008790 /* Slots */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008791 (destructor)unicode_dealloc, /* tp_dealloc */
8792 0, /* tp_print */
8793 0, /* tp_getattr */
8794 0, /* tp_setattr */
8795 0, /* tp_compare */
8796 unicode_repr, /* tp_repr */
8797 &unicode_as_number, /* tp_as_number */
8798 &unicode_as_sequence, /* tp_as_sequence */
8799 &unicode_as_mapping, /* tp_as_mapping */
8800 (hashfunc) unicode_hash, /* tp_hash*/
8801 0, /* tp_call*/
8802 (reprfunc) unicode_str, /* tp_str */
8803 PyObject_GenericGetAttr, /* tp_getattro */
8804 0, /* tp_setattro */
8805 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008806 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008807 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008808 unicode_doc, /* tp_doc */
8809 0, /* tp_traverse */
8810 0, /* tp_clear */
8811 PyUnicode_RichCompare, /* tp_richcompare */
8812 0, /* tp_weaklistoffset */
8813 0, /* tp_iter */
8814 0, /* tp_iternext */
8815 unicode_methods, /* tp_methods */
8816 0, /* tp_members */
8817 0, /* tp_getset */
8818 &PyBaseString_Type, /* tp_base */
8819 0, /* tp_dict */
8820 0, /* tp_descr_get */
8821 0, /* tp_descr_set */
8822 0, /* tp_dictoffset */
8823 0, /* tp_init */
8824 0, /* tp_alloc */
8825 unicode_new, /* tp_new */
8826 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008827};
8828
8829/* Initialize the Unicode implementation */
8830
Thomas Wouters78890102000-07-22 19:25:51 +00008831void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008832{
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008833 /* XXX - move this array to unicodectype.c ? */
8834 Py_UNICODE linebreak[] = {
8835 0x000A, /* LINE FEED */
8836 0x000D, /* CARRIAGE RETURN */
8837 0x001C, /* FILE SEPARATOR */
8838 0x001D, /* GROUP SEPARATOR */
8839 0x001E, /* RECORD SEPARATOR */
8840 0x0085, /* NEXT LINE */
8841 0x2028, /* LINE SEPARATOR */
8842 0x2029, /* PARAGRAPH SEPARATOR */
8843 };
8844
Fred Drakee4315f52000-05-09 19:53:39 +00008845 /* Init the implementation */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02008846 if (!unicode_empty) {
8847 unicode_empty = _PyUnicode_New(0);
8848 if (!unicode_empty)
8849 return;
8850 }
Neal Norwitze1fdb322006-07-21 05:32:28 +00008851
Guido van Rossumcacfc072002-05-24 19:01:59 +00008852 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008853 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008854
8855 /* initialize the linebreak bloom filter */
8856 bloom_linebreak = make_bloom_mask(
8857 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8858 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00008859
8860 PyType_Ready(&EncodingMapType);
Benjamin Peterson6da3ed62012-10-30 23:21:10 -04008861
8862 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
8863 Py_FatalError("Can't initialize field name iterator type");
8864
8865 if (PyType_Ready(&PyFormatterIter_Type) < 0)
8866 Py_FatalError("Can't initialize formatter iter type");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008867}
8868
8869/* Finalize the Unicode implementation */
8870
Christian Heimes3b718a72008-02-14 12:47:33 +00008871int
8872PyUnicode_ClearFreeList(void)
8873{
8874 int freelist_size = numfree;
8875 PyUnicodeObject *u;
8876
8877 for (u = free_list; u != NULL;) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008878 PyUnicodeObject *v = u;
8879 u = *(PyUnicodeObject **)u;
8880 if (v->str)
8881 PyObject_DEL(v->str);
8882 Py_XDECREF(v->defenc);
8883 PyObject_Del(v);
8884 numfree--;
Christian Heimes3b718a72008-02-14 12:47:33 +00008885 }
8886 free_list = NULL;
8887 assert(numfree == 0);
8888 return freelist_size;
8889}
8890
Guido van Rossumd57fd912000-03-10 22:53:23 +00008891void
Thomas Wouters78890102000-07-22 19:25:51 +00008892_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008893{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008894 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008895
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02008896 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008897
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02008898 for (i = 0; i < 256; i++)
8899 Py_CLEAR(unicode_latin1[i]);
8900
Christian Heimes3b718a72008-02-14 12:47:33 +00008901 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00008902}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008903
Anthony Baxterac6bd462006-04-13 02:06:09 +00008904#ifdef __cplusplus
8905}
8906#endif