blob: 64a5ef557c05754d17fe1863c6251ce172d1c2ed [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson1c5d21d2009-01-31 22:33:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Guido van Rossumd57fd912000-03-10 22:53:23 +000052/* Limit for the Unicode object free list */
53
Christian Heimes5b970ad2008-02-06 13:33:44 +000054#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000055
/* Limit for the Unicode object free list stay-alive optimization.

   The implementation keeps the allocated character buffer intact for
   every object on the free list whose length is less than this limit.
   The limit is compared against the string length, i.e. it is counted
   in Py_UNICODE units, not in bytes.  This reduces malloc() overhead
   for small Unicode objects.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT * sizeof(Py_UNICODE) +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature!  If you get core dumps when
   using Unicode objects, turn this feature off.

*/
72
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
75/* Endianness switches; defaults to little endian */
76
77#ifdef WORDS_BIGENDIAN
78# define BYTEORDER_IS_BIG_ENDIAN
79#else
80# define BYTEORDER_IS_LITTLE_ENDIAN
81#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
Serhiy Storchakac59c85c2013-01-26 12:13:40 +020085NOTE: In the interpreter's initialization phase, some globals are currently
86 initialized dynamically as needed. In the process Unicode objects may
87 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000088
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Anthony Baxterac6bd462006-04-13 02:06:09 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Guido van Rossumd57fd912000-03-10 22:53:23 +000096/* Free list for Unicode objects */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +020097static PyUnicodeObject *free_list = NULL;
98static int numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000099
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000100/* The empty Unicode object is shared to improve performance. */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200101static PyUnicodeObject *unicode_empty = NULL;
102
103#define _Py_RETURN_UNICODE_EMPTY() \
104 do { \
105 if (unicode_empty != NULL) \
106 Py_INCREF(unicode_empty); \
107 else { \
108 unicode_empty = _PyUnicode_New(0); \
109 if (unicode_empty != NULL) \
110 Py_INCREF(unicode_empty); \
111 } \
112 return (PyObject *)unicode_empty; \
113 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000114
115/* Single character Unicode strings in the Latin-1 range are being
116 shared as well. */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200117static PyUnicodeObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000118
Fred Drakee4315f52000-05-09 19:53:39 +0000119/* Default encoding to use and assume when NULL is passed as encoding
120 parameter; it is initialized by _PyUnicode_Init().
121
122 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000123 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000124
125*/
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200126static char unicode_default_encoding[100 + 1] = "ascii";
Fred Drakee4315f52000-05-09 19:53:39 +0000127
/* Fast detection of the most frequent whitespace characters.
   Indexed by an ASCII code (0..127); an entry is 1 for a whitespace
   character and 0 for everything else. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F
     *   0x09 CHARACTER TABULATION, 0x0A LINE FEED, 0x0B LINE TABULATION,
     *   0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F
     *   0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
     *   0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F
     *   0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0
};
158
/* Same for linebreaks: indexed by an ASCII code (0..127), an entry is 1
   for a line-breaking character and 0 otherwise.  The table is only ever
   read, so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
186
187
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000188Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000189PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000190{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000191#ifdef Py_UNICODE_WIDE
Benjamin Peterson857ce152009-01-31 16:29:18 +0000192 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000193#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000194 /* This is actually an illegal character, so it should
195 not be passed to unichr. */
196 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000197#endif
198}
199
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000200/* --- Bloom Filters ----------------------------------------------------- */
201
202/* stuff to implement simple "bloom filters" for Unicode characters.
203 to keep things simple, we use a single bitmask, using the least 5
204 bits from each unicode characters as the bit index. */
205
206/* the linebreak mask is set up by Unicode_Init below */
207
Antoine Pitrou10042922010-01-13 14:01:26 +0000208#if LONG_BIT >= 128
209#define BLOOM_WIDTH 128
210#elif LONG_BIT >= 64
211#define BLOOM_WIDTH 64
212#elif LONG_BIT >= 32
213#define BLOOM_WIDTH 32
214#else
215#error "LONG_BIT is smaller than 32"
216#endif
217
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000218#define BLOOM_MASK unsigned long
219
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200220static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000221
Antoine Pitrou10042922010-01-13 14:01:26 +0000222#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
223#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000224
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000225#define BLOOM_LINEBREAK(ch) \
226 ((ch) < 128U ? ascii_linebreak[(ch)] : \
227 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000228
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000229Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000230{
231 /* calculate simple bloom-style bitmask for a given unicode string */
232
Antoine Pitrou10042922010-01-13 14:01:26 +0000233 BLOOM_MASK mask;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000234 Py_ssize_t i;
235
236 mask = 0;
237 for (i = 0; i < len; i++)
Antoine Pitrou64672132010-01-13 07:55:48 +0000238 BLOOM_ADD(mask, ptr[i]);
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000239
240 return mask;
241}
242
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000243Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000244{
245 Py_ssize_t i;
246
247 for (i = 0; i < setlen; i++)
248 if (set[i] == chr)
249 return 1;
250
Fredrik Lundh77633512006-05-23 19:47:35 +0000251 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000252}
253
/* True if `chr` is a member of `set` (of length `setlen`).  The bloom
   mask is tested first so that most non-members skip the linear search.
   The whole expansion is parenthesized so the macro behaves as a single
   operand next to !, || and ?:.  `chr` is evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                            \
    (BLOOM((mask), (chr)) && unicode_member((chr), (set), (setlen)))
256
Guido van Rossumd57fd912000-03-10 22:53:23 +0000257/* --- Unicode Object ----------------------------------------------------- */
258
259static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000260int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000261 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262{
263 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000264
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000265 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 if (unicode->length == length)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000267 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000268
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000269 /* Resizing shared object (unicode_empty or single character
270 objects) in-place is not allowed. Use PyUnicode_Resize()
271 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000272
Benjamin Peterson857ce152009-01-31 16:29:18 +0000273 if (unicode == unicode_empty ||
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000274 (unicode->length == 1 &&
275 unicode->str[0] < 256U &&
276 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000278 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279 return -1;
280 }
281
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000282 /* We allocate one more byte to make sure the string is Ux0000 terminated.
283 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000284 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000285 it contains). */
286
Guido van Rossumd57fd912000-03-10 22:53:23 +0000287 oldstr = unicode->str;
Neal Norwitz419fd492008-03-17 20:22:43 +0000288 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000289 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000291 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000292 PyErr_NoMemory();
293 return -1;
294 }
295 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000296 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000298 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000300 if (unicode->defenc) {
Georg Brandld070cc52010-08-01 21:06:46 +0000301 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000302 }
303 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000304
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305 return 0;
306}
307
308/* We allocate one more byte to make sure the string is
Georg Brandle27d0442010-08-01 20:54:30 +0000309 Ux0000 terminated; some code relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000310
311 XXX This allocator could further be enhanced by assuring that the
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000312 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000313
314*/
315
316static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000317PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000318{
319 register PyUnicodeObject *unicode;
320
Andrew Dalkee0df7622006-05-27 11:04:36 +0000321 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000322 if (length == 0 && unicode_empty != NULL) {
323 Py_INCREF(unicode_empty);
324 return unicode_empty;
325 }
326
Neal Norwitze7d8be82008-07-31 17:17:14 +0000327 /* Ensure we won't overflow the size. */
328 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
329 return (PyUnicodeObject *)PyErr_NoMemory();
330 }
331
Guido van Rossumd57fd912000-03-10 22:53:23 +0000332 /* Unicode freelist & memory allocation */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000333 if (free_list) {
334 unicode = free_list;
335 free_list = *(PyUnicodeObject **)unicode;
336 numfree--;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000337 if (unicode->str) {
338 /* Keep-Alive optimization: we only upsize the buffer,
339 never downsize it. */
340 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000341 unicode_resize(unicode, length) < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000342 PyObject_DEL(unicode->str);
343 unicode->str = NULL;
344 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000345 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000346 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000347 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
348 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000349 }
350 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000351 }
352 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000353 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000354 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000355 if (unicode == NULL)
356 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000357 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
358 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000359 }
360
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000361 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000362 PyErr_NoMemory();
363 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000364 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000365 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000366 * the caller fails before initializing str -- unicode_resize()
367 * reads str[0], and the Keep-Alive optimization can keep memory
368 * allocated for str alive across a call to unicode_dealloc(unicode).
369 * We don't want unicode_resize to read uninitialized memory in
370 * that case.
371 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000372 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000373 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000374 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000375 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000376 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000377 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000378
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000379 onError:
Amaury Forgeot d'Arc06847b12008-07-31 23:39:05 +0000380 /* XXX UNREF/NEWREF interface should be more symmetrical */
381 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000382 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000383 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000384 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000385}
386
387static
Guido van Rossum9475a232001-10-05 20:51:39 +0000388void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000390 if (PyUnicode_CheckExact(unicode) &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000391 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000392 /* Keep-Alive optimization */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000393 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
394 PyObject_DEL(unicode->str);
395 unicode->str = NULL;
396 unicode->length = 0;
397 }
398 if (unicode->defenc) {
Georg Brandld070cc52010-08-01 21:06:46 +0000399 Py_CLEAR(unicode->defenc);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000400 }
401 /* Add to free list */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000402 *(PyUnicodeObject **)unicode = free_list;
403 free_list = unicode;
404 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000405 }
406 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000407 PyObject_DEL(unicode->str);
408 Py_XDECREF(unicode->defenc);
409 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000410 }
411}
412
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000413static
414int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000415{
416 register PyUnicodeObject *v;
417
418 /* Argument checks */
419 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000420 PyErr_BadInternalCall();
421 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000422 }
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000423 v = *unicode;
Christian Heimese93237d2007-12-19 02:37:44 +0000424 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000425 PyErr_BadInternalCall();
426 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000427 }
428
429 /* Resizing unicode_empty and single character objects is not
430 possible since these are being shared. We simply return a fresh
431 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000432 if (v->length != length &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000433 (v == unicode_empty || v->length == 1)) {
434 PyUnicodeObject *w = _PyUnicode_New(length);
435 if (w == NULL)
436 return -1;
437 Py_UNICODE_COPY(w->str, v->str,
438 length < v->length ? length : v->length);
439 Py_DECREF(*unicode);
440 *unicode = w;
441 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000442 }
443
444 /* Note that we don't have to modify *unicode for unshared Unicode
445 objects, since we can modify them in-place. */
446 return unicode_resize(v, length);
447}
448
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000449int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
450{
451 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
452}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000453
Guido van Rossumd57fd912000-03-10 22:53:23 +0000454PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000455 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000456{
457 PyUnicodeObject *unicode;
458
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000459 /* If the Unicode data is known at construction time, we can apply
460 some optimizations which share commonly used objects. */
461 if (u != NULL) {
462
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000463 /* Optimization for empty strings */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200464 if (size == 0)
465 _Py_RETURN_UNICODE_EMPTY();
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000466
467 /* Single character Unicode objects in the Latin-1 range are
468 shared when using this constructor */
469 if (size == 1 && *u < 256) {
470 unicode = unicode_latin1[*u];
471 if (!unicode) {
472 unicode = _PyUnicode_New(1);
473 if (!unicode)
474 return NULL;
475 unicode->str[0] = *u;
476 unicode_latin1[*u] = unicode;
477 }
478 Py_INCREF(unicode);
479 return (PyObject *)unicode;
480 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000481 }
Tim Petersced69f82003-09-16 20:30:58 +0000482
Guido van Rossumd57fd912000-03-10 22:53:23 +0000483 unicode = _PyUnicode_New(size);
484 if (!unicode)
485 return NULL;
486
487 /* Copy the Unicode data into the new object */
488 if (u != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000489 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000490
491 return (PyObject *)unicode;
492}
493
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000494PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
495{
496 PyUnicodeObject *unicode;
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000497
Benjamin Peterson857ce152009-01-31 16:29:18 +0000498 if (size < 0) {
499 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000500 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson857ce152009-01-31 16:29:18 +0000501 return NULL;
502 }
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000503
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000504 /* If the Unicode data is known at construction time, we can apply
505 some optimizations which share commonly used objects.
506 Also, this means the input must be UTF-8, so fall back to the
507 UTF-8 decoder at the end. */
508 if (u != NULL) {
509
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000510 /* Optimization for empty strings */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200511 if (size == 0)
512 _Py_RETURN_UNICODE_EMPTY();
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000513
514 /* Single characters are shared when using this constructor.
515 Restrict to ASCII, since the input must be UTF-8. */
516 if (size == 1 && Py_CHARMASK(*u) < 128) {
517 unicode = unicode_latin1[Py_CHARMASK(*u)];
518 if (!unicode) {
519 unicode = _PyUnicode_New(1);
520 if (!unicode)
521 return NULL;
522 unicode->str[0] = Py_CHARMASK(*u);
523 unicode_latin1[Py_CHARMASK(*u)] = unicode;
524 }
525 Py_INCREF(unicode);
526 return (PyObject *)unicode;
527 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000528
529 return PyUnicode_DecodeUTF8(u, size, NULL);
530 }
531
532 unicode = _PyUnicode_New(size);
533 if (!unicode)
534 return NULL;
535
536 return (PyObject *)unicode;
537}
538
539PyObject *PyUnicode_FromString(const char *u)
540{
541 size_t size = strlen(u);
542 if (size > PY_SSIZE_T_MAX) {
543 PyErr_SetString(PyExc_OverflowError, "input too long");
544 return NULL;
545 }
546
547 return PyUnicode_FromStringAndSize(u, size);
548}
549
Guido van Rossumd57fd912000-03-10 22:53:23 +0000550#ifdef HAVE_WCHAR_H
551
Mark Dickinson6b265f12009-03-18 16:07:26 +0000552#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
553# define CONVERT_WCHAR_TO_SURROGATES
554#endif
555
556#ifdef CONVERT_WCHAR_TO_SURROGATES
557
558/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
559 to convert from UTF32 to UTF16. */
560
561PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
562 Py_ssize_t size)
563{
564 PyUnicodeObject *unicode;
565 register Py_ssize_t i;
566 Py_ssize_t alloc;
567 const wchar_t *orig_w;
568
569 if (w == NULL) {
570 PyErr_BadInternalCall();
571 return NULL;
572 }
573
574 alloc = size;
575 orig_w = w;
576 for (i = size; i > 0; i--) {
577 if (*w > 0xFFFF)
578 alloc++;
579 w++;
580 }
581 w = orig_w;
582 unicode = _PyUnicode_New(alloc);
583 if (!unicode)
584 return NULL;
585
586 /* Copy the wchar_t data into the new object */
587 {
588 register Py_UNICODE *u;
589 u = PyUnicode_AS_UNICODE(unicode);
590 for (i = size; i > 0; i--) {
591 if (*w > 0xFFFF) {
592 wchar_t ordinal = *w++;
593 ordinal -= 0x10000;
594 *u++ = 0xD800 | (ordinal >> 10);
595 *u++ = 0xDC00 | (ordinal & 0x3FF);
596 }
597 else
598 *u++ = *w++;
599 }
600 }
601 return (PyObject *)unicode;
602}
603
604#else
605
Guido van Rossumd57fd912000-03-10 22:53:23 +0000606PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000607 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000608{
609 PyUnicodeObject *unicode;
610
611 if (w == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000612 PyErr_BadInternalCall();
613 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000614 }
615
616 unicode = _PyUnicode_New(size);
617 if (!unicode)
618 return NULL;
619
620 /* Copy the wchar_t data into the new object */
621#ifdef HAVE_USABLE_WCHAR_T
622 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000623#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000624 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000625 register Py_UNICODE *u;
626 register Py_ssize_t i;
627 u = PyUnicode_AS_UNICODE(unicode);
628 for (i = size; i > 0; i--)
629 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000630 }
631#endif
632
633 return (PyObject *)unicode;
634}
635
Mark Dickinson6b265f12009-03-18 16:07:26 +0000636#endif /* CONVERT_WCHAR_TO_SURROGATES */
637
638#undef CONVERT_WCHAR_TO_SURROGATES
639
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000640static void
641makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
642{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000643 *fmt++ = '%';
644 if (width) {
645 if (zeropad)
646 *fmt++ = '0';
647 fmt += sprintf(fmt, "%d", width);
648 }
649 if (precision)
650 fmt += sprintf(fmt, ".%d", precision);
651 if (longflag)
652 *fmt++ = 'l';
653 else if (size_tflag) {
654 char *f = PY_FORMAT_SIZE_T;
655 while (*f)
656 *fmt++ = *f++;
657 }
658 *fmt++ = c;
659 *fmt = '\0';
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000660}
661
/* Copy the NUL-terminated string `string` to the output position `s`,
   advancing `s` past the copied characters (no terminator is written).
   Relies on `s` and a scratch pointer `copy` being declared in the
   enclosing scope. */
#define appendstring(string)                    \
    {                                           \
        copy = string;                          \
        while (*copy)                           \
            *s++ = *copy++;                     \
    }
663
664PyObject *
665PyUnicode_FromFormatV(const char *format, va_list vargs)
666{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000667 va_list count;
668 Py_ssize_t callcount = 0;
669 PyObject **callresults = NULL;
670 PyObject **callresult = NULL;
671 Py_ssize_t n = 0;
672 int width = 0;
673 int precision = 0;
674 int zeropad;
675 const char* f;
676 Py_UNICODE *s;
677 PyObject *string;
678 /* used by sprintf */
679 char buffer[21];
680 /* use abuffer instead of buffer, if we need more space
681 * (which can happen if there's a format specifier with width). */
682 char *abuffer = NULL;
683 char *realbuffer;
684 Py_ssize_t abuffersize = 0;
685 char fmt[60]; /* should be enough for %0width.precisionld */
686 const char *copy;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000687
688#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson857ce152009-01-31 16:29:18 +0000689 Py_MEMCPY(count, vargs, sizeof(va_list));
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000690#else
691#ifdef __va_copy
Benjamin Peterson857ce152009-01-31 16:29:18 +0000692 __va_copy(count, vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000693#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000694 count = vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000695#endif
696#endif
Walter Dörwalded960ac2009-05-03 22:36:33 +0000697 /* step 1: count the number of %S/%R/%s format specifications
698 * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
699 * objects once during step 3 and put the result in an array) */
Benjamin Peterson857ce152009-01-31 16:29:18 +0000700 for (f = format; *f; f++) {
Walter Dörwalded960ac2009-05-03 22:36:33 +0000701 if (*f == '%') {
702 if (*(f+1)=='%')
703 continue;
Walter Dörwald342c8db2009-05-03 22:46:07 +0000704 if (*(f+1)=='S' || *(f+1)=='R')
Walter Dörwalded960ac2009-05-03 22:36:33 +0000705 ++callcount;
706 while (isdigit((unsigned)*f))
707 width = (width*10) + *f++ - '0';
708 while (*++f && *f != '%' && !isalpha((unsigned)*f))
709 ;
710 if (*f == 's')
711 ++callcount;
712 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000713 }
714 /* step 2: allocate memory for the results of
Walter Dörwalded960ac2009-05-03 22:36:33 +0000715 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson857ce152009-01-31 16:29:18 +0000716 if (callcount) {
717 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
718 if (!callresults) {
719 PyErr_NoMemory();
720 return NULL;
721 }
722 callresult = callresults;
723 }
724 /* step 3: figure out how large a buffer we need */
725 for (f = format; *f; f++) {
726 if (*f == '%') {
727 const char* p = f;
728 width = 0;
729 while (isdigit((unsigned)*f))
730 width = (width*10) + *f++ - '0';
731 while (*++f && *f != '%' && !isalpha((unsigned)*f))
732 ;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000733
Benjamin Peterson857ce152009-01-31 16:29:18 +0000734 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
735 * they don't affect the amount of space we reserve.
736 */
737 if ((*f == 'l' || *f == 'z') &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000738 (f[1] == 'd' || f[1] == 'u'))
739 ++f;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000740
Benjamin Peterson857ce152009-01-31 16:29:18 +0000741 switch (*f) {
742 case 'c':
Serhiy Storchakaba908c72013-06-23 20:22:09 +0300743 {
744 int ordinal = va_arg(count, int);
745#ifdef Py_UNICODE_WIDE
746 if (ordinal < 0 || ordinal > 0x10ffff) {
747 PyErr_SetString(PyExc_OverflowError,
748 "%c arg not in range(0x110000) "
749 "(wide Python build)");
750 goto fail;
751 }
752#else
753 if (ordinal < 0 || ordinal > 0xffff) {
754 PyErr_SetString(PyExc_OverflowError,
755 "%c arg not in range(0x10000) "
756 "(narrow Python build)");
757 goto fail;
758 }
759#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +0000760 /* fall through... */
Serhiy Storchakaba908c72013-06-23 20:22:09 +0300761 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000762 case '%':
763 n++;
764 break;
765 case 'd': case 'u': case 'i': case 'x':
766 (void) va_arg(count, int);
767 /* 20 bytes is enough to hold a 64-bit
768 integer. Decimal takes the most space.
769 This isn't enough for octal.
770 If a width is specified we need more
771 (which we allocate later). */
772 if (width < 20)
773 width = 20;
774 n += width;
775 if (abuffersize < width)
776 abuffersize = width;
777 break;
778 case 's':
779 {
780 /* UTF-8 */
Georg Brandlba68a992009-05-05 09:19:43 +0000781 const char *s = va_arg(count, const char*);
Walter Dörwalded960ac2009-05-03 22:36:33 +0000782 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
783 if (!str)
784 goto fail;
785 n += PyUnicode_GET_SIZE(str);
786 /* Remember the str and switch to the next slot */
787 *callresult++ = str;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000788 break;
789 }
790 case 'U':
791 {
792 PyObject *obj = va_arg(count, PyObject *);
793 assert(obj && PyUnicode_Check(obj));
794 n += PyUnicode_GET_SIZE(obj);
795 break;
796 }
797 case 'V':
798 {
799 PyObject *obj = va_arg(count, PyObject *);
800 const char *str = va_arg(count, const char *);
801 assert(obj || str);
802 assert(!obj || PyUnicode_Check(obj));
803 if (obj)
804 n += PyUnicode_GET_SIZE(obj);
805 else
806 n += strlen(str);
807 break;
808 }
809 case 'S':
810 {
811 PyObject *obj = va_arg(count, PyObject *);
812 PyObject *str;
813 assert(obj);
814 str = PyObject_Str(obj);
815 if (!str)
816 goto fail;
817 n += PyUnicode_GET_SIZE(str);
818 /* Remember the str and switch to the next slot */
819 *callresult++ = str;
820 break;
821 }
822 case 'R':
823 {
824 PyObject *obj = va_arg(count, PyObject *);
825 PyObject *repr;
826 assert(obj);
827 repr = PyObject_Repr(obj);
828 if (!repr)
829 goto fail;
830 n += PyUnicode_GET_SIZE(repr);
831 /* Remember the repr and switch to the next slot */
832 *callresult++ = repr;
833 break;
834 }
835 case 'p':
836 (void) va_arg(count, int);
837 /* maximum 64-bit pointer representation:
838 * 0xffffffffffffffff
839 * so 19 characters is enough.
840 * XXX I count 18 -- what's the extra for?
841 */
842 n += 19;
843 break;
844 default:
845 /* if we stumble upon an unknown
846 formatting code, copy the rest of
847 the format string to the output
848 string. (we cannot just skip the
849 code, since there's no way to know
850 what's in the argument list) */
851 n += strlen(p);
852 goto expand;
853 }
854 } else
855 n++;
856 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000857 expand:
Benjamin Peterson857ce152009-01-31 16:29:18 +0000858 if (abuffersize > 20) {
859 abuffer = PyObject_Malloc(abuffersize);
860 if (!abuffer) {
861 PyErr_NoMemory();
862 goto fail;
863 }
864 realbuffer = abuffer;
865 }
866 else
867 realbuffer = buffer;
868 /* step 4: fill the buffer */
869 /* Since we've analyzed how much space we need for the worst case,
870 we don't have to resize the string.
871 There can be no errors beyond this point. */
872 string = PyUnicode_FromUnicode(NULL, n);
873 if (!string)
874 goto fail;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000875
Benjamin Peterson857ce152009-01-31 16:29:18 +0000876 s = PyUnicode_AS_UNICODE(string);
877 callresult = callresults;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000878
Benjamin Peterson857ce152009-01-31 16:29:18 +0000879 for (f = format; *f; f++) {
880 if (*f == '%') {
881 const char* p = f++;
882 int longflag = 0;
883 int size_tflag = 0;
884 zeropad = (*f == '0');
885 /* parse the width.precision part */
886 width = 0;
887 while (isdigit((unsigned)*f))
888 width = (width*10) + *f++ - '0';
889 precision = 0;
890 if (*f == '.') {
891 f++;
892 while (isdigit((unsigned)*f))
893 precision = (precision*10) + *f++ - '0';
894 }
895 /* handle the long flag, but only for %ld and %lu.
896 others can be added when necessary. */
897 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
898 longflag = 1;
899 ++f;
900 }
901 /* handle the size_t flag. */
902 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
903 size_tflag = 1;
904 ++f;
905 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000906
Benjamin Peterson857ce152009-01-31 16:29:18 +0000907 switch (*f) {
908 case 'c':
909 *s++ = va_arg(vargs, int);
910 break;
911 case 'd':
912 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
913 if (longflag)
914 sprintf(realbuffer, fmt, va_arg(vargs, long));
915 else if (size_tflag)
916 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
917 else
918 sprintf(realbuffer, fmt, va_arg(vargs, int));
919 appendstring(realbuffer);
920 break;
921 case 'u':
922 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
923 if (longflag)
924 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
925 else if (size_tflag)
926 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
927 else
928 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
929 appendstring(realbuffer);
930 break;
931 case 'i':
932 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
933 sprintf(realbuffer, fmt, va_arg(vargs, int));
934 appendstring(realbuffer);
935 break;
936 case 'x':
937 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
938 sprintf(realbuffer, fmt, va_arg(vargs, int));
939 appendstring(realbuffer);
940 break;
941 case 's':
942 {
Walter Dörwalded960ac2009-05-03 22:36:33 +0000943 /* unused, since we already have the result */
944 (void) va_arg(vargs, char *);
945 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
946 PyUnicode_GET_SIZE(*callresult));
947 s += PyUnicode_GET_SIZE(*callresult);
948 /* We're done with the unicode()/repr() => forget it */
949 Py_DECREF(*callresult);
950 /* switch to next unicode()/repr() result */
951 ++callresult;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000952 break;
953 }
954 case 'U':
955 {
956 PyObject *obj = va_arg(vargs, PyObject *);
957 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
958 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
959 s += size;
960 break;
961 }
962 case 'V':
963 {
964 PyObject *obj = va_arg(vargs, PyObject *);
965 const char *str = va_arg(vargs, const char *);
966 if (obj) {
967 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
968 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
969 s += size;
970 } else {
971 appendstring(str);
972 }
973 break;
974 }
975 case 'S':
976 case 'R':
977 {
978 Py_UNICODE *ucopy;
979 Py_ssize_t usize;
980 Py_ssize_t upos;
981 /* unused, since we already have the result */
982 (void) va_arg(vargs, PyObject *);
983 ucopy = PyUnicode_AS_UNICODE(*callresult);
984 usize = PyUnicode_GET_SIZE(*callresult);
985 for (upos = 0; upos<usize;)
986 *s++ = ucopy[upos++];
987 /* We're done with the unicode()/repr() => forget it */
988 Py_DECREF(*callresult);
989 /* switch to next unicode()/repr() result */
990 ++callresult;
991 break;
992 }
993 case 'p':
994 sprintf(buffer, "%p", va_arg(vargs, void*));
995 /* %p is ill-defined: ensure leading 0x. */
996 if (buffer[1] == 'X')
997 buffer[1] = 'x';
998 else if (buffer[1] != 'x') {
999 memmove(buffer+2, buffer, strlen(buffer)+1);
1000 buffer[0] = '0';
1001 buffer[1] = 'x';
1002 }
1003 appendstring(buffer);
1004 break;
1005 case '%':
1006 *s++ = '%';
1007 break;
1008 default:
1009 appendstring(p);
1010 goto end;
1011 }
1012 } else
1013 *s++ = *f;
1014 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001015
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001016 end:
Benjamin Peterson857ce152009-01-31 16:29:18 +00001017 if (callresults)
1018 PyObject_Free(callresults);
1019 if (abuffer)
1020 PyObject_Free(abuffer);
1021 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1022 return string;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001023 fail:
Benjamin Peterson857ce152009-01-31 16:29:18 +00001024 if (callresults) {
1025 PyObject **callresult2 = callresults;
1026 while (callresult2 < callresult) {
1027 Py_DECREF(*callresult2);
1028 ++callresult2;
1029 }
1030 PyObject_Free(callresults);
1031 }
1032 if (abuffer)
1033 PyObject_Free(abuffer);
1034 return NULL;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001035}
1036
1037#undef appendstring
1038
1039PyObject *
1040PyUnicode_FromFormat(const char *format, ...)
1041{
Benjamin Peterson857ce152009-01-31 16:29:18 +00001042 PyObject* ret;
1043 va_list vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001044
1045#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson857ce152009-01-31 16:29:18 +00001046 va_start(vargs, format);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001047#else
Benjamin Peterson857ce152009-01-31 16:29:18 +00001048 va_start(vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001049#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00001050 ret = PyUnicode_FromFormatV(format, vargs);
1051 va_end(vargs);
1052 return ret;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001053}
1054
Martin v. Löwis18e16552006-02-15 17:27:45 +00001055Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001056 wchar_t *w,
1057 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001058{
1059 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001060 PyErr_BadInternalCall();
1061 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001062 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001063
1064 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001065 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001066 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001067
Guido van Rossumd57fd912000-03-10 22:53:23 +00001068#ifdef HAVE_USABLE_WCHAR_T
1069 memcpy(w, unicode->str, size * sizeof(wchar_t));
1070#else
1071 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001072 register Py_UNICODE *u;
1073 register Py_ssize_t i;
1074 u = PyUnicode_AS_UNICODE(unicode);
1075 for (i = size; i > 0; i--)
1076 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001077 }
1078#endif
1079
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001080 if (size > PyUnicode_GET_SIZE(unicode))
1081 return PyUnicode_GET_SIZE(unicode);
1082 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001083 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001084}
1085
1086#endif
1087
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001088PyObject *PyUnicode_FromOrdinal(int ordinal)
1089{
Hye-Shik Chang40574832004-04-06 07:24:51 +00001090 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001091
1092#ifdef Py_UNICODE_WIDE
1093 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001094 PyErr_SetString(PyExc_ValueError,
1095 "unichr() arg not in range(0x110000) "
1096 "(wide Python build)");
1097 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001098 }
1099#else
1100 if (ordinal < 0 || ordinal > 0xffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001101 PyErr_SetString(PyExc_ValueError,
1102 "unichr() arg not in range(0x10000) "
1103 "(narrow Python build)");
1104 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001105 }
1106#endif
1107
Hye-Shik Chang40574832004-04-06 07:24:51 +00001108 s[0] = (Py_UNICODE)ordinal;
1109 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001110}
1111
Guido van Rossumd57fd912000-03-10 22:53:23 +00001112PyObject *PyUnicode_FromObject(register PyObject *obj)
1113{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001114 /* XXX Perhaps we should make this API an alias of
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001115 PyObject_Unicode() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001116 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001117 Py_INCREF(obj);
1118 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001119 }
1120 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001121 /* For a Unicode subtype that's not a Unicode object,
1122 return a true Unicode object with the same data. */
1123 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1124 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001125 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001126 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1127}
1128
1129PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001130 const char *encoding,
1131 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001132{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001133 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001134 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001135 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001136
Guido van Rossumd57fd912000-03-10 22:53:23 +00001137 if (obj == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001138 PyErr_BadInternalCall();
1139 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001140 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001141
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001142#if 0
1143 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001144 that no encodings is given and then redirect to
1145 PyObject_Unicode() which then applies the additional logic for
1146 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001147
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001148 NOTE: This API should really only be used for object which
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001149 represent *encoded* Unicode !
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001150
1151 */
Benjamin Peterson857ce152009-01-31 16:29:18 +00001152 if (PyUnicode_Check(obj)) {
1153 if (encoding) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001154 PyErr_SetString(PyExc_TypeError,
1155 "decoding Unicode is not supported");
1156 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001157 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001158 return PyObject_Unicode(obj);
1159 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001160#else
1161 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001162 PyErr_SetString(PyExc_TypeError,
1163 "decoding Unicode is not supported");
1164 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001165 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001166#endif
1167
1168 /* Coerce object */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001169 if (PyString_Check(obj)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001170 s = PyString_AS_STRING(obj);
1171 len = PyString_GET_SIZE(obj);
Christian Heimes1a6387e2008-03-26 12:49:49 +00001172 }
Christian Heimes3497f942008-05-26 12:29:14 +00001173 else if (PyByteArray_Check(obj)) {
Christian Heimes1a6387e2008-03-26 12:49:49 +00001174 /* Python 2.x specific */
1175 PyErr_Format(PyExc_TypeError,
1176 "decoding bytearray is not supported");
1177 return NULL;
1178 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001179 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001180 /* Overwrite the error message with something more useful in
1181 case of a TypeError. */
1182 if (PyErr_ExceptionMatches(PyExc_TypeError))
1183 PyErr_Format(PyExc_TypeError,
1184 "coercing to Unicode: need string or buffer, "
1185 "%.80s found",
1186 Py_TYPE(obj)->tp_name);
1187 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001188 }
Tim Petersced69f82003-09-16 20:30:58 +00001189
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001190 /* Convert to Unicode */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02001191 if (len == 0)
1192 _Py_RETURN_UNICODE_EMPTY();
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001193
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02001194 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001195 return v;
1196
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001197 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001198 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001199}
1200
1201PyObject *PyUnicode_Decode(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001202 Py_ssize_t size,
1203 const char *encoding,
1204 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001205{
1206 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001207
1208 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001209 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001210
1211 /* Shortcuts for common default encodings */
1212 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001213 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001214 else if (strcmp(encoding, "latin-1") == 0)
1215 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001216#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1217 else if (strcmp(encoding, "mbcs") == 0)
1218 return PyUnicode_DecodeMBCS(s, size, errors);
1219#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001220 else if (strcmp(encoding, "ascii") == 0)
1221 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001222
1223 /* Decode via the codec registry */
1224 buffer = PyBuffer_FromMemory((void *)s, size);
1225 if (buffer == NULL)
1226 goto onError;
1227 unicode = PyCodec_Decode(buffer, encoding, errors);
1228 if (unicode == NULL)
1229 goto onError;
1230 if (!PyUnicode_Check(unicode)) {
1231 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001232 "decoder did not return an unicode object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001233 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001234 Py_DECREF(unicode);
1235 goto onError;
1236 }
1237 Py_DECREF(buffer);
1238 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001239
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001240 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001241 Py_XDECREF(buffer);
1242 return NULL;
1243}
1244
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001245PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1246 const char *encoding,
1247 const char *errors)
1248{
1249 PyObject *v;
1250
1251 if (!PyUnicode_Check(unicode)) {
1252 PyErr_BadArgument();
1253 goto onError;
1254 }
1255
1256 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001257 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001258
1259 /* Decode via the codec registry */
1260 v = PyCodec_Decode(unicode, encoding, errors);
1261 if (v == NULL)
1262 goto onError;
1263 return v;
1264
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001265 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001266 return NULL;
1267}
1268
Guido van Rossumd57fd912000-03-10 22:53:23 +00001269PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001270 Py_ssize_t size,
1271 const char *encoding,
1272 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001273{
1274 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001275
Guido van Rossumd57fd912000-03-10 22:53:23 +00001276 unicode = PyUnicode_FromUnicode(s, size);
1277 if (unicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001278 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001279 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1280 Py_DECREF(unicode);
1281 return v;
1282}
1283
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001284PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1285 const char *encoding,
1286 const char *errors)
1287{
1288 PyObject *v;
1289
1290 if (!PyUnicode_Check(unicode)) {
1291 PyErr_BadArgument();
1292 goto onError;
1293 }
1294
1295 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001296 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001297
1298 /* Encode via the codec registry */
1299 v = PyCodec_Encode(unicode, encoding, errors);
1300 if (v == NULL)
1301 goto onError;
1302 return v;
1303
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001304 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001305 return NULL;
1306}
1307
Guido van Rossumd57fd912000-03-10 22:53:23 +00001308PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1309 const char *encoding,
1310 const char *errors)
1311{
1312 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001313
Guido van Rossumd57fd912000-03-10 22:53:23 +00001314 if (!PyUnicode_Check(unicode)) {
1315 PyErr_BadArgument();
1316 goto onError;
1317 }
Fred Drakee4315f52000-05-09 19:53:39 +00001318
Tim Petersced69f82003-09-16 20:30:58 +00001319 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001320 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001321
1322 /* Shortcuts for common default encodings */
1323 if (errors == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001324 if (strcmp(encoding, "utf-8") == 0)
1325 return PyUnicode_AsUTF8String(unicode);
1326 else if (strcmp(encoding, "latin-1") == 0)
1327 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001328#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001329 else if (strcmp(encoding, "mbcs") == 0)
1330 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001331#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001332 else if (strcmp(encoding, "ascii") == 0)
1333 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001334 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001335
1336 /* Encode via the codec registry */
1337 v = PyCodec_Encode(unicode, encoding, errors);
1338 if (v == NULL)
1339 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001340 if (!PyString_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001341 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001342 "encoder did not return a string object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001343 Py_TYPE(v)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001344 Py_DECREF(v);
1345 goto onError;
1346 }
1347 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001348
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001349 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001350 return NULL;
1351}
1352
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001353PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001354 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001355{
1356 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1357
1358 if (v)
1359 return v;
1360 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1361 if (v && errors == NULL)
1362 ((PyUnicodeObject *)unicode)->defenc = v;
1363 return v;
1364}
1365
Guido van Rossumd57fd912000-03-10 22:53:23 +00001366Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1367{
1368 if (!PyUnicode_Check(unicode)) {
1369 PyErr_BadArgument();
1370 goto onError;
1371 }
1372 return PyUnicode_AS_UNICODE(unicode);
1373
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001374 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001375 return NULL;
1376}
1377
Martin v. Löwis18e16552006-02-15 17:27:45 +00001378Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001379{
1380 if (!PyUnicode_Check(unicode)) {
1381 PyErr_BadArgument();
1382 goto onError;
1383 }
1384 return PyUnicode_GET_SIZE(unicode);
1385
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001386 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001387 return -1;
1388}
1389
Thomas Wouters78890102000-07-22 19:25:51 +00001390const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001391{
1392 return unicode_default_encoding;
1393}
1394
1395int PyUnicode_SetDefaultEncoding(const char *encoding)
1396{
1397 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001398
Fred Drakee4315f52000-05-09 19:53:39 +00001399 /* Make sure the encoding is valid. As side effect, this also
1400 loads the encoding into the codec registry cache. */
1401 v = _PyCodec_Lookup(encoding);
1402 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001403 goto onError;
Fred Drakee4315f52000-05-09 19:53:39 +00001404 Py_DECREF(v);
1405 strncpy(unicode_default_encoding,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001406 encoding,
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02001407 sizeof(unicode_default_encoding) - 1);
Fred Drakee4315f52000-05-09 19:53:39 +00001408 return 0;
1409
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001410 onError:
Fred Drakee4315f52000-05-09 19:53:39 +00001411 return -1;
1412}
1413
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001414/* error handling callback helper:
1415 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001416 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001417 and adjust various state variables.
1418 return 0 on success, -1 on error
1419*/
1420
1421static
1422int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001423 const char *encoding, const char *reason,
1424 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1425 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1426 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001427{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001428 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001429
1430 PyObject *restuple = NULL;
1431 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001432 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1433 Py_ssize_t requiredsize;
1434 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001435 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001436 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001437 int res = -1;
1438
1439 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001440 *errorHandler = PyCodec_LookupError(errors);
1441 if (*errorHandler == NULL)
1442 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001443 }
1444
1445 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001446 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001447 encoding, input, insize, *startinpos, *endinpos, reason);
1448 if (*exceptionObject == NULL)
1449 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001450 }
1451 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001452 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1453 goto onError;
1454 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1455 goto onError;
1456 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1457 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001458 }
1459
1460 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1461 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001462 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001463 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00001464 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001465 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001466 }
1467 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001468 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001469 if (newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001470 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001471 if (newpos<0 || newpos>insize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001472 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1473 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001474 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001475
1476 /* need more space? (at least enough for what we
1477 have+the replacement+the rest of the string (starting
1478 at the new input position), so we won't have to check space
1479 when there are no errors in the rest of the string) */
1480 repptr = PyUnicode_AS_UNICODE(repunicode);
1481 repsize = PyUnicode_GET_SIZE(repunicode);
1482 requiredsize = *outpos + repsize + insize-newpos;
1483 if (requiredsize > outsize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001484 if (requiredsize<2*outsize)
1485 requiredsize = 2*outsize;
1486 if (_PyUnicode_Resize(output, requiredsize) < 0)
1487 goto onError;
1488 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001489 }
1490 *endinpos = newpos;
1491 *inptr = input + newpos;
1492 Py_UNICODE_COPY(*outptr, repptr, repsize);
1493 *outptr += repsize;
1494 *outpos += repsize;
1495 /* we made it! */
1496 res = 0;
1497
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001498 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001499 Py_XDECREF(restuple);
1500 return res;
1501}
1502
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64. */

/* Is c a base-64 character?
   The ASCII ranges are spelled out instead of calling isalnum(): that
   function depends on the current locale and is only defined for values
   representable as unsigned char (or EOF). */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
1534
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, hence const.
 */

static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [  bsl  ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is evaluated several times; directO and directWS are parenthesized
 * in the expansion so that compound expressions such as `a || b` can be
 * passed without changing the meaning of the test. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001580
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001581PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001582 Py_ssize_t size,
1583 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001584{
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001585 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1586}
1587
Antoine Pitrou653dece2009-05-04 18:32:32 +00001588/* The decoder. The only state we preserve is our read position,
1589 * i.e. how many characters we have consumed. So if we end in the
1590 * middle of a shift sequence we have to back off the read position
1591 * and the output to the beginning of the sequence, otherwise we lose
1592 * all the shift state (seen bits, number of bits seen, high
1593 * surrogate). */
1594
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001595PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001596 Py_ssize_t size,
1597 const char *errors,
1598 Py_ssize_t *consumed)
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001599{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001600 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001601 Py_ssize_t startinpos;
1602 Py_ssize_t endinpos;
1603 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001604 const char *e;
1605 PyUnicodeObject *unicode;
1606 Py_UNICODE *p;
1607 const char *errmsg = "";
1608 int inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001609 Py_UNICODE *shiftOutStart;
1610 unsigned int base64bits = 0;
1611 unsigned long base64buffer = 0;
1612 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001613 PyObject *errorHandler = NULL;
1614 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001615
1616 unicode = _PyUnicode_New(size);
1617 if (!unicode)
1618 return NULL;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001619 if (size == 0) {
1620 if (consumed)
1621 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001622 return (PyObject *)unicode;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001623 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001624
1625 p = unicode->str;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001626 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001627 e = s + size;
1628
1629 while (s < e) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001630 Py_UNICODE ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001631
Antoine Pitrou653dece2009-05-04 18:32:32 +00001632 if (inShift) { /* in a base-64 section */
1633 if (IS_BASE64(ch)) { /* consume a base-64 character */
1634 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1635 base64bits += 6;
1636 s++;
1637 if (base64bits >= 16) {
1638 /* we have enough bits for a UTF-16 value */
1639 Py_UNICODE outCh = (Py_UNICODE)
1640 (base64buffer >> (base64bits-16));
1641 base64bits -= 16;
1642 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1643 if (surrogate) {
1644 /* expecting a second surrogate */
1645 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1646#ifdef Py_UNICODE_WIDE
1647 *p++ = (((surrogate & 0x3FF)<<10)
1648 | (outCh & 0x3FF)) + 0x10000;
1649#else
1650 *p++ = surrogate;
1651 *p++ = outCh;
1652#endif
1653 surrogate = 0;
Antoine Pitrou30402542011-11-15 01:49:40 +01001654 continue;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001655 }
1656 else {
Antoine Pitrou30402542011-11-15 01:49:40 +01001657 *p++ = surrogate;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001658 surrogate = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001659 }
1660 }
Antoine Pitrou30402542011-11-15 01:49:40 +01001661 if (outCh >= 0xD800 && outCh <= 0xDBFF) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001662 /* first surrogate */
1663 surrogate = outCh;
1664 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001665 else {
1666 *p++ = outCh;
1667 }
1668 }
1669 }
1670 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001671 inShift = 0;
1672 s++;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001673 if (surrogate) {
Antoine Pitrou30402542011-11-15 01:49:40 +01001674 *p++ = surrogate;
1675 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001676 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001677 if (base64bits > 0) { /* left-over bits */
1678 if (base64bits >= 6) {
1679 /* We've seen at least one base-64 character */
1680 errmsg = "partial character in shift sequence";
1681 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001682 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001683 else {
1684 /* Some bits remain; they should be zero */
1685 if (base64buffer != 0) {
1686 errmsg = "non-zero padding bits in shift sequence";
1687 goto utf7Error;
1688 }
1689 }
1690 }
1691 if (ch != '-') {
1692 /* '-' is absorbed; other terminating
1693 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001694 *p++ = ch;
1695 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001696 }
1697 }
1698 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001699 startinpos = s-starts;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001700 s++; /* consume '+' */
1701 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001702 s++;
1703 *p++ = '+';
Antoine Pitrou653dece2009-05-04 18:32:32 +00001704 }
1705 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001706 inShift = 1;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001707 shiftOutStart = p;
1708 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001709 }
1710 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001711 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001712 *p++ = ch;
1713 s++;
1714 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001715 else {
1716 startinpos = s-starts;
1717 s++;
1718 errmsg = "unexpected special character";
1719 goto utf7Error;
1720 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001721 continue;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001722utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001723 outpos = p-PyUnicode_AS_UNICODE(unicode);
1724 endinpos = s-starts;
1725 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001726 errors, &errorHandler,
1727 "utf7", errmsg,
1728 starts, size, &startinpos, &endinpos, &exc, &s,
1729 &unicode, &outpos, &p))
1730 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001731 }
1732
Antoine Pitrou653dece2009-05-04 18:32:32 +00001733 /* end of string */
1734
1735 if (inShift && !consumed) { /* in shift sequence, no more to follow */
1736 /* if we're in an inconsistent state, that's an error */
1737 if (surrogate ||
1738 (base64bits >= 6) ||
1739 (base64bits > 0 && base64buffer != 0)) {
1740 outpos = p-PyUnicode_AS_UNICODE(unicode);
1741 endinpos = size;
1742 if (unicode_decode_call_errorhandler(
1743 errors, &errorHandler,
1744 "utf7", "unterminated shift sequence",
1745 starts, size, &startinpos, &endinpos, &exc, &s,
1746 &unicode, &outpos, &p))
1747 goto onError;
1748 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001749 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001750
1751 /* return state */
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001752 if (consumed) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001753 if (inShift) {
1754 p = shiftOutStart; /* back off output */
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001755 *consumed = startinpos;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001756 }
1757 else {
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001758 *consumed = s-starts;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001759 }
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001760 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001761
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001762 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001763 goto onError;
1764
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001765 Py_XDECREF(errorHandler);
1766 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001767 return (PyObject *)unicode;
1768
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001769 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001770 Py_XDECREF(errorHandler);
1771 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001772 Py_DECREF(unicode);
1773 return NULL;
1774}
1775
1776
1777PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001778 Py_ssize_t size,
Antoine Pitrou653dece2009-05-04 18:32:32 +00001779 int base64SetO,
1780 int base64WhiteSpace,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001781 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001782{
1783 PyObject *v;
1784 /* It might be possible to tighten this worst case */
Alexandre Vassalottifd009162009-07-07 02:17:30 +00001785 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001786 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001787 Py_ssize_t i = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001788 unsigned int base64bits = 0;
1789 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001790 char * out;
1791 char * start;
1792
Alexandre Vassalottifd009162009-07-07 02:17:30 +00001793 if (allocated / 8 != size)
Neal Norwitze7d8be82008-07-31 17:17:14 +00001794 return PyErr_NoMemory();
1795
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001796 if (size == 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00001797 return PyString_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001798
Antoine Pitrou653dece2009-05-04 18:32:32 +00001799 v = PyString_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001800 if (v == NULL)
1801 return NULL;
1802
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001803 start = out = PyString_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001804 for (;i < size; ++i) {
1805 Py_UNICODE ch = s[i];
1806
Antoine Pitrou653dece2009-05-04 18:32:32 +00001807 if (inShift) {
1808 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1809 /* shifting out */
1810 if (base64bits) { /* output remaining bits */
1811 *out++ = TO_BASE64(base64buffer << (6-base64bits));
1812 base64buffer = 0;
1813 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001814 }
1815 inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001816 /* Characters not in the BASE64 set implicitly unshift the sequence
1817 so no '-' is required, except if the character is itself a '-' */
1818 if (IS_BASE64(ch) || ch == '-') {
1819 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001820 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001821 *out++ = (char) ch;
1822 }
1823 else {
1824 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00001825 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001826 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001827 else { /* not in a shift sequence */
1828 if (ch == '+') {
1829 *out++ = '+';
1830 *out++ = '-';
1831 }
1832 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1833 *out++ = (char) ch;
1834 }
1835 else {
1836 *out++ = '+';
1837 inShift = 1;
1838 goto encode_char;
1839 }
1840 }
1841 continue;
1842encode_char:
1843#ifdef Py_UNICODE_WIDE
1844 if (ch >= 0x10000) {
1845 /* code first surrogate */
1846 base64bits += 16;
1847 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
1848 while (base64bits >= 6) {
1849 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1850 base64bits -= 6;
1851 }
1852 /* prepare second surrogate */
1853 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
1854 }
1855#endif
1856 base64bits += 16;
1857 base64buffer = (base64buffer << 16) | ch;
1858 while (base64bits >= 6) {
1859 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1860 base64bits -= 6;
1861 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001862 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001863 if (base64bits)
1864 *out++= TO_BASE64(base64buffer << (6-base64bits) );
1865 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001866 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001867
Benjamin Petersonbea424a2010-04-03 00:57:33 +00001868 if (_PyString_Resize(&v, out - start))
1869 return NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001870 return v;
1871}
1872
Antoine Pitrou653dece2009-05-04 18:32:32 +00001873#undef IS_BASE64
1874#undef FROM_BASE64
1875#undef TO_BASE64
1876#undef DECODE_DIRECT
1877#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001878
/* --- UTF-8 Codec -------------------------------------------------------- */
1880
/* Map a UTF-8 encoded prefix (lead) byte to its sequence length.  Zero
   means illegal prefix.  See RFC 3629 for details.

   The table is only ever read, hence const. */
static const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
1902
Guido van Rossumd57fd912000-03-10 22:53:23 +00001903PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001904 Py_ssize_t size,
1905 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001906{
Walter Dörwald69652032004-09-07 20:24:22 +00001907 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1908}
1909
1910PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001911 Py_ssize_t size,
1912 const char *errors,
1913 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001914{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001915 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001916 int n;
Ezio Melottie57e50c2010-06-05 17:51:07 +00001917 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001918 Py_ssize_t startinpos;
1919 Py_ssize_t endinpos;
1920 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001921 const char *e;
1922 PyUnicodeObject *unicode;
1923 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001924 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001925 PyObject *errorHandler = NULL;
1926 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001927
1928 /* Note: size will always be longer than the resulting Unicode
1929 character count */
1930 unicode = _PyUnicode_New(size);
1931 if (!unicode)
1932 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001933 if (size == 0) {
1934 if (consumed)
1935 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001936 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001937 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001938
1939 /* Unpack UTF-8 encoded data */
1940 p = unicode->str;
1941 e = s + size;
1942
1943 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001944 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001945
1946 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001947 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001948 s++;
1949 continue;
1950 }
1951
1952 n = utf8_code_length[ch];
1953
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001954 if (s + n > e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001955 if (consumed)
1956 break;
1957 else {
1958 errmsg = "unexpected end of data";
1959 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00001960 endinpos = startinpos+1;
1961 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
1962 endinpos++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001963 goto utf8Error;
1964 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00001965 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001966
1967 switch (n) {
1968
1969 case 0:
Ezio Melottie57e50c2010-06-05 17:51:07 +00001970 errmsg = "invalid start byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001971 startinpos = s-starts;
1972 endinpos = startinpos+1;
1973 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001974
1975 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001976 errmsg = "internal error";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001977 startinpos = s-starts;
1978 endinpos = startinpos+1;
1979 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001980
1981 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001982 if ((s[1] & 0xc0) != 0x80) {
Ezio Melottie57e50c2010-06-05 17:51:07 +00001983 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001984 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00001985 endinpos = startinpos + 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001986 goto utf8Error;
1987 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001988 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melottie57e50c2010-06-05 17:51:07 +00001989 assert ((ch > 0x007F) && (ch <= 0x07FF));
1990 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001991 break;
1992
1993 case 3:
Ezio Melottie57e50c2010-06-05 17:51:07 +00001994 /* XXX: surrogates shouldn't be valid UTF-8!
1995 see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
1996 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
1997 Uncomment the 2 lines below to make them invalid,
1998 codepoints: d800-dfff; UTF-8: \xed\xa0\x80-\xed\xbf\xbf. */
Tim Petersced69f82003-09-16 20:30:58 +00001999 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melottie57e50c2010-06-05 17:51:07 +00002000 (s[2] & 0xc0) != 0x80 ||
2001 ((unsigned char)s[0] == 0xE0 &&
2002 (unsigned char)s[1] < 0xA0)/* ||
2003 ((unsigned char)s[0] == 0xED &&
2004 (unsigned char)s[1] > 0x9F)*/) {
2005 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002006 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00002007 endinpos = startinpos + 1;
2008
2009 /* if s[1] first two bits are 1 and 0, then the invalid
2010 continuation byte is s[2], so increment endinpos by 1,
2011 if not, s[1] is invalid and endinpos doesn't need to
2012 be incremented. */
2013 if ((s[1] & 0xC0) == 0x80)
2014 endinpos++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002015 goto utf8Error;
2016 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002017 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melottie57e50c2010-06-05 17:51:07 +00002018 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2019 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002020 break;
2021
2022 case 4:
2023 if ((s[1] & 0xc0) != 0x80 ||
2024 (s[2] & 0xc0) != 0x80 ||
Ezio Melottie57e50c2010-06-05 17:51:07 +00002025 (s[3] & 0xc0) != 0x80 ||
2026 ((unsigned char)s[0] == 0xF0 &&
2027 (unsigned char)s[1] < 0x90) ||
2028 ((unsigned char)s[0] == 0xF4 &&
2029 (unsigned char)s[1] > 0x8F)) {
2030 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002031 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00002032 endinpos = startinpos + 1;
2033 if ((s[1] & 0xC0) == 0x80) {
2034 endinpos++;
2035 if ((s[2] & 0xC0) == 0x80)
2036 endinpos++;
2037 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002038 goto utf8Error;
2039 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002040 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melottie57e50c2010-06-05 17:51:07 +00002041 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2042 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2043
Fredrik Lundh8f455852001-06-27 18:59:43 +00002044#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002045 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002046#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002047 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002048
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002049 /* translate from 10000..10FFFF to 0..FFFF */
2050 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002051
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002052 /* high surrogate = top 10 bits added to D800 */
2053 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002054
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002055 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002056 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002057#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002058 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002059 }
2060 s += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002061 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002062
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002063 utf8Error:
2064 outpos = p-PyUnicode_AS_UNICODE(unicode);
2065 if (unicode_decode_call_errorhandler(
2066 errors, &errorHandler,
2067 "utf8", errmsg,
2068 starts, size, &startinpos, &endinpos, &exc, &s,
2069 &unicode, &outpos, &p))
2070 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002071 }
Walter Dörwald69652032004-09-07 20:24:22 +00002072 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002073 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002074
2075 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002076 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002077 goto onError;
2078
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002079 Py_XDECREF(errorHandler);
2080 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002081 return (PyObject *)unicode;
2082
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002083 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002084 Py_XDECREF(errorHandler);
2085 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002086 Py_DECREF(unicode);
2087 return NULL;
2088}
2089
Tim Peters602f7402002-04-27 18:03:26 +00002090/* Allocation strategy: if the string is short, convert into a stack buffer
2091 and allocate exactly as much space needed at the end. Else allocate the
2092 maximum possible needed (4 result bytes per Unicode character), and return
2093 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002094*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002095PyObject *
2096PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002097 Py_ssize_t size,
2098 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002099{
Tim Peters602f7402002-04-27 18:03:26 +00002100#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002101
Martin v. Löwis18e16552006-02-15 17:27:45 +00002102 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00002103 PyObject *v; /* result string object */
2104 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002105 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002106 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002107 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002108
Tim Peters602f7402002-04-27 18:03:26 +00002109 assert(s != NULL);
2110 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002111
Tim Peters602f7402002-04-27 18:03:26 +00002112 if (size <= MAX_SHORT_UNICHARS) {
2113 /* Write into the stack buffer; nallocated can't overflow.
2114 * At the end, we'll allocate exactly as much heap space as it
2115 * turns out we need.
2116 */
2117 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2118 v = NULL; /* will allocate after we're done */
2119 p = stackbuf;
2120 }
2121 else {
2122 /* Overallocate on the heap, and give the excess back at the end. */
2123 nallocated = size * 4;
2124 if (nallocated / 4 != size) /* overflow! */
2125 return PyErr_NoMemory();
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002126 v = PyString_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00002127 if (v == NULL)
2128 return NULL;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002129 p = PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002130 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002131
Tim Peters602f7402002-04-27 18:03:26 +00002132 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002133 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002134
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002135 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002136 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002137 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002138
Guido van Rossumd57fd912000-03-10 22:53:23 +00002139 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002140 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002141 *p++ = (char)(0xc0 | (ch >> 6));
2142 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002143 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002144 else {
Tim Peters602f7402002-04-27 18:03:26 +00002145 /* Encode UCS2 Unicode ordinals */
2146 if (ch < 0x10000) {
2147 /* Special case: check for high surrogate */
2148 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2149 Py_UCS4 ch2 = s[i];
2150 /* Check for low surrogate and combine the two to
2151 form a UCS4 value */
2152 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002153 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002154 i++;
2155 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002156 }
Tim Peters602f7402002-04-27 18:03:26 +00002157 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002158 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002159 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002160 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2161 *p++ = (char)(0x80 | (ch & 0x3f));
2162 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00002163 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002164 encodeUCS4:
Tim Peters602f7402002-04-27 18:03:26 +00002165 /* Encode UCS4 Unicode ordinals */
2166 *p++ = (char)(0xf0 | (ch >> 18));
2167 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2168 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2169 *p++ = (char)(0x80 | (ch & 0x3f));
2170 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002171 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002172
Tim Peters602f7402002-04-27 18:03:26 +00002173 if (v == NULL) {
2174 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002175 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002176 assert(nneeded <= nallocated);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002177 v = PyString_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002178 }
2179 else {
Benjamin Peterson857ce152009-01-31 16:29:18 +00002180 /* Cut back to size actually needed. */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002181 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002182 assert(nneeded <= nallocated);
Benjamin Petersonbea424a2010-04-03 00:57:33 +00002183 if (_PyString_Resize(&v, nneeded))
2184 return NULL;
Tim Peters602f7402002-04-27 18:03:26 +00002185 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002186 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002187
Tim Peters602f7402002-04-27 18:03:26 +00002188#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002189}
2190
Guido van Rossumd57fd912000-03-10 22:53:23 +00002191PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2192{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002193 if (!PyUnicode_Check(unicode)) {
2194 PyErr_BadArgument();
2195 return NULL;
2196 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002197 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002198 PyUnicode_GET_SIZE(unicode),
2199 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002200}
2201
Walter Dörwald6e390802007-08-17 16:41:28 +00002202/* --- UTF-32 Codec ------------------------------------------------------- */
2203
2204PyObject *
2205PyUnicode_DecodeUTF32(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002206 Py_ssize_t size,
2207 const char *errors,
2208 int *byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002209{
2210 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2211}
2212
2213PyObject *
2214PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002215 Py_ssize_t size,
2216 const char *errors,
2217 int *byteorder,
2218 Py_ssize_t *consumed)
Walter Dörwald6e390802007-08-17 16:41:28 +00002219{
2220 const char *starts = s;
2221 Py_ssize_t startinpos;
2222 Py_ssize_t endinpos;
2223 Py_ssize_t outpos;
2224 PyUnicodeObject *unicode;
2225 Py_UNICODE *p;
2226#ifndef Py_UNICODE_WIDE
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002227 int pairs = 0;
Benjamin Peterson8e5effa2010-06-12 17:47:06 +00002228 const unsigned char *qq;
Walter Dörwald6e390802007-08-17 16:41:28 +00002229#else
2230 const int pairs = 0;
2231#endif
Benjamin Peterson8e5effa2010-06-12 17:47:06 +00002232 const unsigned char *q, *e;
Walter Dörwald6e390802007-08-17 16:41:28 +00002233 int bo = 0; /* assume native ordering by default */
2234 const char *errmsg = "";
Walter Dörwald20b40d32007-08-17 16:52:50 +00002235 /* Offsets from q for retrieving bytes in the right order. */
2236#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2237 int iorder[] = {0, 1, 2, 3};
2238#else
2239 int iorder[] = {3, 2, 1, 0};
2240#endif
Walter Dörwald9ab80a92007-08-17 16:58:43 +00002241 PyObject *errorHandler = NULL;
2242 PyObject *exc = NULL;
Chris Jerdonekad4b0002012-10-07 20:37:54 -07002243
Walter Dörwald6e390802007-08-17 16:41:28 +00002244 q = (unsigned char *)s;
2245 e = q + size;
2246
2247 if (byteorder)
2248 bo = *byteorder;
2249
2250 /* Check for BOM marks (U+FEFF) in the input and adjust current
2251 byte order setting accordingly. In native mode, the leading BOM
2252 mark is skipped, in all other modes, it is copied to the output
2253 stream as-is (giving a ZWNBSP character). */
2254 if (bo == 0) {
2255 if (size >= 4) {
2256 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002257 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002258#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002259 if (bom == 0x0000FEFF) {
2260 q += 4;
2261 bo = -1;
2262 }
2263 else if (bom == 0xFFFE0000) {
2264 q += 4;
2265 bo = 1;
2266 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002267#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002268 if (bom == 0x0000FEFF) {
2269 q += 4;
2270 bo = 1;
2271 }
2272 else if (bom == 0xFFFE0000) {
2273 q += 4;
2274 bo = -1;
2275 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002276#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002277 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002278 }
2279
2280 if (bo == -1) {
2281 /* force LE */
2282 iorder[0] = 0;
2283 iorder[1] = 1;
2284 iorder[2] = 2;
2285 iorder[3] = 3;
2286 }
2287 else if (bo == 1) {
2288 /* force BE */
2289 iorder[0] = 3;
2290 iorder[1] = 2;
2291 iorder[2] = 1;
2292 iorder[3] = 0;
2293 }
2294
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002295 /* On narrow builds we split characters outside the BMP into two
2296 codepoints => count how much extra space we need. */
2297#ifndef Py_UNICODE_WIDE
Serhiy Storchakac9631a12013-01-08 22:43:18 +02002298 for (qq = q; e - qq >= 4; qq += 4)
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002299 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2300 pairs++;
2301#endif
2302
2303 /* This might be one to much, because of a BOM */
2304 unicode = _PyUnicode_New((size+3)/4+pairs);
2305 if (!unicode)
2306 return NULL;
2307 if (size == 0)
2308 return (PyObject *)unicode;
2309
2310 /* Unpack UTF-32 encoded data */
2311 p = unicode->str;
2312
Walter Dörwald6e390802007-08-17 16:41:28 +00002313 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002314 Py_UCS4 ch;
2315 /* remaining bytes at the end? (size should be divisible by 4) */
2316 if (e-q<4) {
2317 if (consumed)
2318 break;
2319 errmsg = "truncated data";
2320 startinpos = ((const char *)q)-starts;
2321 endinpos = ((const char *)e)-starts;
2322 goto utf32Error;
2323 /* The remaining input chars are ignored if the callback
2324 chooses to skip the input */
2325 }
2326 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2327 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002328
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002329 if (ch >= 0x110000)
2330 {
2331 errmsg = "codepoint not in range(0x110000)";
2332 startinpos = ((const char *)q)-starts;
2333 endinpos = startinpos+4;
2334 goto utf32Error;
2335 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002336#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002337 if (ch >= 0x10000)
2338 {
2339 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2340 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2341 }
2342 else
Walter Dörwald6e390802007-08-17 16:41:28 +00002343#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002344 *p++ = ch;
2345 q += 4;
2346 continue;
2347 utf32Error:
2348 outpos = p-PyUnicode_AS_UNICODE(unicode);
2349 if (unicode_decode_call_errorhandler(
2350 errors, &errorHandler,
2351 "utf32", errmsg,
Georg Brandle9741f32009-09-17 11:28:09 +00002352 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002353 &unicode, &outpos, &p))
2354 goto onError;
Walter Dörwald6e390802007-08-17 16:41:28 +00002355 }
2356
2357 if (byteorder)
2358 *byteorder = bo;
2359
2360 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002361 *consumed = (const char *)q-starts;
Walter Dörwald6e390802007-08-17 16:41:28 +00002362
2363 /* Adjust length */
2364 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2365 goto onError;
2366
2367 Py_XDECREF(errorHandler);
2368 Py_XDECREF(exc);
2369 return (PyObject *)unicode;
2370
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002371 onError:
Walter Dörwald6e390802007-08-17 16:41:28 +00002372 Py_DECREF(unicode);
2373 Py_XDECREF(errorHandler);
2374 Py_XDECREF(exc);
2375 return NULL;
2376}
2377
2378PyObject *
2379PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002380 Py_ssize_t size,
2381 const char *errors,
2382 int byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002383{
2384 PyObject *v;
2385 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002386 Py_ssize_t nsize, bytesize;
Walter Dörwald6e390802007-08-17 16:41:28 +00002387#ifndef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002388 Py_ssize_t i, pairs;
Walter Dörwald6e390802007-08-17 16:41:28 +00002389#else
2390 const int pairs = 0;
2391#endif
2392 /* Offsets from p for storing byte pairs in the right order. */
2393#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2394 int iorder[] = {0, 1, 2, 3};
2395#else
2396 int iorder[] = {3, 2, 1, 0};
2397#endif
2398
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002399#define STORECHAR(CH) \
2400 do { \
2401 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2402 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2403 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2404 p[iorder[0]] = (CH) & 0xff; \
2405 p += 4; \
Walter Dörwald6e390802007-08-17 16:41:28 +00002406 } while(0)
2407
2408 /* In narrow builds we can output surrogate pairs as one codepoint,
2409 so we need less space. */
2410#ifndef Py_UNICODE_WIDE
2411 for (i = pairs = 0; i < size-1; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002412 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2413 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2414 pairs++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002415#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002416 nsize = (size - pairs + (byteorder == 0));
2417 bytesize = nsize * 4;
2418 if (bytesize / 4 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002419 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002420 v = PyString_FromStringAndSize(NULL, bytesize);
Walter Dörwald6e390802007-08-17 16:41:28 +00002421 if (v == NULL)
2422 return NULL;
2423
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002424 p = (unsigned char *)PyString_AS_STRING(v);
Walter Dörwald6e390802007-08-17 16:41:28 +00002425 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002426 STORECHAR(0xFEFF);
Walter Dörwald6e390802007-08-17 16:41:28 +00002427 if (size == 0)
2428 return v;
2429
2430 if (byteorder == -1) {
2431 /* force LE */
2432 iorder[0] = 0;
2433 iorder[1] = 1;
2434 iorder[2] = 2;
2435 iorder[3] = 3;
2436 }
2437 else if (byteorder == 1) {
2438 /* force BE */
2439 iorder[0] = 3;
2440 iorder[1] = 2;
2441 iorder[2] = 1;
2442 iorder[3] = 0;
2443 }
2444
2445 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002446 Py_UCS4 ch = *s++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002447#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002448 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2449 Py_UCS4 ch2 = *s;
2450 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2451 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2452 s++;
2453 size--;
2454 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00002455 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002456#endif
2457 STORECHAR(ch);
2458 }
2459 return v;
2460#undef STORECHAR
2461}
2462
2463PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2464{
2465 if (!PyUnicode_Check(unicode)) {
2466 PyErr_BadArgument();
2467 return NULL;
2468 }
2469 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002470 PyUnicode_GET_SIZE(unicode),
2471 NULL,
2472 0);
Walter Dörwald6e390802007-08-17 16:41:28 +00002473}
2474
Guido van Rossumd57fd912000-03-10 22:53:23 +00002475/* --- UTF-16 Codec ------------------------------------------------------- */
2476
Tim Peters772747b2001-08-09 22:21:55 +00002477PyObject *
2478PyUnicode_DecodeUTF16(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002479 Py_ssize_t size,
2480 const char *errors,
2481 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002482{
Walter Dörwald69652032004-09-07 20:24:22 +00002483 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2484}
2485
2486PyObject *
2487PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002488 Py_ssize_t size,
2489 const char *errors,
2490 int *byteorder,
2491 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002492{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002493 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002494 Py_ssize_t startinpos;
2495 Py_ssize_t endinpos;
2496 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002497 PyUnicodeObject *unicode;
2498 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002499 const unsigned char *q, *e;
2500 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002501 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002502 /* Offsets from q for retrieving byte pairs in the right order. */
2503#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2504 int ihi = 1, ilo = 0;
2505#else
2506 int ihi = 0, ilo = 1;
2507#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002508 PyObject *errorHandler = NULL;
2509 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002510
2511 /* Note: size will always be longer than the resulting Unicode
2512 character count */
2513 unicode = _PyUnicode_New(size);
2514 if (!unicode)
2515 return NULL;
2516 if (size == 0)
2517 return (PyObject *)unicode;
2518
2519 /* Unpack UTF-16 encoded data */
2520 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002521 q = (unsigned char *)s;
2522 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002523
2524 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002525 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002526
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002527 /* Check for BOM marks (U+FEFF) in the input and adjust current
2528 byte order setting accordingly. In native mode, the leading BOM
2529 mark is skipped, in all other modes, it is copied to the output
2530 stream as-is (giving a ZWNBSP character). */
2531 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002532 if (size >= 2) {
2533 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002534#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002535 if (bom == 0xFEFF) {
2536 q += 2;
2537 bo = -1;
2538 }
2539 else if (bom == 0xFFFE) {
2540 q += 2;
2541 bo = 1;
2542 }
Tim Petersced69f82003-09-16 20:30:58 +00002543#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002544 if (bom == 0xFEFF) {
2545 q += 2;
2546 bo = 1;
2547 }
2548 else if (bom == 0xFFFE) {
2549 q += 2;
2550 bo = -1;
2551 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002552#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002553 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002554 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002555
Tim Peters772747b2001-08-09 22:21:55 +00002556 if (bo == -1) {
2557 /* force LE */
2558 ihi = 1;
2559 ilo = 0;
2560 }
2561 else if (bo == 1) {
2562 /* force BE */
2563 ihi = 0;
2564 ilo = 1;
2565 }
2566
2567 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002568 Py_UNICODE ch;
2569 /* remaining bytes at the end? (size should be even) */
2570 if (e-q<2) {
2571 if (consumed)
2572 break;
2573 errmsg = "truncated data";
2574 startinpos = ((const char *)q)-starts;
2575 endinpos = ((const char *)e)-starts;
2576 goto utf16Error;
2577 /* The remaining input chars are ignored if the callback
2578 chooses to skip the input */
2579 }
2580 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002581
Benjamin Peterson857ce152009-01-31 16:29:18 +00002582 q += 2;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002583
2584 if (ch < 0xD800 || ch > 0xDFFF) {
2585 *p++ = ch;
2586 continue;
2587 }
2588
2589 /* UTF-16 code pair: */
Antoine Pitrou715a63b2012-07-21 00:52:06 +02002590 if (e - q < 2) {
Serhiy Storchakac4b82c02013-01-08 23:12:00 +02002591 q -= 2;
2592 if (consumed)
2593 break;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002594 errmsg = "unexpected end of data";
Serhiy Storchakac4b82c02013-01-08 23:12:00 +02002595 startinpos = ((const char *)q)-starts;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002596 endinpos = ((const char *)e)-starts;
2597 goto utf16Error;
2598 }
2599 if (0xD800 <= ch && ch <= 0xDBFF) {
2600 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2601 q += 2;
2602 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002603#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002604 *p++ = ch;
2605 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002606#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002607 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002608#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002609 continue;
2610 }
2611 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002612 errmsg = "illegal UTF-16 surrogate";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002613 startinpos = (((const char *)q)-4)-starts;
2614 endinpos = startinpos+2;
2615 goto utf16Error;
2616 }
2617
Benjamin Peterson857ce152009-01-31 16:29:18 +00002618 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002619 errmsg = "illegal encoding";
2620 startinpos = (((const char *)q)-2)-starts;
2621 endinpos = startinpos+2;
2622 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002623
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002624 utf16Error:
2625 outpos = p-PyUnicode_AS_UNICODE(unicode);
2626 if (unicode_decode_call_errorhandler(
2627 errors, &errorHandler,
2628 "utf16", errmsg,
2629 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2630 &unicode, &outpos, &p))
2631 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002632 }
2633
2634 if (byteorder)
2635 *byteorder = bo;
2636
Walter Dörwald69652032004-09-07 20:24:22 +00002637 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002638 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00002639
Guido van Rossumd57fd912000-03-10 22:53:23 +00002640 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002641 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002642 goto onError;
2643
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002644 Py_XDECREF(errorHandler);
2645 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002646 return (PyObject *)unicode;
2647
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002648 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002649 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002650 Py_XDECREF(errorHandler);
2651 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002652 return NULL;
2653}
2654
Tim Peters772747b2001-08-09 22:21:55 +00002655PyObject *
2656PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002657 Py_ssize_t size,
2658 const char *errors,
2659 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002660{
2661 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002662 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002663 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002664#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002665 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002666#else
2667 const int pairs = 0;
2668#endif
Tim Peters772747b2001-08-09 22:21:55 +00002669 /* Offsets from p for storing byte pairs in the right order. */
2670#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2671 int ihi = 1, ilo = 0;
2672#else
2673 int ihi = 0, ilo = 1;
2674#endif
2675
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002676#define STORECHAR(CH) \
2677 do { \
2678 p[ihi] = ((CH) >> 8) & 0xff; \
2679 p[ilo] = (CH) & 0xff; \
2680 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00002681 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002682
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002683#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002684 for (i = pairs = 0; i < size; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002685 if (s[i] >= 0x10000)
2686 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002687#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002688 /* 2 * (size + pairs + (byteorder == 0)) */
2689 if (size > PY_SSIZE_T_MAX ||
2690 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002691 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002692 nsize = size + pairs + (byteorder == 0);
2693 bytesize = nsize * 2;
2694 if (bytesize / 2 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002695 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002696 v = PyString_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002697 if (v == NULL)
2698 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002699
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002700 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002701 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002702 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002703 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002704 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002705
2706 if (byteorder == -1) {
2707 /* force LE */
2708 ihi = 1;
2709 ilo = 0;
2710 }
2711 else if (byteorder == 1) {
2712 /* force BE */
2713 ihi = 0;
2714 ilo = 1;
2715 }
2716
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002717 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002718 Py_UNICODE ch = *s++;
2719 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002720#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002721 if (ch >= 0x10000) {
2722 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2723 ch = 0xD800 | ((ch-0x10000) >> 10);
2724 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002725#endif
Tim Peters772747b2001-08-09 22:21:55 +00002726 STORECHAR(ch);
2727 if (ch2)
2728 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002729 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002730 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002731#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002732}
2733
2734PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2735{
2736 if (!PyUnicode_Check(unicode)) {
2737 PyErr_BadArgument();
2738 return NULL;
2739 }
2740 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002741 PyUnicode_GET_SIZE(unicode),
2742 NULL,
2743 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002744}
2745
2746/* --- Unicode Escape Codec ----------------------------------------------- */
2747
Fredrik Lundh06d12682001-01-24 07:59:11 +00002748static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002749
Guido van Rossumd57fd912000-03-10 22:53:23 +00002750PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002751 Py_ssize_t size,
2752 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002753{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002754 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002755 Py_ssize_t startinpos;
2756 Py_ssize_t endinpos;
2757 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002758 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002759 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002760 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002761 char* message;
2762 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002763 PyObject *errorHandler = NULL;
2764 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002765
Guido van Rossumd57fd912000-03-10 22:53:23 +00002766 /* Escaped strings will always be longer than the resulting
2767 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002768 length after conversion to the true value.
2769 (but if the error callback returns a long replacement string
2770 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002771 v = _PyUnicode_New(size);
2772 if (v == NULL)
2773 goto onError;
2774 if (size == 0)
2775 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002776
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002777 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002778 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002779
Guido van Rossumd57fd912000-03-10 22:53:23 +00002780 while (s < end) {
2781 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002782 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002783 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002784
2785 /* Non-escape characters are interpreted as Unicode ordinals */
2786 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002787 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002788 continue;
2789 }
2790
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002791 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002792 /* \ - Escapes */
2793 s++;
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002794 c = *s++;
2795 if (s > end)
2796 c = '\0'; /* Invalid after \ */
2797 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002798
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002799 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002800 case '\n': break;
2801 case '\\': *p++ = '\\'; break;
2802 case '\'': *p++ = '\''; break;
2803 case '\"': *p++ = '\"'; break;
2804 case 'b': *p++ = '\b'; break;
2805 case 'f': *p++ = '\014'; break; /* FF */
2806 case 't': *p++ = '\t'; break;
2807 case 'n': *p++ = '\n'; break;
2808 case 'r': *p++ = '\r'; break;
2809 case 'v': *p++ = '\013'; break; /* VT */
2810 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2811
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002812 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002813 case '0': case '1': case '2': case '3':
2814 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002815 x = s[-1] - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002816 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002817 x = (x<<3) + *s++ - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002818 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002819 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002820 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002821 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002822 break;
2823
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002824 /* hex escapes */
2825 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002826 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002827 digits = 2;
2828 message = "truncated \\xXX escape";
2829 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002830
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002831 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002832 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002833 digits = 4;
2834 message = "truncated \\uXXXX escape";
2835 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002836
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002837 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002838 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002839 digits = 8;
2840 message = "truncated \\UXXXXXXXX escape";
2841 hexescape:
2842 chr = 0;
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002843 if (end - s < digits) {
2844 /* count only hex digits */
2845 for (; s < end; ++s) {
2846 c = (unsigned char)*s;
2847 if (!Py_ISXDIGIT(c))
2848 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002849 }
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002850 goto error;
2851 }
2852 for (; digits--; ++s) {
2853 c = (unsigned char)*s;
2854 if (!Py_ISXDIGIT(c))
2855 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002856 chr = (chr<<4) & ~0xF;
2857 if (c >= '0' && c <= '9')
2858 chr += c - '0';
2859 else if (c >= 'a' && c <= 'f')
2860 chr += 10 + c - 'a';
2861 else
2862 chr += 10 + c - 'A';
2863 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002864 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002865 /* _decoding_error will have already written into the
2866 target buffer. */
2867 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002868 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002869 /* when we get here, chr is a 32-bit unicode character */
2870 if (chr <= 0xffff)
2871 /* UCS-2 character */
2872 *p++ = (Py_UNICODE) chr;
2873 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002874 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002875 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002876#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002877 *p++ = chr;
2878#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002879 chr -= 0x10000L;
2880 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002881 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002882#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002883 } else {
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002884 message = "illegal Unicode character";
2885 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002886 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002887 break;
2888
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002889 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00002890 case 'N':
2891 message = "malformed \\N character escape";
2892 if (ucnhash_CAPI == NULL) {
2893 /* load the unicode data module */
Larry Hastings402b73f2010-03-25 00:54:54 +00002894 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002895 if (ucnhash_CAPI == NULL)
2896 goto ucnhashError;
2897 }
2898 if (*s == '{') {
2899 const char *start = s+1;
2900 /* look for the closing brace */
2901 while (*s != '}' && s < end)
2902 s++;
2903 if (s > start && s < end && *s == '}') {
2904 /* found a name. look it up in the unicode database */
2905 message = "unknown Unicode character name";
2906 s++;
Serhiy Storchaka1d3acd42013-01-21 11:48:24 +02002907 if (s - start - 1 <= INT_MAX &&
2908 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002909 goto store;
2910 }
2911 }
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002912 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002913
2914 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002915 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002916 message = "\\ at end of string";
2917 s--;
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002918 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00002919 }
2920 else {
2921 *p++ = '\\';
2922 *p++ = (unsigned char)s[-1];
2923 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002924 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002925 }
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002926 continue;
2927
2928 error:
2929 endinpos = s-starts;
2930 outpos = p-PyUnicode_AS_UNICODE(v);
2931 if (unicode_decode_call_errorhandler(
2932 errors, &errorHandler,
2933 "unicodeescape", message,
2934 starts, size, &startinpos, &endinpos, &exc, &s,
2935 &v, &outpos, &p))
2936 goto onError;
2937 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002938 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002939 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002940 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002941 Py_XDECREF(errorHandler);
2942 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002943 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002944
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002945 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002946 PyErr_SetString(
2947 PyExc_UnicodeError,
2948 "\\N escapes not supported (can't load unicodedata module)"
2949 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002950 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002951 Py_XDECREF(errorHandler);
2952 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002953 return NULL;
2954
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002955 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002956 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002957 Py_XDECREF(errorHandler);
2958 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002959 return NULL;
2960}
2961
2962/* Return a Unicode-Escape string version of the Unicode object.
2963
2964 If quotes is true, the string is enclosed in u"" or u'' quotes as
2965 appropriate.
2966
2967*/
2968
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00002969Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002970 Py_ssize_t size,
2971 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00002972{
2973 /* like wcschr, but doesn't stop at NULL characters */
2974
2975 while (size-- > 0) {
2976 if (*s == ch)
2977 return s;
2978 s++;
2979 }
2980
2981 return NULL;
2982}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002983
Guido van Rossumd57fd912000-03-10 22:53:23 +00002984static
2985PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002986 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002987 int quotes)
2988{
2989 PyObject *repr;
2990 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002991
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002992 static const char *hexdigit = "0123456789abcdef";
Neal Norwitze7d8be82008-07-31 17:17:14 +00002993#ifdef Py_UNICODE_WIDE
2994 const Py_ssize_t expandsize = 10;
2995#else
2996 const Py_ssize_t expandsize = 6;
2997#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002998
Neal Norwitz17753ec2006-08-21 22:21:19 +00002999 /* XXX(nnorwitz): rather than over-allocating, it would be
3000 better to choose a different scheme. Perhaps scan the
3001 first N-chars of the string and allocate based on that size.
3002 */
3003 /* Initial allocation is based on the longest-possible unichr
3004 escape.
3005
3006 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3007 unichr, so in this case it's the longest unichr escape. In
3008 narrow (UTF-16) builds this is five chars per source unichr
3009 since there are two unichrs in the surrogate pair, so in narrow
3010 (UTF-16) builds it's not the longest unichr escape.
3011
3012 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3013 so in the narrow (UTF-16) build case it's the longest unichr
3014 escape.
3015 */
3016
Neal Norwitze7d8be82008-07-31 17:17:14 +00003017 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003018 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00003019
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003020 repr = PyString_FromStringAndSize(NULL,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003021 2
3022 + expandsize*size
3023 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003024 if (repr == NULL)
3025 return NULL;
3026
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003027 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003028
3029 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003030 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00003031 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00003032 !findchar(s, size, '"')) ? '"' : '\'';
3033 }
3034 while (size-- > 0) {
3035 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003036
Hye-Shik Chang835b2432005-12-17 04:38:31 +00003037 /* Escape quotes and backslashes */
3038 if ((quotes &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003039 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003040 *p++ = '\\';
3041 *p++ = (char) ch;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003042 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003043 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003044
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003045#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003046 /* Map 21-bit characters to '\U00xxxxxx' */
3047 else if (ch >= 0x10000) {
3048 *p++ = '\\';
3049 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003050 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
3051 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
3052 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
3053 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
3054 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
3055 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
3056 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003057 *p++ = hexdigit[ch & 0x0000000F];
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003058 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003059 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003060#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003061 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3062 else if (ch >= 0xD800 && ch < 0xDC00) {
3063 Py_UNICODE ch2;
3064 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003065
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003066 ch2 = *s++;
3067 size--;
Georg Brandle27d0442010-08-01 20:54:30 +00003068 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003069 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3070 *p++ = '\\';
3071 *p++ = 'U';
3072 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
3073 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
3074 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
3075 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
3076 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
3077 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
3078 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
3079 *p++ = hexdigit[ucs & 0x0000000F];
3080 continue;
3081 }
3082 /* Fall through: isolated surrogates are copied as-is */
3083 s--;
3084 size++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003085 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003086#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003087
Guido van Rossumd57fd912000-03-10 22:53:23 +00003088 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003089 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003090 *p++ = '\\';
3091 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003092 *p++ = hexdigit[(ch >> 12) & 0x000F];
3093 *p++ = hexdigit[(ch >> 8) & 0x000F];
3094 *p++ = hexdigit[(ch >> 4) & 0x000F];
3095 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003096 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003097
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003098 /* Map special whitespace to '\t', \n', '\r' */
3099 else if (ch == '\t') {
3100 *p++ = '\\';
3101 *p++ = 't';
3102 }
3103 else if (ch == '\n') {
3104 *p++ = '\\';
3105 *p++ = 'n';
3106 }
3107 else if (ch == '\r') {
3108 *p++ = '\\';
3109 *p++ = 'r';
3110 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003111
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003112 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003113 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003114 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003115 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003116 *p++ = hexdigit[(ch >> 4) & 0x000F];
3117 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003118 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003119
Guido van Rossumd57fd912000-03-10 22:53:23 +00003120 /* Copy everything else as-is */
3121 else
3122 *p++ = (char) ch;
3123 }
3124 if (quotes)
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003125 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003126
3127 *p = '\0';
Benjamin Petersonbea424a2010-04-03 00:57:33 +00003128 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
3129 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003130 return repr;
3131}
3132
3133PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003134 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003135{
3136 return unicodeescape_string(s, size, 0);
3137}
3138
3139PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3140{
3141 if (!PyUnicode_Check(unicode)) {
3142 PyErr_BadArgument();
3143 return NULL;
3144 }
3145 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003146 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003147}
3148
3149/* --- Raw Unicode Escape Codec ------------------------------------------- */
3150
3151PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003152 Py_ssize_t size,
3153 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003154{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003155 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003156 Py_ssize_t startinpos;
3157 Py_ssize_t endinpos;
3158 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003159 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003160 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003161 const char *end;
3162 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003163 PyObject *errorHandler = NULL;
3164 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003165
Guido van Rossumd57fd912000-03-10 22:53:23 +00003166 /* Escaped strings will always be longer than the resulting
3167 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003168 length after conversion to the true value. (But decoding error
3169 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003170 v = _PyUnicode_New(size);
3171 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003172 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003173 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003174 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003175 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003176 end = s + size;
3177 while (s < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003178 unsigned char c;
3179 Py_UCS4 x;
3180 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003181 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003182
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003183 /* Non-escape characters are interpreted as Unicode ordinals */
3184 if (*s != '\\') {
3185 *p++ = (unsigned char)*s++;
3186 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003187 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003188 startinpos = s-starts;
3189
3190 /* \u-escapes are only interpreted iff the number of leading
3191 backslashes if odd */
3192 bs = s;
3193 for (;s < end;) {
3194 if (*s != '\\')
3195 break;
3196 *p++ = (unsigned char)*s++;
3197 }
3198 if (((s - bs) & 1) == 0 ||
3199 s >= end ||
3200 (*s != 'u' && *s != 'U')) {
3201 continue;
3202 }
3203 p--;
3204 count = *s=='u' ? 4 : 8;
3205 s++;
3206
3207 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3208 outpos = p-PyUnicode_AS_UNICODE(v);
3209 for (x = 0, i = 0; i < count; ++i, ++s) {
3210 c = (unsigned char)*s;
3211 if (!isxdigit(c)) {
3212 endinpos = s-starts;
3213 if (unicode_decode_call_errorhandler(
3214 errors, &errorHandler,
3215 "rawunicodeescape", "truncated \\uXXXX",
3216 starts, size, &startinpos, &endinpos, &exc, &s,
3217 &v, &outpos, &p))
3218 goto onError;
3219 goto nextByte;
3220 }
3221 x = (x<<4) & ~0xF;
3222 if (c >= '0' && c <= '9')
3223 x += c - '0';
3224 else if (c >= 'a' && c <= 'f')
3225 x += 10 + c - 'a';
3226 else
3227 x += 10 + c - 'A';
3228 }
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003229 if (x <= 0xffff)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003230 /* UCS-2 character */
3231 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003232 else if (x <= 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003233 /* UCS-4 character. Either store directly, or as
3234 surrogate pair. */
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003235#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003236 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003237#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003238 x -= 0x10000L;
3239 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3240 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003241#endif
3242 } else {
3243 endinpos = s-starts;
3244 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003245 if (unicode_decode_call_errorhandler(
3246 errors, &errorHandler,
3247 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003248 starts, size, &startinpos, &endinpos, &exc, &s,
3249 &v, &outpos, &p))
3250 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003251 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003252 nextByte:
3253 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003254 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00003255 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003256 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003257 Py_XDECREF(errorHandler);
3258 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003259 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003260
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003261 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003262 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003263 Py_XDECREF(errorHandler);
3264 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003265 return NULL;
3266}
3267
3268PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003269 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003270{
3271 PyObject *repr;
3272 char *p;
3273 char *q;
3274
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003275 static const char *hexdigit = "0123456789abcdef";
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003276#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00003277 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003278#else
Neal Norwitze7d8be82008-07-31 17:17:14 +00003279 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003280#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00003281
Neal Norwitze7d8be82008-07-31 17:17:14 +00003282 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003283 return PyErr_NoMemory();
Benjamin Peterson857ce152009-01-31 16:29:18 +00003284
Neal Norwitze7d8be82008-07-31 17:17:14 +00003285 repr = PyString_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003286 if (repr == NULL)
3287 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003288 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003289 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003290
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003291 p = q = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003292 while (size-- > 0) {
3293 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003294#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003295 /* Map 32-bit characters to '\Uxxxxxxxx' */
3296 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003297 *p++ = '\\';
3298 *p++ = 'U';
3299 *p++ = hexdigit[(ch >> 28) & 0xf];
3300 *p++ = hexdigit[(ch >> 24) & 0xf];
3301 *p++ = hexdigit[(ch >> 20) & 0xf];
3302 *p++ = hexdigit[(ch >> 16) & 0xf];
3303 *p++ = hexdigit[(ch >> 12) & 0xf];
3304 *p++ = hexdigit[(ch >> 8) & 0xf];
3305 *p++ = hexdigit[(ch >> 4) & 0xf];
3306 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003307 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003308 else
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003309#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003310 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3311 if (ch >= 0xD800 && ch < 0xDC00) {
3312 Py_UNICODE ch2;
3313 Py_UCS4 ucs;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003314
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003315 ch2 = *s++;
3316 size--;
Georg Brandle27d0442010-08-01 20:54:30 +00003317 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003318 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3319 *p++ = '\\';
3320 *p++ = 'U';
3321 *p++ = hexdigit[(ucs >> 28) & 0xf];
3322 *p++ = hexdigit[(ucs >> 24) & 0xf];
3323 *p++ = hexdigit[(ucs >> 20) & 0xf];
3324 *p++ = hexdigit[(ucs >> 16) & 0xf];
3325 *p++ = hexdigit[(ucs >> 12) & 0xf];
3326 *p++ = hexdigit[(ucs >> 8) & 0xf];
3327 *p++ = hexdigit[(ucs >> 4) & 0xf];
3328 *p++ = hexdigit[ucs & 0xf];
3329 continue;
3330 }
3331 /* Fall through: isolated surrogates are copied as-is */
3332 s--;
3333 size++;
3334 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003335#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003336 /* Map 16-bit characters to '\uxxxx' */
3337 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003338 *p++ = '\\';
3339 *p++ = 'u';
3340 *p++ = hexdigit[(ch >> 12) & 0xf];
3341 *p++ = hexdigit[(ch >> 8) & 0xf];
3342 *p++ = hexdigit[(ch >> 4) & 0xf];
3343 *p++ = hexdigit[ch & 15];
3344 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003345 /* Copy everything else as-is */
3346 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003347 *p++ = (char) ch;
3348 }
3349 *p = '\0';
Benjamin Petersonbea424a2010-04-03 00:57:33 +00003350 if (_PyString_Resize(&repr, p - q))
3351 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003352 return repr;
3353}
3354
3355PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3356{
3357 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003358 PyErr_BadArgument();
3359 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003360 }
3361 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003362 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003363}
3364
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003365/* --- Unicode Internal Codec ------------------------------------------- */
3366
3367PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003368 Py_ssize_t size,
3369 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003370{
3371 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003372 Py_ssize_t startinpos;
3373 Py_ssize_t endinpos;
3374 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003375 PyUnicodeObject *v;
3376 Py_UNICODE *p;
3377 const char *end;
3378 const char *reason;
3379 PyObject *errorHandler = NULL;
3380 PyObject *exc = NULL;
3381
Neal Norwitzd43069c2006-01-08 01:12:10 +00003382#ifdef Py_UNICODE_WIDE
3383 Py_UNICODE unimax = PyUnicode_GetMax();
3384#endif
3385
Armin Rigo7ccbca92006-10-04 12:17:45 +00003386 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003387 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3388 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003389 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003390 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003391 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003392 p = PyUnicode_AS_UNICODE(v);
3393 end = s + size;
3394
3395 while (s < end) {
Serhiy Storchakad5327d92013-02-07 16:23:11 +02003396 if (end-s < Py_UNICODE_SIZE) {
3397 endinpos = end-starts;
3398 reason = "truncated input";
3399 goto error;
3400 }
Neal Norwitz1004a532006-05-15 07:17:23 +00003401 memcpy(p, s, sizeof(Py_UNICODE));
Serhiy Storchakad5327d92013-02-07 16:23:11 +02003402#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003403 /* We have to sanity check the raw data, otherwise doom looms for
3404 some malformed UCS-4 data. */
Serhiy Storchakad5327d92013-02-07 16:23:11 +02003405 if (*p > unimax || *p < 0) {
3406 endinpos = s - starts + Py_UNICODE_SIZE;
3407 reason = "illegal code point (> 0x10FFFF)";
3408 goto error;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003409 }
Serhiy Storchakad5327d92013-02-07 16:23:11 +02003410#endif
3411 p++;
3412 s += Py_UNICODE_SIZE;
3413 continue;
3414
3415 error:
3416 startinpos = s - starts;
3417 outpos = p - PyUnicode_AS_UNICODE(v);
3418 if (unicode_decode_call_errorhandler(
3419 errors, &errorHandler,
3420 "unicode_internal", reason,
3421 starts, size, &startinpos, &endinpos, &exc, &s,
3422 &v, &outpos, &p)) {
3423 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003424 }
3425 }
3426
Martin v. Löwis412fb672006-04-13 06:34:32 +00003427 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003428 goto onError;
3429 Py_XDECREF(errorHandler);
3430 Py_XDECREF(exc);
3431 return (PyObject *)v;
3432
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003433 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003434 Py_XDECREF(v);
3435 Py_XDECREF(errorHandler);
3436 Py_XDECREF(exc);
3437 return NULL;
3438}
3439
Guido van Rossumd57fd912000-03-10 22:53:23 +00003440/* --- Latin-1 Codec ------------------------------------------------------ */
3441
3442PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003443 Py_ssize_t size,
3444 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003445{
3446 PyUnicodeObject *v;
3447 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003448
Guido van Rossumd57fd912000-03-10 22:53:23 +00003449 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003450 if (size == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003451 Py_UNICODE r = *(unsigned char*)s;
3452 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003453 }
3454
Guido van Rossumd57fd912000-03-10 22:53:23 +00003455 v = _PyUnicode_New(size);
3456 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003457 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003458 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003459 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003460 p = PyUnicode_AS_UNICODE(v);
3461 while (size-- > 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003462 *p++ = (unsigned char)*s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003463 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003464
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003465 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003466 Py_XDECREF(v);
3467 return NULL;
3468}
3469
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003470/* create or adjust a UnicodeEncodeError */
3471static void make_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003472 const char *encoding,
3473 const Py_UNICODE *unicode, Py_ssize_t size,
3474 Py_ssize_t startpos, Py_ssize_t endpos,
3475 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003476{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003477 if (*exceptionObject == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003478 *exceptionObject = PyUnicodeEncodeError_Create(
3479 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003480 }
3481 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003482 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3483 goto onError;
3484 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3485 goto onError;
3486 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3487 goto onError;
3488 return;
3489 onError:
3490 Py_DECREF(*exceptionObject);
3491 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003492 }
3493}
3494
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003495/* raises a UnicodeEncodeError */
3496static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003497 const char *encoding,
3498 const Py_UNICODE *unicode, Py_ssize_t size,
3499 Py_ssize_t startpos, Py_ssize_t endpos,
3500 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003501{
3502 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003503 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003504 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003505 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003506}
3507
3508/* error handling callback helper:
3509 build arguments, call the callback and check the arguments,
3510 put the result into newpos and return the replacement string, which
3511 has to be freed by the caller */
3512static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003513 PyObject **errorHandler,
3514 const char *encoding, const char *reason,
3515 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3516 Py_ssize_t startpos, Py_ssize_t endpos,
3517 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003518{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003519 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003520
3521 PyObject *restuple;
3522 PyObject *resunicode;
3523
3524 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003525 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003526 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003527 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003528 }
3529
3530 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003531 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003532 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003533 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003534
3535 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003536 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003537 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003538 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003539 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00003540 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003541 Py_DECREF(restuple);
3542 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003543 }
3544 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003545 &resunicode, newpos)) {
3546 Py_DECREF(restuple);
3547 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003548 }
3549 if (*newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003550 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003551 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003552 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3553 Py_DECREF(restuple);
3554 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003555 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003556 Py_INCREF(resunicode);
3557 Py_DECREF(restuple);
3558 return resunicode;
3559}
3560
3561static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003562 Py_ssize_t size,
3563 const char *errors,
3564 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003565{
3566 /* output object */
3567 PyObject *res;
3568 /* pointers to the beginning and end+1 of input */
3569 const Py_UNICODE *startp = p;
3570 const Py_UNICODE *endp = p + size;
3571 /* pointer to the beginning of the unencodable characters */
3572 /* const Py_UNICODE *badp = NULL; */
3573 /* pointer into the output */
3574 char *str;
3575 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003576 Py_ssize_t respos = 0;
3577 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00003578 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3579 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003580 PyObject *errorHandler = NULL;
3581 PyObject *exc = NULL;
3582 /* the following variable is used for caching string comparisons
3583 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3584 int known_errorHandler = -1;
3585
3586 /* allocate enough for a simple encoding without
3587 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003588 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003589 if (res == NULL)
3590 goto onError;
3591 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003592 return res;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003593 str = PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003594 ressize = size;
3595
3596 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003597 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003598
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003599 /* can we encode this? */
3600 if (c<limit) {
3601 /* no overflow check, because we know that the space is enough */
3602 *str++ = (char)c;
3603 ++p;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003604 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003605 else {
3606 Py_ssize_t unicodepos = p-startp;
3607 Py_ssize_t requiredsize;
3608 PyObject *repunicode;
3609 Py_ssize_t repsize;
3610 Py_ssize_t newpos;
3611 Py_ssize_t respos;
3612 Py_UNICODE *uni2;
3613 /* startpos for collecting unencodable chars */
3614 const Py_UNICODE *collstart = p;
3615 const Py_UNICODE *collend = p;
3616 /* find all unecodable characters */
3617 while ((collend < endp) && ((*collend)>=limit))
3618 ++collend;
3619 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3620 if (known_errorHandler==-1) {
3621 if ((errors==NULL) || (!strcmp(errors, "strict")))
3622 known_errorHandler = 1;
3623 else if (!strcmp(errors, "replace"))
3624 known_errorHandler = 2;
3625 else if (!strcmp(errors, "ignore"))
3626 known_errorHandler = 3;
3627 else if (!strcmp(errors, "xmlcharrefreplace"))
3628 known_errorHandler = 4;
3629 else
3630 known_errorHandler = 0;
3631 }
3632 switch (known_errorHandler) {
3633 case 1: /* strict */
3634 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3635 goto onError;
3636 case 2: /* replace */
3637 while (collstart++<collend)
3638 *str++ = '?'; /* fall through */
3639 case 3: /* ignore */
3640 p = collend;
3641 break;
3642 case 4: /* xmlcharrefreplace */
3643 respos = str-PyString_AS_STRING(res);
3644 /* determine replacement size (temporarily (mis)uses p) */
3645 for (p = collstart, repsize = 0; p < collend; ++p) {
3646 if (*p<10)
3647 repsize += 2+1+1;
3648 else if (*p<100)
3649 repsize += 2+2+1;
3650 else if (*p<1000)
3651 repsize += 2+3+1;
3652 else if (*p<10000)
3653 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003654#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003655 else
3656 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003657#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003658 else if (*p<100000)
3659 repsize += 2+5+1;
3660 else if (*p<1000000)
3661 repsize += 2+6+1;
3662 else
3663 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003664#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003665 }
3666 requiredsize = respos+repsize+(endp-collend);
3667 if (requiredsize > ressize) {
3668 if (requiredsize<2*ressize)
3669 requiredsize = 2*ressize;
3670 if (_PyString_Resize(&res, requiredsize))
3671 goto onError;
3672 str = PyString_AS_STRING(res) + respos;
3673 ressize = requiredsize;
3674 }
3675 /* generate replacement (temporarily (mis)uses p) */
3676 for (p = collstart; p < collend; ++p) {
3677 str += sprintf(str, "&#%d;", (int)*p);
3678 }
3679 p = collend;
3680 break;
3681 default:
3682 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3683 encoding, reason, startp, size, &exc,
3684 collstart-startp, collend-startp, &newpos);
3685 if (repunicode == NULL)
3686 goto onError;
3687 /* need more space? (at least enough for what we have+the
3688 replacement+the rest of the string, so we won't have to
3689 check space for encodable characters) */
3690 respos = str-PyString_AS_STRING(res);
3691 repsize = PyUnicode_GET_SIZE(repunicode);
3692 requiredsize = respos+repsize+(endp-collend);
3693 if (requiredsize > ressize) {
3694 if (requiredsize<2*ressize)
3695 requiredsize = 2*ressize;
3696 if (_PyString_Resize(&res, requiredsize)) {
3697 Py_DECREF(repunicode);
3698 goto onError;
3699 }
3700 str = PyString_AS_STRING(res) + respos;
3701 ressize = requiredsize;
3702 }
3703 /* check if there is anything unencodable in the replacement
3704 and copy it to the output */
3705 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3706 c = *uni2;
3707 if (c >= limit) {
3708 raise_encode_exception(&exc, encoding, startp, size,
3709 unicodepos, unicodepos+1, reason);
3710 Py_DECREF(repunicode);
3711 goto onError;
3712 }
3713 *str = (char)c;
3714 }
3715 p = startp + newpos;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003716 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00003717 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00003718 }
3719 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003720 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003721 respos = str-PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003722 if (respos<ressize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003723 /* If this falls res will be NULL */
3724 _PyString_Resize(&res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003725 Py_XDECREF(errorHandler);
3726 Py_XDECREF(exc);
3727 return res;
3728
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003729 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003730 Py_XDECREF(res);
3731 Py_XDECREF(errorHandler);
3732 Py_XDECREF(exc);
3733 return NULL;
3734}
3735
Guido van Rossumd57fd912000-03-10 22:53:23 +00003736PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003737 Py_ssize_t size,
3738 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003739{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003740 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003741}
3742
3743PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3744{
3745 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003746 PyErr_BadArgument();
3747 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003748 }
3749 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003750 PyUnicode_GET_SIZE(unicode),
3751 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003752}
3753
3754/* --- 7-bit ASCII Codec -------------------------------------------------- */
3755
Guido van Rossumd57fd912000-03-10 22:53:23 +00003756PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003757 Py_ssize_t size,
3758 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003759{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003760 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003761 PyUnicodeObject *v;
3762 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003763 Py_ssize_t startinpos;
3764 Py_ssize_t endinpos;
3765 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003766 const char *e;
3767 PyObject *errorHandler = NULL;
3768 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003769
Guido van Rossumd57fd912000-03-10 22:53:23 +00003770 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003771 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003772 Py_UNICODE r = *(unsigned char*)s;
3773 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003774 }
Tim Petersced69f82003-09-16 20:30:58 +00003775
Guido van Rossumd57fd912000-03-10 22:53:23 +00003776 v = _PyUnicode_New(size);
3777 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003778 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003779 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003780 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003781 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003782 e = s + size;
3783 while (s < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003784 register unsigned char c = (unsigned char)*s;
3785 if (c < 128) {
3786 *p++ = c;
3787 ++s;
3788 }
3789 else {
3790 startinpos = s-starts;
3791 endinpos = startinpos + 1;
3792 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3793 if (unicode_decode_call_errorhandler(
3794 errors, &errorHandler,
3795 "ascii", "ordinal not in range(128)",
3796 starts, size, &startinpos, &endinpos, &exc, &s,
3797 &v, &outpos, &p))
3798 goto onError;
3799 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003800 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003801 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003802 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3803 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003804 Py_XDECREF(errorHandler);
3805 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003806 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003807
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003808 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003809 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003810 Py_XDECREF(errorHandler);
3811 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003812 return NULL;
3813}
3814
Guido van Rossumd57fd912000-03-10 22:53:23 +00003815PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003816 Py_ssize_t size,
3817 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003818{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003819 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003820}
3821
3822PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3823{
3824 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003825 PyErr_BadArgument();
3826 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003827 }
3828 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003829 PyUnicode_GET_SIZE(unicode),
3830 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003831}
3832
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003833#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003834
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003835/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003836
Hirokazu Yamamoto52a34922009-03-21 10:32:52 +00003837#if SIZEOF_INT < SIZEOF_SIZE_T
Martin v. Löwisd8251432006-06-14 05:21:04 +00003838#define NEED_RETRY
3839#endif
3840
3841/* XXX This code is limited to "true" double-byte encodings, as
3842 a) it assumes an incomplete character consists of a single byte, and
3843 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003844 encodings, see IsDBCSLeadByteEx documentation. */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003845
/* Decide whether the byte at s[offset] should be treated as a DBCS lead
   byte.

   Returns 0 when IsDBCSLeadByte() rejects the byte.  Otherwise returns 1
   when CharPrev() finds no earlier character, when the previous character
   does not itself start with a lead byte, or when the previous character
   is exactly two bytes long; in the remaining case the result is 0. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3856
3857/*
3858 * Decode MBCS string into unicode object. If 'final' is set, converts
3859 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3860 */
3861static int decode_mbcs(PyUnicodeObject **v,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003862 const char *s, /* MBCS string */
3863 int size, /* sizeof MBCS string */
3864 int final)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003865{
3866 Py_UNICODE *p;
3867 Py_ssize_t n = 0;
3868 int usize = 0;
3869
3870 assert(size >= 0);
3871
3872 /* Skip trailing lead-byte unless 'final' is set */
3873 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003874 --size;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003875
3876 /* First get the size of the result */
3877 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003878 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3879 if (usize == 0) {
3880 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3881 return -1;
3882 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003883 }
3884
3885 if (*v == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003886 /* Create unicode object */
3887 *v = _PyUnicode_New(usize);
3888 if (*v == NULL)
3889 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003890 }
3891 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003892 /* Extend unicode object */
3893 n = PyUnicode_GET_SIZE(*v);
3894 if (_PyUnicode_Resize(v, n + usize) < 0)
3895 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003896 }
3897
3898 /* Do the conversion */
3899 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003900 p = PyUnicode_AS_UNICODE(*v) + n;
3901 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3902 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3903 return -1;
3904 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003905 }
3906
3907 return size;
3908}
3909
3910PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003911 Py_ssize_t size,
3912 const char *errors,
3913 Py_ssize_t *consumed)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003914{
3915 PyUnicodeObject *v = NULL;
3916 int done;
3917
3918 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003919 *consumed = 0;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003920
3921#ifdef NEED_RETRY
3922 retry:
3923 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003924 done = decode_mbcs(&v, s, INT_MAX, 0);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003925 else
3926#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003927 done = decode_mbcs(&v, s, (int)size, !consumed);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003928
3929 if (done < 0) {
3930 Py_XDECREF(v);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003931 return NULL;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003932 }
3933
3934 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003935 *consumed += done;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003936
3937#ifdef NEED_RETRY
3938 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003939 s += done;
3940 size -= done;
3941 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003942 }
3943#endif
3944
3945 return (PyObject *)v;
3946}
3947
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003948PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003949 Py_ssize_t size,
3950 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003951{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003952 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3953}
3954
3955/*
3956 * Convert unicode into string object (MBCS).
3957 * Returns 0 if succeed, -1 otherwise.
3958 */
3959static int encode_mbcs(PyObject **repr,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003960 const Py_UNICODE *p, /* unicode */
3961 int size) /* size of unicode */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003962{
3963 int mbcssize = 0;
3964 Py_ssize_t n = 0;
3965
3966 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003967
3968 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003969 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003970 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3971 if (mbcssize == 0) {
3972 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3973 return -1;
3974 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003975 }
3976
Martin v. Löwisd8251432006-06-14 05:21:04 +00003977 if (*repr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003978 /* Create string object */
3979 *repr = PyString_FromStringAndSize(NULL, mbcssize);
3980 if (*repr == NULL)
3981 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003982 }
3983 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003984 /* Extend string object */
3985 n = PyString_Size(*repr);
3986 if (_PyString_Resize(repr, n + mbcssize) < 0)
3987 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003988 }
3989
3990 /* Do the conversion */
3991 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003992 char *s = PyString_AS_STRING(*repr) + n;
3993 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3994 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3995 return -1;
3996 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003997 }
3998
3999 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004000}
4001
4002PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004003 Py_ssize_t size,
4004 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004005{
Martin v. Löwisd8251432006-06-14 05:21:04 +00004006 PyObject *repr = NULL;
4007 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004008
Martin v. Löwisd8251432006-06-14 05:21:04 +00004009#ifdef NEED_RETRY
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004010 retry:
Martin v. Löwisd8251432006-06-14 05:21:04 +00004011 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004012 ret = encode_mbcs(&repr, p, INT_MAX);
Martin v. Löwisd8251432006-06-14 05:21:04 +00004013 else
4014#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004015 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004016
Martin v. Löwisd8251432006-06-14 05:21:04 +00004017 if (ret < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004018 Py_XDECREF(repr);
4019 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004020 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00004021
4022#ifdef NEED_RETRY
4023 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004024 p += INT_MAX;
4025 size -= INT_MAX;
4026 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004027 }
4028#endif
4029
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004030 return repr;
4031}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004032
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004033PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4034{
4035 if (!PyUnicode_Check(unicode)) {
4036 PyErr_BadArgument();
4037 return NULL;
4038 }
4039 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004040 PyUnicode_GET_SIZE(unicode),
4041 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004042}
4043
Martin v. Löwisd8251432006-06-14 05:21:04 +00004044#undef NEED_RETRY
4045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004046#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004047
Guido van Rossumd57fd912000-03-10 22:53:23 +00004048/* --- Character Mapping Codec -------------------------------------------- */
4049
Guido van Rossumd57fd912000-03-10 22:53:23 +00004050PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004051 Py_ssize_t size,
4052 PyObject *mapping,
4053 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004054{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004055 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004056 Py_ssize_t startinpos;
4057 Py_ssize_t endinpos;
4058 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004059 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004060 PyUnicodeObject *v;
4061 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004062 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004063 PyObject *errorHandler = NULL;
4064 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004065 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004066 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004067
Guido van Rossumd57fd912000-03-10 22:53:23 +00004068 /* Default to Latin-1 */
4069 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004070 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004071
4072 v = _PyUnicode_New(size);
4073 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004074 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004075 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004076 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004077 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004078 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004079 if (PyUnicode_CheckExact(mapping)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004080 mapstring = PyUnicode_AS_UNICODE(mapping);
4081 maplen = PyUnicode_GET_SIZE(mapping);
4082 while (s < e) {
4083 unsigned char ch = *s;
4084 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004085
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004086 if (ch < maplen)
4087 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004088
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004089 if (x == 0xfffe) {
4090 /* undefined mapping */
4091 outpos = p-PyUnicode_AS_UNICODE(v);
4092 startinpos = s-starts;
4093 endinpos = startinpos+1;
4094 if (unicode_decode_call_errorhandler(
4095 errors, &errorHandler,
4096 "charmap", "character maps to <undefined>",
4097 starts, size, &startinpos, &endinpos, &exc, &s,
4098 &v, &outpos, &p)) {
4099 goto onError;
4100 }
4101 continue;
4102 }
4103 *p++ = x;
4104 ++s;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004105 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004106 }
4107 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004108 while (s < e) {
4109 unsigned char ch = *s;
4110 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004111
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004112 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4113 w = PyInt_FromLong((long)ch);
4114 if (w == NULL)
4115 goto onError;
4116 x = PyObject_GetItem(mapping, w);
4117 Py_DECREF(w);
4118 if (x == NULL) {
4119 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4120 /* No mapping found means: mapping is undefined. */
4121 PyErr_Clear();
Serhiy Storchaka95997452013-01-15 14:42:59 +02004122 goto Undefined;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004123 } else
4124 goto onError;
4125 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004126
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004127 /* Apply mapping */
Serhiy Storchaka95997452013-01-15 14:42:59 +02004128 if (x == Py_None)
4129 goto Undefined;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004130 if (PyInt_Check(x)) {
4131 long value = PyInt_AS_LONG(x);
Serhiy Storchaka95997452013-01-15 14:42:59 +02004132 if (value == 0xFFFE)
4133 goto Undefined;
Antoine Pitroue3ae3212012-11-17 21:14:58 +01004134 if (value < 0 || value > 0x10FFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004135 PyErr_SetString(PyExc_TypeError,
Antoine Pitroue3ae3212012-11-17 21:14:58 +01004136 "character mapping must be in range(0x110000)");
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004137 Py_DECREF(x);
4138 goto onError;
4139 }
Antoine Pitroue3ae3212012-11-17 21:14:58 +01004140
4141#ifndef Py_UNICODE_WIDE
4142 if (value > 0xFFFF) {
4143 /* see the code for 1-n mapping below */
4144 if (extrachars < 2) {
4145 /* resize first */
4146 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4147 Py_ssize_t needed = 10 - extrachars;
4148 extrachars += needed;
4149 /* XXX overflow detection missing */
4150 if (_PyUnicode_Resize(&v,
4151 PyUnicode_GET_SIZE(v) + needed) < 0) {
4152 Py_DECREF(x);
4153 goto onError;
4154 }
4155 p = PyUnicode_AS_UNICODE(v) + oldpos;
4156 }
4157 value -= 0x10000;
4158 *p++ = 0xD800 | (value >> 10);
4159 *p++ = 0xDC00 | (value & 0x3FF);
4160 extrachars -= 2;
4161 }
4162 else
4163#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004164 *p++ = (Py_UNICODE)value;
4165 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004166 else if (PyUnicode_Check(x)) {
4167 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004168
Serhiy Storchaka95997452013-01-15 14:42:59 +02004169 if (targetsize == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004170 /* 1-1 mapping */
Serhiy Storchaka95997452013-01-15 14:42:59 +02004171 Py_UNICODE value = *PyUnicode_AS_UNICODE(x);
4172 if (value == 0xFFFE)
4173 goto Undefined;
4174 *p++ = value;
4175 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004176 else if (targetsize > 1) {
4177 /* 1-n mapping */
4178 if (targetsize > extrachars) {
4179 /* resize first */
4180 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4181 Py_ssize_t needed = (targetsize - extrachars) + \
4182 (targetsize << 2);
4183 extrachars += needed;
4184 /* XXX overflow detection missing */
4185 if (_PyUnicode_Resize(&v,
4186 PyUnicode_GET_SIZE(v) + needed) < 0) {
4187 Py_DECREF(x);
4188 goto onError;
4189 }
4190 p = PyUnicode_AS_UNICODE(v) + oldpos;
4191 }
4192 Py_UNICODE_COPY(p,
4193 PyUnicode_AS_UNICODE(x),
4194 targetsize);
4195 p += targetsize;
4196 extrachars -= targetsize;
4197 }
4198 /* 1-0 mapping: skip the character */
4199 }
4200 else {
4201 /* wrong return value */
4202 PyErr_SetString(PyExc_TypeError,
4203 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004204 Py_DECREF(x);
4205 goto onError;
4206 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004207 Py_DECREF(x);
4208 ++s;
Serhiy Storchaka95997452013-01-15 14:42:59 +02004209 continue;
4210Undefined:
4211 /* undefined mapping */
4212 Py_XDECREF(x);
4213 outpos = p-PyUnicode_AS_UNICODE(v);
4214 startinpos = s-starts;
4215 endinpos = startinpos+1;
4216 if (unicode_decode_call_errorhandler(
4217 errors, &errorHandler,
4218 "charmap", "character maps to <undefined>",
4219 starts, size, &startinpos, &endinpos, &exc, &s,
4220 &v, &outpos, &p)) {
4221 goto onError;
4222 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004223 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004224 }
4225 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004226 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4227 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004228 Py_XDECREF(errorHandler);
4229 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004230 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004231
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004232 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004233 Py_XDECREF(errorHandler);
4234 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004235 Py_XDECREF(v);
4236 return NULL;
4237}
4238
Martin v. Löwis3f767792006-06-04 19:36:28 +00004239/* Charmap encoding: the lookup table */
4240
4241struct encoding_map{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004242 PyObject_HEAD
4243 unsigned char level1[32];
4244 int count2, count3;
4245 unsigned char level23[1];
Martin v. Löwis3f767792006-06-04 19:36:28 +00004246};
4247
4248static PyObject*
4249encoding_map_size(PyObject *obj, PyObject* args)
4250{
4251 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004252 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Martin v. Löwis3f767792006-06-04 19:36:28 +00004253 128*map->count3);
4254}
4255
4256static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004257 {"size", encoding_map_size, METH_NOARGS,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004258 PyDoc_STR("Return the size (in bytes) of this object") },
4259 { 0 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004260};
4261
4262static void
4263encoding_map_dealloc(PyObject* o)
4264{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004265 PyObject_FREE(o);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004266}
4267
4268static PyTypeObject EncodingMapType = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004269 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004270 "EncodingMap", /*tp_name*/
4271 sizeof(struct encoding_map), /*tp_basicsize*/
4272 0, /*tp_itemsize*/
4273 /* methods */
4274 encoding_map_dealloc, /*tp_dealloc*/
4275 0, /*tp_print*/
4276 0, /*tp_getattr*/
4277 0, /*tp_setattr*/
4278 0, /*tp_compare*/
4279 0, /*tp_repr*/
4280 0, /*tp_as_number*/
4281 0, /*tp_as_sequence*/
4282 0, /*tp_as_mapping*/
4283 0, /*tp_hash*/
4284 0, /*tp_call*/
4285 0, /*tp_str*/
4286 0, /*tp_getattro*/
4287 0, /*tp_setattro*/
4288 0, /*tp_as_buffer*/
4289 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4290 0, /*tp_doc*/
4291 0, /*tp_traverse*/
4292 0, /*tp_clear*/
4293 0, /*tp_richcompare*/
4294 0, /*tp_weaklistoffset*/
4295 0, /*tp_iter*/
4296 0, /*tp_iternext*/
4297 encoding_map_methods, /*tp_methods*/
4298 0, /*tp_members*/
4299 0, /*tp_getset*/
4300 0, /*tp_base*/
4301 0, /*tp_dict*/
4302 0, /*tp_descr_get*/
4303 0, /*tp_descr_set*/
4304 0, /*tp_dictoffset*/
4305 0, /*tp_init*/
4306 0, /*tp_alloc*/
4307 0, /*tp_new*/
4308 0, /*tp_free*/
4309 0, /*tp_is_gc*/
Martin v. Löwis3f767792006-06-04 19:36:28 +00004310};
4311
4312PyObject*
4313PyUnicode_BuildEncodingMap(PyObject* string)
4314{
4315 Py_UNICODE *decode;
4316 PyObject *result;
4317 struct encoding_map *mresult;
4318 int i;
4319 int need_dict = 0;
4320 unsigned char level1[32];
4321 unsigned char level2[512];
4322 unsigned char *mlevel1, *mlevel2, *mlevel3;
4323 int count2 = 0, count3 = 0;
4324
4325 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4326 PyErr_BadArgument();
4327 return NULL;
4328 }
4329 decode = PyUnicode_AS_UNICODE(string);
4330 memset(level1, 0xFF, sizeof level1);
4331 memset(level2, 0xFF, sizeof level2);
4332
4333 /* If there isn't a one-to-one mapping of NULL to \0,
4334 or if there are non-BMP characters, we need to use
4335 a mapping dictionary. */
4336 if (decode[0] != 0)
4337 need_dict = 1;
4338 for (i = 1; i < 256; i++) {
4339 int l1, l2;
4340 if (decode[i] == 0
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004341#ifdef Py_UNICODE_WIDE
Martin v. Löwis3f767792006-06-04 19:36:28 +00004342 || decode[i] > 0xFFFF
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004343#endif
4344 ) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004345 need_dict = 1;
4346 break;
4347 }
4348 if (decode[i] == 0xFFFE)
4349 /* unmapped character */
4350 continue;
4351 l1 = decode[i] >> 11;
4352 l2 = decode[i] >> 7;
4353 if (level1[l1] == 0xFF)
4354 level1[l1] = count2++;
4355 if (level2[l2] == 0xFF)
Benjamin Peterson857ce152009-01-31 16:29:18 +00004356 level2[l2] = count3++;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004357 }
4358
4359 if (count2 >= 0xFF || count3 >= 0xFF)
4360 need_dict = 1;
4361
4362 if (need_dict) {
4363 PyObject *result = PyDict_New();
4364 PyObject *key, *value;
4365 if (!result)
4366 return NULL;
4367 for (i = 0; i < 256; i++) {
Brett Cannona7f13ee2010-05-04 01:16:51 +00004368 value = NULL;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004369 key = PyInt_FromLong(decode[i]);
4370 value = PyInt_FromLong(i);
4371 if (!key || !value)
4372 goto failed1;
4373 if (PyDict_SetItem(result, key, value) == -1)
4374 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00004375 Py_DECREF(key);
4376 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004377 }
4378 return result;
4379 failed1:
4380 Py_XDECREF(key);
4381 Py_XDECREF(value);
4382 Py_DECREF(result);
4383 return NULL;
4384 }
4385
4386 /* Create a three-level trie */
4387 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4388 16*count2 + 128*count3 - 1);
4389 if (!result)
4390 return PyErr_NoMemory();
4391 PyObject_Init(result, &EncodingMapType);
4392 mresult = (struct encoding_map*)result;
4393 mresult->count2 = count2;
4394 mresult->count3 = count3;
4395 mlevel1 = mresult->level1;
4396 mlevel2 = mresult->level23;
4397 mlevel3 = mresult->level23 + 16*count2;
4398 memcpy(mlevel1, level1, 32);
4399 memset(mlevel2, 0xFF, 16*count2);
4400 memset(mlevel3, 0, 128*count3);
4401 count3 = 0;
4402 for (i = 1; i < 256; i++) {
4403 int o1, o2, o3, i2, i3;
4404 if (decode[i] == 0xFFFE)
4405 /* unmapped character */
4406 continue;
4407 o1 = decode[i]>>11;
4408 o2 = (decode[i]>>7) & 0xF;
4409 i2 = 16*mlevel1[o1] + o2;
4410 if (mlevel2[i2] == 0xFF)
4411 mlevel2[i2] = count3++;
4412 o3 = decode[i] & 0x7F;
4413 i3 = 128*mlevel2[i2] + o3;
4414 mlevel3[i3] = i;
4415 }
4416 return result;
4417}
4418
4419static int
4420encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4421{
4422 struct encoding_map *map = (struct encoding_map*)mapping;
4423 int l1 = c>>11;
4424 int l2 = (c>>7) & 0xF;
4425 int l3 = c & 0x7F;
4426 int i;
4427
4428#ifdef Py_UNICODE_WIDE
4429 if (c > 0xFFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004430 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004431 }
4432#endif
4433 if (c == 0)
4434 return 0;
4435 /* level 1*/
4436 i = map->level1[l1];
4437 if (i == 0xFF) {
4438 return -1;
4439 }
4440 /* level 2*/
4441 i = map->level23[16*i+l2];
4442 if (i == 0xFF) {
4443 return -1;
4444 }
4445 /* level 3 */
4446 i = map->level23[16*map->count2 + 128*i + l3];
4447 if (i == 0) {
4448 return -1;
4449 }
4450 return i;
4451}
4452
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004453/* Lookup the character ch in the mapping. If the character
4454 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004455 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004456static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004457{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004458 PyObject *w = PyInt_FromLong((long)c);
4459 PyObject *x;
4460
4461 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004462 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004463 x = PyObject_GetItem(mapping, w);
4464 Py_DECREF(w);
4465 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004466 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4467 /* No mapping found means: mapping is undefined. */
4468 PyErr_Clear();
4469 x = Py_None;
4470 Py_INCREF(x);
4471 return x;
4472 } else
4473 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004474 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004475 else if (x == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004476 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004477 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004478 long value = PyInt_AS_LONG(x);
4479 if (value < 0 || value > 255) {
4480 PyErr_SetString(PyExc_TypeError,
4481 "character mapping must be in range(256)");
4482 Py_DECREF(x);
4483 return NULL;
4484 }
4485 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004486 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004487 else if (PyString_Check(x))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004488 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004489 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004490 /* wrong return value */
4491 PyErr_SetString(PyExc_TypeError,
4492 "character mapping must return integer, None or str");
4493 Py_DECREF(x);
4494 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004495 }
4496}
4497
Martin v. Löwis3f767792006-06-04 19:36:28 +00004498static int
4499charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4500{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004501 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4502 /* exponentially overallocate to minimize reallocations */
4503 if (requiredsize < 2*outsize)
4504 requiredsize = 2*outsize;
4505 if (_PyString_Resize(outobj, requiredsize)) {
4506 return 0;
4507 }
4508 return 1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004509}
4510
/* Outcome of trying to emit one character through the charmap encoder. */
typedef enum charmapencode_result {
    enc_SUCCESS,
    enc_FAILED,
    enc_EXCEPTION
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004514/* lookup the character, put the result in the output string and adjust
4515 various state variables. Reallocate the output string if not enough
4516 space is available. Return a new reference to the object that
4517 was put in the output buffer, or Py_None, if the mapping was undefined
4518 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004519 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004520static
Martin v. Löwis3f767792006-06-04 19:36:28 +00004521charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004522 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004523{
Martin v. Löwis3f767792006-06-04 19:36:28 +00004524 PyObject *rep;
4525 char *outstart;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004526 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004527
Christian Heimese93237d2007-12-19 02:37:44 +00004528 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004529 int res = encoding_map_lookup(c, mapping);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004530 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004531 if (res == -1)
4532 return enc_FAILED;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004533 if (outsize<requiredsize)
4534 if (!charmapencode_resize(outobj, outpos, requiredsize))
4535 return enc_EXCEPTION;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004536 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004537 outstart[(*outpos)++] = (char)res;
4538 return enc_SUCCESS;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004539 }
4540
4541 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004542 if (rep==NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004543 return enc_EXCEPTION;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004544 else if (rep==Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004545 Py_DECREF(rep);
4546 return enc_FAILED;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004547 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004548 if (PyInt_Check(rep)) {
4549 Py_ssize_t requiredsize = *outpos+1;
4550 if (outsize<requiredsize)
4551 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4552 Py_DECREF(rep);
4553 return enc_EXCEPTION;
4554 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004555 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004556 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004557 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004558 else {
4559 const char *repchars = PyString_AS_STRING(rep);
4560 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4561 Py_ssize_t requiredsize = *outpos+repsize;
4562 if (outsize<requiredsize)
4563 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4564 Py_DECREF(rep);
4565 return enc_EXCEPTION;
4566 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004567 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004568 memcpy(outstart + *outpos, repchars, repsize);
4569 *outpos += repsize;
4570 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004571 }
Georg Brandl9f167602006-06-04 21:46:16 +00004572 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004573 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004574}
4575
4576/* handle an error in PyUnicode_EncodeCharmap
4577 Return 0 on success, -1 on error */
4578static
4579int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004580 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004581 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004582 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004583 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004584{
4585 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004586 Py_ssize_t repsize;
4587 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004588 Py_UNICODE *uni2;
4589 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004590 Py_ssize_t collstartpos = *inpos;
4591 Py_ssize_t collendpos = *inpos+1;
4592 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004593 char *encoding = "charmap";
4594 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00004595 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004596
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004597 /* find all unencodable characters */
4598 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004599 PyObject *rep;
Christian Heimese93237d2007-12-19 02:37:44 +00004600 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004601 int res = encoding_map_lookup(p[collendpos], mapping);
4602 if (res != -1)
4603 break;
4604 ++collendpos;
4605 continue;
4606 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004607
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004608 rep = charmapencode_lookup(p[collendpos], mapping);
4609 if (rep==NULL)
4610 return -1;
4611 else if (rep!=Py_None) {
4612 Py_DECREF(rep);
4613 break;
4614 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004615 Py_DECREF(rep);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004616 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004617 }
4618 /* cache callback name lookup
4619 * (if not done yet, i.e. it's the first error) */
4620 if (*known_errorHandler==-1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004621 if ((errors==NULL) || (!strcmp(errors, "strict")))
4622 *known_errorHandler = 1;
4623 else if (!strcmp(errors, "replace"))
4624 *known_errorHandler = 2;
4625 else if (!strcmp(errors, "ignore"))
4626 *known_errorHandler = 3;
4627 else if (!strcmp(errors, "xmlcharrefreplace"))
4628 *known_errorHandler = 4;
4629 else
4630 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004631 }
4632 switch (*known_errorHandler) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004633 case 1: /* strict */
4634 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4635 return -1;
4636 case 2: /* replace */
4637 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004638 x = charmapencode_output('?', mapping, res, respos);
4639 if (x==enc_EXCEPTION) {
4640 return -1;
4641 }
4642 else if (x==enc_FAILED) {
4643 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4644 return -1;
4645 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004646 }
4647 /* fall through */
4648 case 3: /* ignore */
4649 *inpos = collendpos;
4650 break;
4651 case 4: /* xmlcharrefreplace */
4652 /* generate replacement (temporarily (mis)uses p) */
4653 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004654 char buffer[2+29+1+1];
4655 char *cp;
4656 sprintf(buffer, "&#%d;", (int)p[collpos]);
4657 for (cp = buffer; *cp; ++cp) {
4658 x = charmapencode_output(*cp, mapping, res, respos);
4659 if (x==enc_EXCEPTION)
4660 return -1;
4661 else if (x==enc_FAILED) {
4662 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4663 return -1;
4664 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004665 }
4666 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004667 *inpos = collendpos;
4668 break;
4669 default:
4670 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004671 encoding, reason, p, size, exceptionObject,
4672 collstartpos, collendpos, &newpos);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004673 if (repunicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004674 return -1;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004675 /* generate replacement */
4676 repsize = PyUnicode_GET_SIZE(repunicode);
4677 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004678 x = charmapencode_output(*uni2, mapping, res, respos);
4679 if (x==enc_EXCEPTION) {
4680 return -1;
4681 }
4682 else if (x==enc_FAILED) {
4683 Py_DECREF(repunicode);
4684 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4685 return -1;
4686 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004687 }
4688 *inpos = newpos;
4689 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004690 }
4691 return 0;
4692}
4693
Guido van Rossumd57fd912000-03-10 22:53:23 +00004694PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004695 Py_ssize_t size,
4696 PyObject *mapping,
4697 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004698{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004699 /* output object */
4700 PyObject *res = NULL;
4701 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004702 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004703 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004704 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004705 PyObject *errorHandler = NULL;
4706 PyObject *exc = NULL;
4707 /* the following variable is used for caching string comparisons
4708 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4709 * 3=ignore, 4=xmlcharrefreplace */
4710 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004711
4712 /* Default to Latin-1 */
4713 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004714 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004715
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004716 /* allocate enough for a simple encoding without
4717 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004718 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004719 if (res == NULL)
4720 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004721 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004722 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004723
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004724 while (inpos<size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004725 /* try to encode it */
4726 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4727 if (x==enc_EXCEPTION) /* error */
4728 goto onError;
4729 if (x==enc_FAILED) { /* unencodable character */
4730 if (charmap_encoding_error(p, size, &inpos, mapping,
4731 &exc,
4732 &known_errorHandler, &errorHandler, errors,
4733 &res, &respos)) {
4734 goto onError;
4735 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004736 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004737 else
4738 /* done with this character => adjust input position */
4739 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004740 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004741
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004742 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004743 if (respos<PyString_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004744 if (_PyString_Resize(&res, respos))
4745 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004746 }
4747 Py_XDECREF(exc);
4748 Py_XDECREF(errorHandler);
4749 return res;
4750
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004751 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004752 Py_XDECREF(res);
4753 Py_XDECREF(exc);
4754 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004755 return NULL;
4756}
4757
4758PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004759 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004760{
4761 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004762 PyErr_BadArgument();
4763 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004764 }
4765 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004766 PyUnicode_GET_SIZE(unicode),
4767 mapping,
4768 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004769}
4770
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004771/* create or adjust a UnicodeTranslateError */
4772static void make_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004773 const Py_UNICODE *unicode, Py_ssize_t size,
4774 Py_ssize_t startpos, Py_ssize_t endpos,
4775 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004776{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004777 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004778 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004779 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004780 }
4781 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004782 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4783 goto onError;
4784 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4785 goto onError;
4786 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4787 goto onError;
4788 return;
4789 onError:
4790 Py_DECREF(*exceptionObject);
4791 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004792 }
4793}
4794
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004795/* raises a UnicodeTranslateError */
4796static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004797 const Py_UNICODE *unicode, Py_ssize_t size,
4798 Py_ssize_t startpos, Py_ssize_t endpos,
4799 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004800{
4801 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004802 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004803 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004804 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004805}
4806
4807/* error handling callback helper:
4808 build arguments, call the callback and check the arguments,
4809 put the result into newpos and return the replacement string, which
4810 has to be freed by the caller */
4811static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004812 PyObject **errorHandler,
4813 const char *reason,
4814 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4815 Py_ssize_t startpos, Py_ssize_t endpos,
4816 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004817{
Martin v. Löwis412fb672006-04-13 06:34:32 +00004818 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004819
Martin v. Löwis412fb672006-04-13 06:34:32 +00004820 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004821 PyObject *restuple;
4822 PyObject *resunicode;
4823
4824 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004825 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004826 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004827 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004828 }
4829
4830 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004831 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004832 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004833 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004834
4835 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004836 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004837 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004838 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004839 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00004840 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004841 Py_DECREF(restuple);
4842 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004843 }
4844 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004845 &resunicode, &i_newpos)) {
4846 Py_DECREF(restuple);
4847 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004848 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004849 if (i_newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004850 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004851 else
4852 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004853 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004854 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4855 Py_DECREF(restuple);
4856 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004857 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004858 Py_INCREF(resunicode);
4859 Py_DECREF(restuple);
4860 return resunicode;
4861}
4862
4863/* Lookup the character ch in the mapping and put the result in result,
4864 which must be decrefed by the caller.
4865 Return 0 on success, -1 on error */
4866static
4867int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4868{
4869 PyObject *w = PyInt_FromLong((long)c);
4870 PyObject *x;
4871
4872 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004873 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004874 x = PyObject_GetItem(mapping, w);
4875 Py_DECREF(w);
4876 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004877 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4878 /* No mapping found means: use 1:1 mapping. */
4879 PyErr_Clear();
4880 *result = NULL;
4881 return 0;
4882 } else
4883 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004884 }
4885 else if (x == Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004886 *result = x;
4887 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004888 }
4889 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004890 long value = PyInt_AS_LONG(x);
4891 long max = PyUnicode_GetMax();
4892 if (value < 0 || value > max) {
4893 PyErr_Format(PyExc_TypeError,
4894 "character mapping must be in range(0x%lx)", max+1);
4895 Py_DECREF(x);
4896 return -1;
4897 }
4898 *result = x;
4899 return 0;
4900 }
4901 else if (PyUnicode_Check(x)) {
4902 *result = x;
4903 return 0;
4904 }
4905 else {
4906 /* wrong return value */
4907 PyErr_SetString(PyExc_TypeError,
4908 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004909 Py_DECREF(x);
4910 return -1;
4911 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004912}
4913/* ensure that *outobj is at least requiredsize characters long,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004914 if not reallocate and adjust various state variables.
4915 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004916static
Walter Dörwald4894c302003-10-24 14:25:28 +00004917int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004918 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004919{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004920 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004921 if (requiredsize > oldsize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004922 /* remember old output position */
4923 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4924 /* exponentially overallocate to minimize reallocations */
4925 if (requiredsize < 2 * oldsize)
4926 requiredsize = 2 * oldsize;
4927 if (PyUnicode_Resize(outobj, requiredsize) < 0)
4928 return -1;
4929 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004930 }
4931 return 0;
4932}
4933/* lookup the character, put the result in the output string and adjust
4934 various state variables. Return a new reference to the object that
4935 was put in the output buffer in *result, or Py_None, if the mapping was
4936 undefined (in which case no character was written).
4937 The called must decref result.
4938 Return 0 on success, -1 on error. */
4939static
Walter Dörwald4894c302003-10-24 14:25:28 +00004940int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004941 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4942 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004943{
Walter Dörwald4894c302003-10-24 14:25:28 +00004944 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004945 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004946 if (*res==NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004947 /* not found => default to 1:1 mapping */
4948 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004949 }
4950 else if (*res==Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004951 ;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004952 else if (PyInt_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004953 /* no overflow check, because we know that the space is enough */
4954 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004955 }
4956 else if (PyUnicode_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004957 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4958 if (repsize==1) {
4959 /* no overflow check, because we know that the space is enough */
4960 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4961 }
4962 else if (repsize!=0) {
4963 /* more than one character */
4964 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
4965 (insize - (curinp-startinp)) +
4966 repsize - 1;
4967 if (charmaptranslate_makespace(outobj, outp, requiredsize))
4968 return -1;
4969 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4970 *outp += repsize;
4971 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004972 }
4973 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004974 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004975 return 0;
4976}
4977
4978PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004979 Py_ssize_t size,
4980 PyObject *mapping,
4981 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004982{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004983 /* output object */
4984 PyObject *res = NULL;
4985 /* pointers to the beginning and end+1 of input */
4986 const Py_UNICODE *startp = p;
4987 const Py_UNICODE *endp = p + size;
4988 /* pointer into the output */
4989 Py_UNICODE *str;
4990 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004991 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004992 char *reason = "character maps to <undefined>";
4993 PyObject *errorHandler = NULL;
4994 PyObject *exc = NULL;
4995 /* the following variable is used for caching string comparisons
4996 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4997 * 3=ignore, 4=xmlcharrefreplace */
4998 int known_errorHandler = -1;
4999
Guido van Rossumd57fd912000-03-10 22:53:23 +00005000 if (mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005001 PyErr_BadArgument();
5002 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005003 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005004
5005 /* allocate enough for a simple 1:1 translation without
5006 replacements, if we need more, we'll resize */
5007 res = PyUnicode_FromUnicode(NULL, size);
5008 if (res == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005009 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005010 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005011 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005012 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005013
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005014 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005015 /* try to encode it */
5016 PyObject *x = NULL;
5017 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5018 Py_XDECREF(x);
5019 goto onError;
5020 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005021 Py_XDECREF(x);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005022 if (x!=Py_None) /* it worked => adjust input pointer */
5023 ++p;
5024 else { /* untranslatable character */
5025 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5026 Py_ssize_t repsize;
5027 Py_ssize_t newpos;
5028 Py_UNICODE *uni2;
5029 /* startpos for collecting untranslatable chars */
5030 const Py_UNICODE *collstart = p;
5031 const Py_UNICODE *collend = p+1;
5032 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005033
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005034 /* find all untranslatable characters */
5035 while (collend < endp) {
5036 if (charmaptranslate_lookup(*collend, mapping, &x))
5037 goto onError;
5038 Py_XDECREF(x);
5039 if (x!=Py_None)
5040 break;
5041 ++collend;
5042 }
5043 /* cache callback name lookup
5044 * (if not done yet, i.e. it's the first error) */
5045 if (known_errorHandler==-1) {
5046 if ((errors==NULL) || (!strcmp(errors, "strict")))
5047 known_errorHandler = 1;
5048 else if (!strcmp(errors, "replace"))
5049 known_errorHandler = 2;
5050 else if (!strcmp(errors, "ignore"))
5051 known_errorHandler = 3;
5052 else if (!strcmp(errors, "xmlcharrefreplace"))
5053 known_errorHandler = 4;
5054 else
5055 known_errorHandler = 0;
5056 }
5057 switch (known_errorHandler) {
5058 case 1: /* strict */
5059 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson857ce152009-01-31 16:29:18 +00005060 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005061 case 2: /* replace */
5062 /* No need to check for space, this is a 1:1 replacement */
5063 for (coll = collstart; coll<collend; ++coll)
5064 *str++ = '?';
5065 /* fall through */
5066 case 3: /* ignore */
5067 p = collend;
5068 break;
5069 case 4: /* xmlcharrefreplace */
5070 /* generate replacement (temporarily (mis)uses p) */
5071 for (p = collstart; p < collend; ++p) {
5072 char buffer[2+29+1+1];
5073 char *cp;
5074 sprintf(buffer, "&#%d;", (int)*p);
5075 if (charmaptranslate_makespace(&res, &str,
5076 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5077 goto onError;
5078 for (cp = buffer; *cp; ++cp)
5079 *str++ = *cp;
5080 }
5081 p = collend;
5082 break;
5083 default:
5084 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5085 reason, startp, size, &exc,
5086 collstart-startp, collend-startp, &newpos);
5087 if (repunicode == NULL)
5088 goto onError;
5089 /* generate replacement */
5090 repsize = PyUnicode_GET_SIZE(repunicode);
5091 if (charmaptranslate_makespace(&res, &str,
5092 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5093 Py_DECREF(repunicode);
5094 goto onError;
5095 }
5096 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5097 *str++ = *uni2;
5098 p = startp + newpos;
5099 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00005100 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005101 }
5102 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005103 /* Resize if we allocated to much */
5104 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005105 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005106 if (PyUnicode_Resize(&res, respos) < 0)
5107 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005108 }
5109 Py_XDECREF(exc);
5110 Py_XDECREF(errorHandler);
5111 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005112
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005113 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005114 Py_XDECREF(res);
5115 Py_XDECREF(exc);
5116 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005117 return NULL;
5118}
5119
5120PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005121 PyObject *mapping,
5122 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005123{
5124 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005125
Guido van Rossumd57fd912000-03-10 22:53:23 +00005126 str = PyUnicode_FromObject(str);
5127 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005128 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005129 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005130 PyUnicode_GET_SIZE(str),
5131 mapping,
5132 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005133 Py_DECREF(str);
5134 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005135
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005136 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005137 Py_XDECREF(str);
5138 return NULL;
5139}
Tim Petersced69f82003-09-16 20:30:58 +00005140
Guido van Rossum9e896b32000-04-05 20:11:21 +00005141/* --- Decimal Encoder ---------------------------------------------------- */
5142
5143int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005144 Py_ssize_t length,
5145 char *output,
5146 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005147{
5148 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005149 PyObject *errorHandler = NULL;
5150 PyObject *exc = NULL;
5151 const char *encoding = "decimal";
5152 const char *reason = "invalid decimal Unicode string";
5153 /* the following variable is used for caching string comparisons
5154 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5155 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005156
5157 if (output == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005158 PyErr_BadArgument();
5159 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005160 }
5161
5162 p = s;
5163 end = s + length;
5164 while (p < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005165 register Py_UNICODE ch = *p;
5166 int decimal;
5167 PyObject *repunicode;
5168 Py_ssize_t repsize;
5169 Py_ssize_t newpos;
5170 Py_UNICODE *uni2;
5171 Py_UNICODE *collstart;
5172 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005173
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005174 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005175 *output++ = ' ';
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005176 ++p;
5177 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005178 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005179 decimal = Py_UNICODE_TODECIMAL(ch);
5180 if (decimal >= 0) {
5181 *output++ = '0' + decimal;
5182 ++p;
5183 continue;
5184 }
5185 if (0 < ch && ch < 256) {
5186 *output++ = (char)ch;
5187 ++p;
5188 continue;
5189 }
5190 /* All other characters are considered unencodable */
5191 collstart = p;
Victor Stinner975134e2011-11-22 01:54:19 +01005192 for (collend = p+1; collend < end; collend++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005193 if ((0 < *collend && *collend < 256) ||
Victor Stinner975134e2011-11-22 01:54:19 +01005194 Py_UNICODE_ISSPACE(*collend) ||
5195 0 <= Py_UNICODE_TODECIMAL(*collend))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005196 break;
5197 }
5198 /* cache callback name lookup
5199 * (if not done yet, i.e. it's the first error) */
5200 if (known_errorHandler==-1) {
5201 if ((errors==NULL) || (!strcmp(errors, "strict")))
5202 known_errorHandler = 1;
5203 else if (!strcmp(errors, "replace"))
5204 known_errorHandler = 2;
5205 else if (!strcmp(errors, "ignore"))
5206 known_errorHandler = 3;
5207 else if (!strcmp(errors, "xmlcharrefreplace"))
5208 known_errorHandler = 4;
5209 else
5210 known_errorHandler = 0;
5211 }
5212 switch (known_errorHandler) {
5213 case 1: /* strict */
5214 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5215 goto onError;
5216 case 2: /* replace */
5217 for (p = collstart; p < collend; ++p)
5218 *output++ = '?';
5219 /* fall through */
5220 case 3: /* ignore */
5221 p = collend;
5222 break;
5223 case 4: /* xmlcharrefreplace */
5224 /* generate replacement (temporarily (mis)uses p) */
5225 for (p = collstart; p < collend; ++p)
5226 output += sprintf(output, "&#%d;", (int)*p);
5227 p = collend;
5228 break;
5229 default:
5230 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5231 encoding, reason, s, length, &exc,
5232 collstart-s, collend-s, &newpos);
5233 if (repunicode == NULL)
5234 goto onError;
5235 /* generate replacement */
5236 repsize = PyUnicode_GET_SIZE(repunicode);
5237 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5238 Py_UNICODE ch = *uni2;
5239 if (Py_UNICODE_ISSPACE(ch))
5240 *output++ = ' ';
5241 else {
5242 decimal = Py_UNICODE_TODECIMAL(ch);
5243 if (decimal >= 0)
5244 *output++ = '0' + decimal;
5245 else if (0 < ch && ch < 256)
5246 *output++ = (char)ch;
5247 else {
5248 Py_DECREF(repunicode);
5249 raise_encode_exception(&exc, encoding,
5250 s, length, collstart-s, collend-s, reason);
5251 goto onError;
5252 }
5253 }
5254 }
5255 p = s + newpos;
5256 Py_DECREF(repunicode);
5257 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005258 }
5259 /* 0-terminate the output string */
5260 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005261 Py_XDECREF(exc);
5262 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005263 return 0;
5264
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005265 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005266 Py_XDECREF(exc);
5267 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005268 return -1;
5269}
5270
Guido van Rossumd57fd912000-03-10 22:53:23 +00005271/* --- Helpers ------------------------------------------------------------ */
5272
Eric Smitha9f7d622008-02-17 19:46:49 +00005273#include "stringlib/unicodedefs.h"
Fredrik Lundha50d2012006-05-26 17:04:58 +00005274#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005275
5276#include "stringlib/count.h"
5277#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005278#include "stringlib/partition.h"
Antoine Pitrou64672132010-01-13 07:55:48 +00005279#include "stringlib/split.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005280
/* Helper macro to fix up start/end slice values in place against a
   sequence of length len:
     - an end greater than len is clamped to len; a negative end is taken
       relative to len and then clamped at 0;
     - a negative start is taken relative to len and then clamped at 0.
   A start greater than len is left unchanged, so callers must cope with
   end - start being negative.

   start and end must be modifiable lvalues, and every argument may be
   evaluated more than once.  The body is wrapped in do { } while (0) so
   that the macro behaves as one statement (for instance as the unbraced
   body of an if/else); invoke it with a trailing semicolon.  The #undef
   makes this definition take effect even if a header included above has
   already defined a macro with the same name. */
#undef ADJUST_INDICES
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Fredrik Lundhc8162812006-05-26 19:33:03 +00005295
Martin v. Löwis18e16552006-02-15 17:27:45 +00005296Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005297 PyObject *substr,
5298 Py_ssize_t start,
5299 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005300{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005301 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005302 PyUnicodeObject* str_obj;
5303 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005304
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005305 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5306 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005307 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005308 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5309 if (!sub_obj) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005310 Py_DECREF(str_obj);
5311 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005312 }
Tim Petersced69f82003-09-16 20:30:58 +00005313
Antoine Pitrou64672132010-01-13 07:55:48 +00005314 ADJUST_INDICES(start, end, str_obj->length);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005315 result = stringlib_count(
Antoine Pitrou64672132010-01-13 07:55:48 +00005316 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
5317 PY_SSIZE_T_MAX
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005318 );
5319
5320 Py_DECREF(sub_obj);
5321 Py_DECREF(str_obj);
5322
Guido van Rossumd57fd912000-03-10 22:53:23 +00005323 return result;
5324}
5325
Martin v. Löwis18e16552006-02-15 17:27:45 +00005326Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005327 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005328 Py_ssize_t start,
5329 Py_ssize_t end,
5330 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005331{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005332 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005333
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005334 str = PyUnicode_FromObject(str);
5335 if (!str)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005336 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005337 sub = PyUnicode_FromObject(sub);
5338 if (!sub) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005339 Py_DECREF(str);
5340 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005341 }
Tim Petersced69f82003-09-16 20:30:58 +00005342
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005343 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005344 result = stringlib_find_slice(
5345 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5346 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5347 start, end
5348 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005349 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005350 result = stringlib_rfind_slice(
5351 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5352 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5353 start, end
5354 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005355
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005356 Py_DECREF(str);
5357 Py_DECREF(sub);
5358
Guido van Rossumd57fd912000-03-10 22:53:23 +00005359 return result;
5360}
5361
Tim Petersced69f82003-09-16 20:30:58 +00005362static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005363int tailmatch(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005364 PyUnicodeObject *substring,
5365 Py_ssize_t start,
5366 Py_ssize_t end,
5367 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005369 if (substring->length == 0)
5370 return 1;
5371
Antoine Pitrou64672132010-01-13 07:55:48 +00005372 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005373 end -= substring->length;
5374 if (end < start)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005375 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005376
5377 if (direction > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005378 if (Py_UNICODE_MATCH(self, end, substring))
5379 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005380 } else {
5381 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005382 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005383 }
5384
5385 return 0;
5386}
5387
Martin v. Löwis18e16552006-02-15 17:27:45 +00005388Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005389 PyObject *substr,
5390 Py_ssize_t start,
5391 Py_ssize_t end,
5392 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005393{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005394 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005395
Guido van Rossumd57fd912000-03-10 22:53:23 +00005396 str = PyUnicode_FromObject(str);
5397 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005398 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005399 substr = PyUnicode_FromObject(substr);
5400 if (substr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005401 Py_DECREF(str);
5402 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005403 }
Tim Petersced69f82003-09-16 20:30:58 +00005404
Guido van Rossumd57fd912000-03-10 22:53:23 +00005405 result = tailmatch((PyUnicodeObject *)str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005406 (PyUnicodeObject *)substr,
5407 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005408 Py_DECREF(str);
5409 Py_DECREF(substr);
5410 return result;
5411}
5412
Guido van Rossumd57fd912000-03-10 22:53:23 +00005413/* Apply fixfct filter to the Unicode object self and return a
5414 reference to the modified object */
5415
Tim Petersced69f82003-09-16 20:30:58 +00005416static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005417PyObject *fixup(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005418 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005419{
5420
5421 PyUnicodeObject *u;
5422
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005423 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005424 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005425 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005426
5427 Py_UNICODE_COPY(u->str, self->str, self->length);
5428
Tim Peters7a29bd52001-09-12 03:03:31 +00005429 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005430 /* fixfct should return TRUE if it modified the buffer. If
5431 FALSE, return a reference to the original buffer instead
5432 (to save space, not time) */
5433 Py_INCREF(self);
5434 Py_DECREF(u);
5435 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005436 }
5437 return (PyObject*) u;
5438}
5439
Tim Petersced69f82003-09-16 20:30:58 +00005440static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005441int fixupper(PyUnicodeObject *self)
5442{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005443 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005444 Py_UNICODE *s = self->str;
5445 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005446
Guido van Rossumd57fd912000-03-10 22:53:23 +00005447 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005448 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005449
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005450 ch = Py_UNICODE_TOUPPER(*s);
5451 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005452 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005453 *s = ch;
5454 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005455 s++;
5456 }
5457
5458 return status;
5459}
5460
Tim Petersced69f82003-09-16 20:30:58 +00005461static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005462int fixlower(PyUnicodeObject *self)
5463{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005464 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005465 Py_UNICODE *s = self->str;
5466 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005467
Guido van Rossumd57fd912000-03-10 22:53:23 +00005468 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005469 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005470
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005471 ch = Py_UNICODE_TOLOWER(*s);
5472 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005473 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005474 *s = ch;
5475 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005476 s++;
5477 }
5478
5479 return status;
5480}
5481
Tim Petersced69f82003-09-16 20:30:58 +00005482static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005483int fixswapcase(PyUnicodeObject *self)
5484{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005485 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005486 Py_UNICODE *s = self->str;
5487 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005488
Guido van Rossumd57fd912000-03-10 22:53:23 +00005489 while (len-- > 0) {
5490 if (Py_UNICODE_ISUPPER(*s)) {
5491 *s = Py_UNICODE_TOLOWER(*s);
5492 status = 1;
5493 } else if (Py_UNICODE_ISLOWER(*s)) {
5494 *s = Py_UNICODE_TOUPPER(*s);
5495 status = 1;
5496 }
5497 s++;
5498 }
5499
5500 return status;
5501}
5502
Tim Petersced69f82003-09-16 20:30:58 +00005503static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005504int fixcapitalize(PyUnicodeObject *self)
5505{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005506 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005507 Py_UNICODE *s = self->str;
5508 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005509
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005510 if (len == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005511 return 0;
Ezio Melotti15d6b652011-08-15 09:22:24 +03005512 if (!Py_UNICODE_ISUPPER(*s)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005513 *s = Py_UNICODE_TOUPPER(*s);
5514 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005515 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005516 s++;
5517 while (--len > 0) {
Ezio Melotti15d6b652011-08-15 09:22:24 +03005518 if (!Py_UNICODE_ISLOWER(*s)) {
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005519 *s = Py_UNICODE_TOLOWER(*s);
5520 status = 1;
5521 }
5522 s++;
5523 }
5524 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005525}
5526
5527static
5528int fixtitle(PyUnicodeObject *self)
5529{
5530 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5531 register Py_UNICODE *e;
5532 int previous_is_cased;
5533
5534 /* Shortcut for single character strings */
5535 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005536 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5537 if (*p != ch) {
5538 *p = ch;
5539 return 1;
5540 }
5541 else
5542 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005543 }
Tim Petersced69f82003-09-16 20:30:58 +00005544
Guido van Rossumd57fd912000-03-10 22:53:23 +00005545 e = p + PyUnicode_GET_SIZE(self);
5546 previous_is_cased = 0;
5547 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005548 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005549
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005550 if (previous_is_cased)
5551 *p = Py_UNICODE_TOLOWER(ch);
5552 else
5553 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005554
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005555 if (Py_UNICODE_ISLOWER(ch) ||
5556 Py_UNICODE_ISUPPER(ch) ||
5557 Py_UNICODE_ISTITLE(ch))
5558 previous_is_cased = 1;
5559 else
5560 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005561 }
5562 return 1;
5563}
5564
Tim Peters8ce9f162004-08-27 01:49:32 +00005565PyObject *
5566PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005567{
Tim Peters8ce9f162004-08-27 01:49:32 +00005568 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005569 const Py_UNICODE blank = ' ';
5570 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00005571 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005572 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00005573 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5574 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005575 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5576 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005577 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005578 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005579 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005580
Tim Peters05eba1f2004-08-27 21:32:02 +00005581 fseq = PySequence_Fast(seq, "");
5582 if (fseq == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005583 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005584 }
5585
Tim Peters91879ab2004-08-27 22:35:44 +00005586 /* Grrrr. A codec may be invoked to convert str objects to
5587 * Unicode, and so it's possible to call back into Python code
5588 * during PyUnicode_FromObject(), and so it's possible for a sick
5589 * codec to change the size of fseq (if seq is a list). Therefore
5590 * we have to keep refetching the size -- can't assume seqlen
5591 * is invariant.
5592 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005593 seqlen = PySequence_Fast_GET_SIZE(fseq);
5594 /* If empty sequence, return u"". */
5595 if (seqlen == 0) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005596 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5597 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00005598 }
5599 /* If singleton sequence with an exact Unicode, return that. */
5600 if (seqlen == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005601 item = PySequence_Fast_GET_ITEM(fseq, 0);
5602 if (PyUnicode_CheckExact(item)) {
5603 Py_INCREF(item);
5604 res = (PyUnicodeObject *)item;
5605 goto Done;
5606 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005607 }
5608
Tim Peters05eba1f2004-08-27 21:32:02 +00005609 /* At least two items to join, or one that isn't exact Unicode. */
5610 if (seqlen > 1) {
5611 /* Set up sep and seplen -- they're needed. */
Benjamin Peterson857ce152009-01-31 16:29:18 +00005612 if (separator == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005613 sep = &blank;
5614 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005615 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005616 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005617 internal_separator = PyUnicode_FromObject(separator);
5618 if (internal_separator == NULL)
5619 goto onError;
5620 sep = PyUnicode_AS_UNICODE(internal_separator);
5621 seplen = PyUnicode_GET_SIZE(internal_separator);
5622 /* In case PyUnicode_FromObject() mutated seq. */
5623 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005624 }
5625 }
5626
5627 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005628 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005629 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005630 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005631 res_p = PyUnicode_AS_UNICODE(res);
5632 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005633
Tim Peters05eba1f2004-08-27 21:32:02 +00005634 for (i = 0; i < seqlen; ++i) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005635 Py_ssize_t itemlen;
5636 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005637
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005638 item = PySequence_Fast_GET_ITEM(fseq, i);
5639 /* Convert item to Unicode. */
5640 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5641 PyErr_Format(PyExc_TypeError,
5642 "sequence item %zd: expected string or Unicode,"
5643 " %.80s found",
5644 i, Py_TYPE(item)->tp_name);
5645 goto onError;
5646 }
5647 item = PyUnicode_FromObject(item);
5648 if (item == NULL)
5649 goto onError;
5650 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005651
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005652 /* In case PyUnicode_FromObject() mutated seq. */
5653 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters91879ab2004-08-27 22:35:44 +00005654
Tim Peters8ce9f162004-08-27 01:49:32 +00005655 /* Make sure we have enough space for the separator and the item. */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005656 itemlen = PyUnicode_GET_SIZE(item);
5657 new_res_used = res_used + itemlen;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005658 if (new_res_used < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005659 goto Overflow;
5660 if (i < seqlen - 1) {
5661 new_res_used += seplen;
5662 if (new_res_used < 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00005663 goto Overflow;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005664 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005665 if (new_res_used > res_alloc) {
5666 /* double allocated size until it's big enough */
5667 do {
5668 res_alloc += res_alloc;
5669 if (res_alloc <= 0)
5670 goto Overflow;
5671 } while (new_res_used > res_alloc);
5672 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5673 Py_DECREF(item);
5674 goto onError;
5675 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005676 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005677 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005678
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005679 /* Copy item, and maybe the separator. */
5680 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5681 res_p += itemlen;
5682 if (i < seqlen - 1) {
5683 Py_UNICODE_COPY(res_p, sep, seplen);
5684 res_p += seplen;
5685 }
5686 Py_DECREF(item);
5687 res_used = new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005688 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005689
Tim Peters05eba1f2004-08-27 21:32:02 +00005690 /* Shrink res to match the used area; this probably can't fail,
5691 * but it's cheap to check.
5692 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005693 if (_PyUnicode_Resize(&res, res_used) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005694 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005695
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005696 Done:
Tim Peters8ce9f162004-08-27 01:49:32 +00005697 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005698 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699 return (PyObject *)res;
5700
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005701 Overflow:
Tim Peters8ce9f162004-08-27 01:49:32 +00005702 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00005703 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005704 Py_DECREF(item);
5705 /* fall through */
5706
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005707 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005708 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005709 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005710 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005711 return NULL;
5712}
5713
Tim Petersced69f82003-09-16 20:30:58 +00005714static
5715PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005716 Py_ssize_t left,
5717 Py_ssize_t right,
5718 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005719{
5720 PyUnicodeObject *u;
5721
5722 if (left < 0)
5723 left = 0;
5724 if (right < 0)
5725 right = 0;
5726
Tim Peters7a29bd52001-09-12 03:03:31 +00005727 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005728 Py_INCREF(self);
5729 return self;
5730 }
5731
Neal Norwitze7d8be82008-07-31 17:17:14 +00005732 if (left > PY_SSIZE_T_MAX - self->length ||
5733 right > PY_SSIZE_T_MAX - (left + self->length)) {
5734 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5735 return NULL;
5736 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005737 u = _PyUnicode_New(left + self->length + right);
5738 if (u) {
5739 if (left)
5740 Py_UNICODE_FILL(u->str, fill, left);
5741 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5742 if (right)
5743 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5744 }
5745
5746 return u;
5747}
5748
Antoine Pitrou64672132010-01-13 07:55:48 +00005749PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005750{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005751 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005752
5753 string = PyUnicode_FromObject(string);
5754 if (string == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005755 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005756
Antoine Pitrou64672132010-01-13 07:55:48 +00005757 list = stringlib_splitlines(
5758 (PyObject*) string, PyUnicode_AS_UNICODE(string),
5759 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005760
5761 Py_DECREF(string);
5762 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005763}
5764
Tim Petersced69f82003-09-16 20:30:58 +00005765static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005766PyObject *split(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005767 PyUnicodeObject *substring,
5768 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005769{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005770 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005771 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005772
Guido van Rossumd57fd912000-03-10 22:53:23 +00005773 if (substring == NULL)
Antoine Pitrou64672132010-01-13 07:55:48 +00005774 return stringlib_split_whitespace(
5775 (PyObject*) self, self->str, self->length, maxcount
5776 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005777
Antoine Pitrou64672132010-01-13 07:55:48 +00005778 return stringlib_split(
5779 (PyObject*) self, self->str, self->length,
5780 substring->str, substring->length,
5781 maxcount
5782 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005783}
5784
Tim Petersced69f82003-09-16 20:30:58 +00005785static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005786PyObject *rsplit(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005787 PyUnicodeObject *substring,
5788 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005789{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005790 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005791 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005792
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005793 if (substring == NULL)
Antoine Pitrou64672132010-01-13 07:55:48 +00005794 return stringlib_rsplit_whitespace(
5795 (PyObject*) self, self->str, self->length, maxcount
5796 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005797
Antoine Pitrou64672132010-01-13 07:55:48 +00005798 return stringlib_rsplit(
5799 (PyObject*) self, self->str, self->length,
5800 substring->str, substring->length,
5801 maxcount
5802 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005803}
5804
5805static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005806PyObject *replace(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005807 PyUnicodeObject *str1,
5808 PyUnicodeObject *str2,
5809 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005810{
5811 PyUnicodeObject *u;
5812
5813 if (maxcount < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005814 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrou64672132010-01-13 07:55:48 +00005815 else if (maxcount == 0 || self->length == 0)
5816 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005817
Fredrik Lundh347ee272006-05-24 16:35:18 +00005818 if (str1->length == str2->length) {
Antoine Pitrou5c767c22010-01-13 08:55:20 +00005819 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005820 /* same length */
Antoine Pitrou64672132010-01-13 07:55:48 +00005821 if (str1->length == 0)
5822 goto nothing;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005823 if (str1->length == 1) {
5824 /* replace characters */
5825 Py_UNICODE u1, u2;
5826 if (!findchar(self->str, self->length, str1->str[0]))
5827 goto nothing;
5828 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5829 if (!u)
5830 return NULL;
5831 Py_UNICODE_COPY(u->str, self->str, self->length);
5832 u1 = str1->str[0];
5833 u2 = str2->str[0];
5834 for (i = 0; i < u->length; i++)
5835 if (u->str[i] == u1) {
5836 if (--maxcount < 0)
5837 break;
5838 u->str[i] = u2;
5839 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005840 } else {
Antoine Pitrou64672132010-01-13 07:55:48 +00005841 i = stringlib_find(
5842 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00005843 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005844 if (i < 0)
5845 goto nothing;
5846 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5847 if (!u)
5848 return NULL;
5849 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrou64672132010-01-13 07:55:48 +00005850
5851 /* change everything in-place, starting with this one */
5852 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5853 i += str1->length;
5854
5855 while ( --maxcount > 0) {
5856 i = stringlib_find(self->str+i, self->length-i,
5857 str1->str, str1->length,
5858 i);
5859 if (i == -1)
5860 break;
5861 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5862 i += str1->length;
5863 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005864 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005865 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005866
Brett Cannona7f13ee2010-05-04 01:16:51 +00005867 Py_ssize_t n, i, j;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005868 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869 Py_UNICODE *p;
5870
5871 /* replace strings */
Antoine Pitrou64672132010-01-13 07:55:48 +00005872 n = stringlib_count(self->str, self->length, str1->str, str1->length,
5873 maxcount);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005874 if (n == 0)
5875 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005876 /* new_size = self->length + n * (str2->length - str1->length)); */
5877 delta = (str2->length - str1->length);
5878 if (delta == 0) {
5879 new_size = self->length;
5880 } else {
5881 product = n * (str2->length - str1->length);
5882 if ((product / (str2->length - str1->length)) != n) {
5883 PyErr_SetString(PyExc_OverflowError,
5884 "replace string is too long");
5885 return NULL;
5886 }
5887 new_size = self->length + product;
5888 if (new_size < 0) {
5889 PyErr_SetString(PyExc_OverflowError,
5890 "replace string is too long");
5891 return NULL;
5892 }
5893 }
5894 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005895 if (!u)
5896 return NULL;
5897 i = 0;
5898 p = u->str;
5899 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005900 while (n-- > 0) {
5901 /* look for next match */
Antoine Pitrou64672132010-01-13 07:55:48 +00005902 j = stringlib_find(self->str+i, self->length-i,
5903 str1->str, str1->length,
5904 i);
5905 if (j == -1)
5906 break;
5907 else if (j > i) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005908 /* copy unchanged part [i:j] */
5909 Py_UNICODE_COPY(p, self->str+i, j-i);
5910 p += j - i;
5911 }
5912 /* copy substitution string */
5913 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005914 Py_UNICODE_COPY(p, str2->str, str2->length);
5915 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005916 }
5917 i = j + str1->length;
5918 }
5919 if (i < self->length)
5920 /* copy tail [i:] */
5921 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005922 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005923 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00005924 while (n > 0) {
5925 Py_UNICODE_COPY(p, str2->str, str2->length);
5926 p += str2->length;
5927 if (--n <= 0)
5928 break;
5929 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005930 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00005931 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005932 }
5933 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005934 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005935
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005936 nothing:
Fredrik Lundh347ee272006-05-24 16:35:18 +00005937 /* nothing to replace; return original string (when possible) */
5938 if (PyUnicode_CheckExact(self)) {
5939 Py_INCREF(self);
5940 return (PyObject *) self;
5941 }
5942 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005943}
5944
5945/* --- Unicode Object Methods --------------------------------------------- */
5946
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005947PyDoc_STRVAR(title__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005948 "S.title() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005949\n\
5950Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005951characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005952
5953static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005954unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005955{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005956 return fixup(self, fixtitle);
5957}
5958
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005959PyDoc_STRVAR(capitalize__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005960 "S.capitalize() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005961\n\
5962Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumaran5261b102010-07-05 12:04:07 +00005963have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005964
5965static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005966unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005967{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005968 return fixup(self, fixcapitalize);
5969}
5970
#if 0
/* Compiled out: everything up to the matching #endif is excluded from
   the build. */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> unicode\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self into a list, replace each list item by the result of
   fixup(item, fixcapitalize), then join the list with PyUnicode_Join().
   Returns NULL if the split, any fixup() call, or the join fails. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list slot */
    pos = 0;
    while (pos < PyList_GET_SIZE(words)) {
        PyObject *old = PyList_GET_ITEM(words, pos);

        result = fixup((PyUnicodeObject *)old, fixcapitalize);
        if (result == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* The list owned the old item; release it before overwriting. */
        Py_DECREF(old);
        PyList_SET_ITEM(words, pos, result);
        pos++;
    }

    /* Join the words to form a new string (NULL on failure) */
    result = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
6008
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006009/* Argument converter. Coerces to a single unicode character */
6010
6011static int
6012convert_uc(PyObject *obj, void *addr)
6013{
Benjamin Peterson857ce152009-01-31 16:29:18 +00006014 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6015 PyObject *uniobj;
6016 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006017
Benjamin Peterson857ce152009-01-31 16:29:18 +00006018 uniobj = PyUnicode_FromObject(obj);
6019 if (uniobj == NULL) {
6020 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006021 "The fill character cannot be converted to Unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00006022 return 0;
6023 }
6024 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6025 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006026 "The fill character must be exactly one character long");
Benjamin Peterson857ce152009-01-31 16:29:18 +00006027 Py_DECREF(uniobj);
6028 return 0;
6029 }
6030 unistr = PyUnicode_AS_UNICODE(uniobj);
6031 *fillcharloc = unistr[0];
6032 Py_DECREF(uniobj);
6033 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006034}
6035
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006036PyDoc_STRVAR(center__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006037 "S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006038\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006039Return S centered in a Unicode string of length width. Padding is\n\
6040done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041
6042static PyObject *
6043unicode_center(PyUnicodeObject *self, PyObject *args)
6044{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006045 Py_ssize_t marg, left;
6046 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006047 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006048
Thomas Woutersde017742006-02-16 19:34:37 +00006049 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006050 return NULL;
6051
Tim Peters7a29bd52001-09-12 03:03:31 +00006052 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006053 Py_INCREF(self);
6054 return (PyObject*) self;
6055 }
6056
6057 marg = width - self->length;
6058 left = marg / 2 + (marg & width & 1);
6059
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006060 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006061}
6062
Marc-André Lemburge5034372000-08-08 08:04:29 +00006063#if 0
6064
6065/* This code should go into some future Unicode collation support
6066 module. The basic comparison should compare ordinals on a naive
Georg Brandl18187e22009-06-06 18:21:58 +00006067 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006068
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006069/* speedy UTF-16 code point order comparison */
6070/* gleaned from: */
6071/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6072
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006073static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006074{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006075 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006076 0, 0, 0, 0, 0, 0, 0, 0,
6077 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006078 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006079};
6080
Guido van Rossumd57fd912000-03-10 22:53:23 +00006081static int
6082unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6083{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006084 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006085
Guido van Rossumd57fd912000-03-10 22:53:23 +00006086 Py_UNICODE *s1 = str1->str;
6087 Py_UNICODE *s2 = str2->str;
6088
6089 len1 = str1->length;
6090 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006091
Guido van Rossumd57fd912000-03-10 22:53:23 +00006092 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006093 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006094
6095 c1 = *s1++;
6096 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006097
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006098 if (c1 > (1<<11) * 26)
6099 c1 += utf16Fixup[c1>>11];
6100 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006101 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006102 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006103
6104 if (c1 != c2)
6105 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006106
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006107 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006108 }
6109
6110 return (len1 < len2) ? -1 : (len1 != len2);
6111}
6112
Marc-André Lemburge5034372000-08-08 08:04:29 +00006113#else
6114
6115static int
6116unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6117{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006118 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006119
6120 Py_UNICODE *s1 = str1->str;
6121 Py_UNICODE *s2 = str2->str;
6122
6123 len1 = str1->length;
6124 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006125
Marc-André Lemburge5034372000-08-08 08:04:29 +00006126 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006127 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006128
Fredrik Lundh45714e92001-06-26 16:39:36 +00006129 c1 = *s1++;
6130 c2 = *s2++;
6131
6132 if (c1 != c2)
6133 return (c1 < c2) ? -1 : 1;
6134
Marc-André Lemburge5034372000-08-08 08:04:29 +00006135 len1--; len2--;
6136 }
6137
6138 return (len1 < len2) ? -1 : (len1 != len2);
6139}
6140
6141#endif
6142
Guido van Rossumd57fd912000-03-10 22:53:23 +00006143int PyUnicode_Compare(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006144 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006145{
6146 PyUnicodeObject *u = NULL, *v = NULL;
6147 int result;
6148
6149 /* Coerce the two arguments */
6150 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6151 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006152 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006153 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6154 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006155 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006156
Thomas Wouters7e474022000-07-16 12:04:32 +00006157 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006158 if (v == u) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006159 Py_DECREF(u);
6160 Py_DECREF(v);
6161 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006162 }
6163
6164 result = unicode_compare(u, v);
6165
6166 Py_DECREF(u);
6167 Py_DECREF(v);
6168 return result;
6169
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006170 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171 Py_XDECREF(u);
6172 Py_XDECREF(v);
6173 return -1;
6174}
6175
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006176PyObject *PyUnicode_RichCompare(PyObject *left,
6177 PyObject *right,
6178 int op)
6179{
6180 int result;
6181
6182 result = PyUnicode_Compare(left, right);
6183 if (result == -1 && PyErr_Occurred())
6184 goto onError;
6185
6186 /* Convert the return value to a Boolean */
6187 switch (op) {
6188 case Py_EQ:
6189 result = (result == 0);
6190 break;
6191 case Py_NE:
6192 result = (result != 0);
6193 break;
6194 case Py_LE:
6195 result = (result <= 0);
6196 break;
6197 case Py_GE:
6198 result = (result >= 0);
6199 break;
6200 case Py_LT:
6201 result = (result == -1);
6202 break;
6203 case Py_GT:
6204 result = (result == 1);
6205 break;
6206 }
6207 return PyBool_FromLong(result);
6208
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006209 onError:
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006210
6211 /* Standard case
6212
6213 Type errors mean that PyUnicode_FromObject() could not convert
6214 one of the arguments (usually the right hand side) to Unicode,
6215 ie. we can't handle the comparison request. However, it is
6216 possible that the other object knows a comparison method, which
6217 is why we return Py_NotImplemented to give the other object a
6218 chance.
6219
6220 */
6221 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6222 PyErr_Clear();
6223 Py_INCREF(Py_NotImplemented);
6224 return Py_NotImplemented;
6225 }
6226 if (op != Py_EQ && op != Py_NE)
6227 return NULL;
6228
6229 /* Equality comparison.
6230
6231 This is a special case: we silence any PyExc_UnicodeDecodeError
6232 and instead turn it into a PyErr_UnicodeWarning.
6233
6234 */
6235 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6236 return NULL;
6237 PyErr_Clear();
Benjamin Peterson857ce152009-01-31 16:29:18 +00006238 if (PyErr_Warn(PyExc_UnicodeWarning,
6239 (op == Py_EQ) ?
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006240 "Unicode equal comparison "
6241 "failed to convert both arguments to Unicode - "
6242 "interpreting them as being unequal" :
6243 "Unicode unequal comparison "
6244 "failed to convert both arguments to Unicode - "
6245 "interpreting them as being unequal"
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006246 ) < 0)
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006247 return NULL;
6248 result = (op == Py_NE);
6249 return PyBool_FromLong(result);
6250}
6251
Guido van Rossum403d68b2000-03-13 15:55:09 +00006252int PyUnicode_Contains(PyObject *container,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006253 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006254{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006255 PyObject *str, *sub;
6256 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006257
6258 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006259 sub = PyUnicode_FromObject(element);
6260 if (!sub) {
Fredrik Lundh833bf942006-05-23 10:12:21 +00006261 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006262 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00006263
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006264 str = PyUnicode_FromObject(container);
6265 if (!str) {
6266 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00006267 return -1;
6268 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00006269
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006270 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00006271
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006272 Py_DECREF(str);
6273 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00006274
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006275 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006276}
6277
Guido van Rossumd57fd912000-03-10 22:53:23 +00006278/* Concat to string or Unicode object giving a new Unicode object. */
6279
6280PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006281 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006282{
6283 PyUnicodeObject *u = NULL, *v = NULL, *w;
6284
6285 /* Coerce the two arguments */
6286 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6287 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006288 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006289 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6290 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006291 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006292
6293 /* Shortcuts */
6294 if (v == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006295 Py_DECREF(v);
6296 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006297 }
6298 if (u == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006299 Py_DECREF(u);
6300 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006301 }
6302
6303 /* Concat the two Unicode strings */
6304 w = _PyUnicode_New(u->length + v->length);
6305 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006306 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006307 Py_UNICODE_COPY(w->str, u->str, u->length);
6308 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6309
6310 Py_DECREF(u);
6311 Py_DECREF(v);
6312 return (PyObject *)w;
6313
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006314 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006315 Py_XDECREF(u);
6316 Py_XDECREF(v);
6317 return NULL;
6318}
6319
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006320PyDoc_STRVAR(count__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006321 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006322\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00006323Return the number of non-overlapping occurrences of substring sub in\n\
6324Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006325interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006326
6327static PyObject *
6328unicode_count(PyUnicodeObject *self, PyObject *args)
6329{
6330 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006331 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006332 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006333 PyObject *result;
6334
Jesus Cea44e81682011-04-20 16:39:15 +02006335 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
6336 &start, &end))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006337 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006338
Antoine Pitrou64672132010-01-13 07:55:48 +00006339 ADJUST_INDICES(start, end, self->length);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006340 result = PyInt_FromSsize_t(
6341 stringlib_count(self->str + start, end - start,
Antoine Pitrou64672132010-01-13 07:55:48 +00006342 substring->str, substring->length,
6343 PY_SSIZE_T_MAX)
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006344 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006345
6346 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006347
Guido van Rossumd57fd912000-03-10 22:53:23 +00006348 return result;
6349}
6350
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006351PyDoc_STRVAR(encode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006352 "S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006353\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006354Encodes S using the codec registered for encoding. encoding defaults\n\
6355to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006356handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006357a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6358'xmlcharrefreplace' as well as any other name registered with\n\
6359codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006360
6361static PyObject *
Benjamin Peterson332d7212009-09-18 21:14:55 +00006362unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006363{
Benjamin Peterson332d7212009-09-18 21:14:55 +00006364 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00006365 char *encoding = NULL;
6366 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006367 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006368
Benjamin Peterson332d7212009-09-18 21:14:55 +00006369 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
6370 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006371 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006372 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006373 if (v == NULL)
6374 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006375 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006376 PyErr_Format(PyExc_TypeError,
6377 "encoder did not return a string/unicode object "
6378 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006379 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006380 Py_DECREF(v);
6381 return NULL;
6382 }
6383 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006384
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006385 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006386 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006387}
6388
6389PyDoc_STRVAR(decode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006390 "S.decode([encoding[,errors]]) -> string or unicode\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006391\n\
6392Decodes S using the codec registered for encoding. encoding defaults\n\
6393to the default encoding. errors may be given to set a different error\n\
6394handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6395a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
Éric Araujoa4c81b02012-02-20 02:07:31 +01006396as well as any other name registered with codecs.register_error that is\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006397able to handle UnicodeDecodeErrors.");
6398
6399static PyObject *
Benjamin Peterson332d7212009-09-18 21:14:55 +00006400unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006401{
Benjamin Peterson332d7212009-09-18 21:14:55 +00006402 static char *kwlist[] = {"encoding", "errors", 0};
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006403 char *encoding = NULL;
6404 char *errors = NULL;
6405 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006406
Benjamin Peterson332d7212009-09-18 21:14:55 +00006407 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",
6408 kwlist, &encoding, &errors))
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006409 return NULL;
6410 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006411 if (v == NULL)
6412 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006413 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006414 PyErr_Format(PyExc_TypeError,
6415 "decoder did not return a string/unicode object "
6416 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006417 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006418 Py_DECREF(v);
6419 return NULL;
6420 }
6421 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006422
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006423 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006424 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006425}
6426
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006427PyDoc_STRVAR(expandtabs__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006428 "S.expandtabs([tabsize]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006429\n\
6430Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006431If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006432
6433static PyObject*
6434unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6435{
6436 Py_UNICODE *e;
6437 Py_UNICODE *p;
6438 Py_UNICODE *q;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006439 Py_UNICODE *qe;
6440 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006441 PyUnicodeObject *u;
6442 int tabsize = 8;
6443
6444 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006445 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006446
Thomas Wouters7e474022000-07-16 12:04:32 +00006447 /* First pass: determine size of output string */
Guido van Rossum5bdff602008-03-11 21:18:06 +00006448 i = 0; /* chars up to and including most recent \n or \r */
6449 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6450 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006451 for (p = self->str; p < e; p++)
6452 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006453 if (tabsize > 0) {
6454 incr = tabsize - (j % tabsize); /* cannot overflow */
6455 if (j > PY_SSIZE_T_MAX - incr)
6456 goto overflow1;
6457 j += incr;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006458 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006459 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006460 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006461 if (j > PY_SSIZE_T_MAX - 1)
6462 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006463 j++;
6464 if (*p == '\n' || *p == '\r') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006465 if (i > PY_SSIZE_T_MAX - j)
6466 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006467 i += j;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006468 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006469 }
6470 }
6471
Guido van Rossum5bdff602008-03-11 21:18:06 +00006472 if (i > PY_SSIZE_T_MAX - j)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006473 goto overflow1;
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006474
Guido van Rossumd57fd912000-03-10 22:53:23 +00006475 /* Second pass: create output string and fill it */
6476 u = _PyUnicode_New(i + j);
6477 if (!u)
6478 return NULL;
6479
Guido van Rossum5bdff602008-03-11 21:18:06 +00006480 j = 0; /* same as in first pass */
6481 q = u->str; /* next output char */
6482 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006483
6484 for (p = self->str; p < e; p++)
6485 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006486 if (tabsize > 0) {
6487 i = tabsize - (j % tabsize);
6488 j += i;
6489 while (i--) {
6490 if (q >= qe)
6491 goto overflow2;
6492 *q++ = ' ';
Guido van Rossum5bdff602008-03-11 21:18:06 +00006493 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006494 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00006495 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006496 else {
6497 if (q >= qe)
6498 goto overflow2;
6499 *q++ = *p;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006500 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006501 if (*p == '\n' || *p == '\r')
6502 j = 0;
6503 }
6504
6505 return (PyObject*) u;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006506
6507 overflow2:
6508 Py_DECREF(u);
6509 overflow1:
6510 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6511 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006512}
6513
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006514PyDoc_STRVAR(find__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006515 "S.find(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006516\n\
6517Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran5e3a19d2011-07-27 23:36:51 +08006518such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006519arguments start and end are interpreted as in slice notation.\n\
6520\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006521Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006522
6523static PyObject *
6524unicode_find(PyUnicodeObject *self, PyObject *args)
6525{
Jesus Cea44e81682011-04-20 16:39:15 +02006526 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006527 Py_ssize_t start;
6528 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006529 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006530
Jesus Cea44e81682011-04-20 16:39:15 +02006531 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
6532 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006533 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006534
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006535 result = stringlib_find_slice(
6536 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6537 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6538 start, end
6539 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006540
6541 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006542
6543 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006544}
6545
6546static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006547unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006548{
6549 if (index < 0 || index >= self->length) {
6550 PyErr_SetString(PyExc_IndexError, "string index out of range");
6551 return NULL;
6552 }
6553
6554 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6555}
6556
6557static long
6558unicode_hash(PyUnicodeObject *self)
6559{
Fredrik Lundhdde61642000-07-10 18:27:47 +00006560 /* Since Unicode objects compare equal to their ASCII string
6561 counterparts, they should use the individual character values
6562 as basis for their hash value. This is needed to assure that
6563 strings and Unicode objects behave in the same way as
6564 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006565
Martin v. Löwis18e16552006-02-15 17:27:45 +00006566 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006567 register Py_UNICODE *p;
6568 register long x;
6569
Benjamin Petersonf51c3842012-04-09 14:53:07 -04006570#ifdef Py_DEBUG
Benjamin Peterson26da9202012-02-21 11:08:50 -05006571 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf51c3842012-04-09 14:53:07 -04006572#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006573 if (self->hash != -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006574 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006575 len = PyUnicode_GET_SIZE(self);
Barry Warsaw1e13eb02012-02-20 20:42:21 -05006576 /*
6577 We make the hash of the empty string be 0, rather than using
6578 (prefix ^ suffix), since this slightly obfuscates the hash secret
6579 */
6580 if (len == 0) {
6581 self->hash = 0;
6582 return 0;
6583 }
Fredrik Lundhdde61642000-07-10 18:27:47 +00006584 p = PyUnicode_AS_UNICODE(self);
Barry Warsaw1e13eb02012-02-20 20:42:21 -05006585 x = _Py_HashSecret.prefix;
6586 x ^= *p << 7;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006587 while (--len >= 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006588 x = (1000003*x) ^ *p++;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006589 x ^= PyUnicode_GET_SIZE(self);
Barry Warsaw1e13eb02012-02-20 20:42:21 -05006590 x ^= _Py_HashSecret.suffix;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006591 if (x == -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006592 x = -2;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006593 self->hash = x;
6594 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006595}
6596
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006597PyDoc_STRVAR(index__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006598 "S.index(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006599\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006600Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006601
6602static PyObject *
6603unicode_index(PyUnicodeObject *self, PyObject *args)
6604{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006605 Py_ssize_t result;
Jesus Cea44e81682011-04-20 16:39:15 +02006606 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006607 Py_ssize_t start;
6608 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006609
Jesus Cea44e81682011-04-20 16:39:15 +02006610 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
6611 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006613
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006614 result = stringlib_find_slice(
6615 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6616 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6617 start, end
6618 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006619
6620 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006621
Guido van Rossumd57fd912000-03-10 22:53:23 +00006622 if (result < 0) {
6623 PyErr_SetString(PyExc_ValueError, "substring not found");
6624 return NULL;
6625 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006626
Martin v. Löwis18e16552006-02-15 17:27:45 +00006627 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006628}
6629
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006630PyDoc_STRVAR(islower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006631 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006632\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006633Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006634at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006635
6636static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006637unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006638{
6639 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6640 register const Py_UNICODE *e;
6641 int cased;
6642
Guido van Rossumd57fd912000-03-10 22:53:23 +00006643 /* Shortcut for single character strings */
6644 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006645 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006646
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006647 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006648 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006649 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006650
Guido van Rossumd57fd912000-03-10 22:53:23 +00006651 e = p + PyUnicode_GET_SIZE(self);
6652 cased = 0;
6653 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006654 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006655
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006656 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6657 return PyBool_FromLong(0);
6658 else if (!cased && Py_UNICODE_ISLOWER(ch))
6659 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006660 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006661 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006662}
6663
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006664PyDoc_STRVAR(isupper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006665 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006666\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006667Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006668at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006669
6670static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006671unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006672{
6673 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6674 register const Py_UNICODE *e;
6675 int cased;
6676
Guido van Rossumd57fd912000-03-10 22:53:23 +00006677 /* Shortcut for single character strings */
6678 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006679 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006680
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006681 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006682 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006683 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006684
Guido van Rossumd57fd912000-03-10 22:53:23 +00006685 e = p + PyUnicode_GET_SIZE(self);
6686 cased = 0;
6687 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006688 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006689
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006690 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6691 return PyBool_FromLong(0);
6692 else if (!cased && Py_UNICODE_ISUPPER(ch))
6693 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006694 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006695 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006696}
6697
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006698PyDoc_STRVAR(istitle__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006699 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006700\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006701Return True if S is a titlecased string and there is at least one\n\
6702character in S, i.e. upper- and titlecase characters may only\n\
6703follow uncased characters and lowercase characters only cased ones.\n\
6704Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006705
6706static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006707unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006708{
6709 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6710 register const Py_UNICODE *e;
6711 int cased, previous_is_cased;
6712
Guido van Rossumd57fd912000-03-10 22:53:23 +00006713 /* Shortcut for single character strings */
6714 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006715 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6716 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006717
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006718 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006719 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006720 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006721
Guido van Rossumd57fd912000-03-10 22:53:23 +00006722 e = p + PyUnicode_GET_SIZE(self);
6723 cased = 0;
6724 previous_is_cased = 0;
6725 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006726 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006727
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006728 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6729 if (previous_is_cased)
6730 return PyBool_FromLong(0);
6731 previous_is_cased = 1;
6732 cased = 1;
6733 }
6734 else if (Py_UNICODE_ISLOWER(ch)) {
6735 if (!previous_is_cased)
6736 return PyBool_FromLong(0);
6737 previous_is_cased = 1;
6738 cased = 1;
6739 }
6740 else
6741 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006742 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006743 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006744}
6745
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006746PyDoc_STRVAR(isspace__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006747 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006748\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006749Return True if all characters in S are whitespace\n\
6750and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006751
6752static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006753unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006754{
6755 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6756 register const Py_UNICODE *e;
6757
Guido van Rossumd57fd912000-03-10 22:53:23 +00006758 /* Shortcut for single character strings */
6759 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006760 Py_UNICODE_ISSPACE(*p))
6761 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006762
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006763 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006764 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006765 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006766
Guido van Rossumd57fd912000-03-10 22:53:23 +00006767 e = p + PyUnicode_GET_SIZE(self);
6768 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006769 if (!Py_UNICODE_ISSPACE(*p))
6770 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006771 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006772 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006773}
6774
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006775PyDoc_STRVAR(isalpha__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006776 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006777\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006778Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006779and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006780
6781static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006782unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006783{
6784 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6785 register const Py_UNICODE *e;
6786
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006787 /* Shortcut for single character strings */
6788 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006789 Py_UNICODE_ISALPHA(*p))
6790 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006791
6792 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006793 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006794 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006795
6796 e = p + PyUnicode_GET_SIZE(self);
6797 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006798 if (!Py_UNICODE_ISALPHA(*p))
6799 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006800 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006801 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006802}
6803
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006804PyDoc_STRVAR(isalnum__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006805 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006806\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006807Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006808and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006809
6810static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006811unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006812{
6813 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6814 register const Py_UNICODE *e;
6815
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006816 /* Shortcut for single character strings */
6817 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006818 Py_UNICODE_ISALNUM(*p))
6819 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006820
6821 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006822 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006823 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006824
6825 e = p + PyUnicode_GET_SIZE(self);
6826 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006827 if (!Py_UNICODE_ISALNUM(*p))
6828 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006829 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006830 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006831}
6832
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006833PyDoc_STRVAR(isdecimal__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006834 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006835\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006836Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006837False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006838
6839static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006840unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006841{
6842 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6843 register const Py_UNICODE *e;
6844
Guido van Rossumd57fd912000-03-10 22:53:23 +00006845 /* Shortcut for single character strings */
6846 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006847 Py_UNICODE_ISDECIMAL(*p))
6848 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006849
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006850 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006851 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006852 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006853
Guido van Rossumd57fd912000-03-10 22:53:23 +00006854 e = p + PyUnicode_GET_SIZE(self);
6855 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006856 if (!Py_UNICODE_ISDECIMAL(*p))
6857 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006858 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006859 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006860}
6861
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006862PyDoc_STRVAR(isdigit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006863 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006864\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006865Return True if all characters in S are digits\n\
6866and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006867
6868static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006869unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006870{
6871 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6872 register const Py_UNICODE *e;
6873
Guido van Rossumd57fd912000-03-10 22:53:23 +00006874 /* Shortcut for single character strings */
6875 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006876 Py_UNICODE_ISDIGIT(*p))
6877 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006878
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006879 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006880 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006881 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006882
Guido van Rossumd57fd912000-03-10 22:53:23 +00006883 e = p + PyUnicode_GET_SIZE(self);
6884 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006885 if (!Py_UNICODE_ISDIGIT(*p))
6886 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006887 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006888 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006889}
6890
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006891PyDoc_STRVAR(isnumeric__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006892 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006893\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006894Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006895False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006896
6897static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006898unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006899{
6900 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6901 register const Py_UNICODE *e;
6902
Guido van Rossumd57fd912000-03-10 22:53:23 +00006903 /* Shortcut for single character strings */
6904 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006905 Py_UNICODE_ISNUMERIC(*p))
6906 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006907
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006908 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006909 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006910 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006911
Guido van Rossumd57fd912000-03-10 22:53:23 +00006912 e = p + PyUnicode_GET_SIZE(self);
6913 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006914 if (!Py_UNICODE_ISNUMERIC(*p))
6915 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006916 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006917 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006918}
6919
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006920PyDoc_STRVAR(join__doc__,
Georg Brandl9b4e5822009-10-14 18:48:32 +00006921 "S.join(iterable) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006922\n\
6923Return a string which is the concatenation of the strings in the\n\
Georg Brandl9b4e5822009-10-14 18:48:32 +00006924iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006925
6926static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006927unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006928{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006929 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006930}
6931
Martin v. Löwis18e16552006-02-15 17:27:45 +00006932static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006933unicode_length(PyUnicodeObject *self)
6934{
6935 return self->length;
6936}
6937
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006938PyDoc_STRVAR(ljust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006939 "S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006940\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00006941Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006942done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006943
6944static PyObject *
6945unicode_ljust(PyUnicodeObject *self, PyObject *args)
6946{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006947 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006948 Py_UNICODE fillchar = ' ';
6949
Martin v. Löwis412fb672006-04-13 06:34:32 +00006950 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006951 return NULL;
6952
Tim Peters7a29bd52001-09-12 03:03:31 +00006953 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006954 Py_INCREF(self);
6955 return (PyObject*) self;
6956 }
6957
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006958 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006959}
6960
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006961PyDoc_STRVAR(lower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006962 "S.lower() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006963\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006964Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006965
6966static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006967unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006968{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006969 return fixup(self, fixlower);
6970}
6971
/* Strip modes; the values double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for a strip mode: skip the leading "|O:" (3 characters)
   of the corresponding argument format string. */
#define STRIPNAME(i) (stripformat[i]+3)
6980
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006981/* externally visible for str.strip(unicode) */
6982PyObject *
6983_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6984{
Benjamin Peterson857ce152009-01-31 16:29:18 +00006985 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6986 Py_ssize_t len = PyUnicode_GET_SIZE(self);
6987 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
6988 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6989 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006990
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006991 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006992
Benjamin Peterson857ce152009-01-31 16:29:18 +00006993 i = 0;
6994 if (striptype != RIGHTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006995 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6996 i++;
6997 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00006998 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006999
Benjamin Peterson857ce152009-01-31 16:29:18 +00007000 j = len;
7001 if (striptype != LEFTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007002 do {
7003 j--;
7004 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7005 j++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007006 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007007
Benjamin Peterson857ce152009-01-31 16:29:18 +00007008 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007009 Py_INCREF(self);
7010 return (PyObject*)self;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007011 }
7012 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007013 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007014}
7015
Guido van Rossumd57fd912000-03-10 22:53:23 +00007016
7017static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007018do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007019{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007020 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7021 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007022
Benjamin Peterson857ce152009-01-31 16:29:18 +00007023 i = 0;
7024 if (striptype != RIGHTSTRIP) {
7025 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7026 i++;
7027 }
7028 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007029
Benjamin Peterson857ce152009-01-31 16:29:18 +00007030 j = len;
7031 if (striptype != LEFTSTRIP) {
7032 do {
7033 j--;
7034 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7035 j++;
7036 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007037
Benjamin Peterson857ce152009-01-31 16:29:18 +00007038 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7039 Py_INCREF(self);
7040 return (PyObject*)self;
7041 }
7042 else
7043 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007044}
7045
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007046
7047static PyObject *
7048do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7049{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007050 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007051
Benjamin Peterson857ce152009-01-31 16:29:18 +00007052 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7053 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007054
Benjamin Peterson857ce152009-01-31 16:29:18 +00007055 if (sep != NULL && sep != Py_None) {
7056 if (PyUnicode_Check(sep))
7057 return _PyUnicode_XStrip(self, striptype, sep);
7058 else if (PyString_Check(sep)) {
7059 PyObject *res;
7060 sep = PyUnicode_FromObject(sep);
7061 if (sep==NULL)
7062 return NULL;
7063 res = _PyUnicode_XStrip(self, striptype, sep);
7064 Py_DECREF(sep);
7065 return res;
7066 }
7067 else {
7068 PyErr_Format(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007069 "%s arg must be None, unicode or str",
7070 STRIPNAME(striptype));
Benjamin Peterson857ce152009-01-31 16:29:18 +00007071 return NULL;
7072 }
7073 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007074
Benjamin Peterson857ce152009-01-31 16:29:18 +00007075 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007076}
7077
7078
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007079PyDoc_STRVAR(strip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007080 "S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007081\n\
7082Return a copy of the string S with leading and trailing\n\
7083whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007084If chars is given and not None, remove characters in chars instead.\n\
7085If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007086
7087static PyObject *
7088unicode_strip(PyUnicodeObject *self, PyObject *args)
7089{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007090 if (PyTuple_GET_SIZE(args) == 0)
7091 return do_strip(self, BOTHSTRIP); /* Common case */
7092 else
7093 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007094}
7095
7096
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007097PyDoc_STRVAR(lstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007098 "S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007099\n\
7100Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007101If chars is given and not None, remove characters in chars instead.\n\
7102If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007103
7104static PyObject *
7105unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7106{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007107 if (PyTuple_GET_SIZE(args) == 0)
7108 return do_strip(self, LEFTSTRIP); /* Common case */
7109 else
7110 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007111}
7112
7113
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007114PyDoc_STRVAR(rstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007115 "S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007116\n\
7117Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007118If chars is given and not None, remove characters in chars instead.\n\
7119If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007120
7121static PyObject *
7122unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7123{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007124 if (PyTuple_GET_SIZE(args) == 0)
7125 return do_strip(self, RIGHTSTRIP); /* Common case */
7126 else
7127 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007128}
7129
7130
Guido van Rossumd57fd912000-03-10 22:53:23 +00007131static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007132unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007133{
7134 PyUnicodeObject *u;
7135 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007136 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007137 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007138
7139 if (len < 0)
7140 len = 0;
7141
Tim Peters7a29bd52001-09-12 03:03:31 +00007142 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007143 /* no repeat, return original string */
7144 Py_INCREF(str);
7145 return (PyObject*) str;
7146 }
Tim Peters8f422462000-09-09 06:13:41 +00007147
7148 /* ensure # of chars needed doesn't overflow int and # of bytes
7149 * needed doesn't overflow size_t
7150 */
7151 nchars = len * str->length;
7152 if (len && nchars / len != str->length) {
7153 PyErr_SetString(PyExc_OverflowError,
7154 "repeated string is too long");
7155 return NULL;
7156 }
7157 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7158 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7159 PyErr_SetString(PyExc_OverflowError,
7160 "repeated string is too long");
7161 return NULL;
7162 }
7163 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007164 if (!u)
7165 return NULL;
7166
7167 p = u->str;
7168
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007169 if (str->length == 1 && len > 0) {
7170 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007171 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007172 Py_ssize_t done = 0; /* number of characters copied this far */
7173 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007174 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007175 done = str->length;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007176 }
7177 while (done < nchars) {
Neal Norwitz4677fbf72008-03-25 04:18:18 +00007178 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007179 Py_UNICODE_COPY(p+done, p, n);
7180 done += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007181 }
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007182 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007183
7184 return (PyObject*) u;
7185}
7186
7187PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007188 PyObject *subobj,
7189 PyObject *replobj,
7190 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007191{
7192 PyObject *self;
7193 PyObject *str1;
7194 PyObject *str2;
7195 PyObject *result;
7196
7197 self = PyUnicode_FromObject(obj);
7198 if (self == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007199 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007200 str1 = PyUnicode_FromObject(subobj);
7201 if (str1 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007202 Py_DECREF(self);
7203 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007204 }
7205 str2 = PyUnicode_FromObject(replobj);
7206 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007207 Py_DECREF(self);
7208 Py_DECREF(str1);
7209 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007210 }
Tim Petersced69f82003-09-16 20:30:58 +00007211 result = replace((PyUnicodeObject *)self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007212 (PyUnicodeObject *)str1,
7213 (PyUnicodeObject *)str2,
7214 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007215 Py_DECREF(self);
7216 Py_DECREF(str1);
7217 Py_DECREF(str2);
7218 return result;
7219}
7220
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007221PyDoc_STRVAR(replace__doc__,
Ezio Melotti2f06b782010-06-26 18:44:42 +00007222 "S.replace(old, new[, count]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007223\n\
7224Return a copy of S with all occurrences of substring\n\
Georg Brandl30fadc12008-05-30 07:54:16 +00007225old replaced by new. If the optional argument count is\n\
7226given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007227
7228static PyObject*
7229unicode_replace(PyUnicodeObject *self, PyObject *args)
7230{
7231 PyUnicodeObject *str1;
7232 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007233 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007234 PyObject *result;
7235
Martin v. Löwis18e16552006-02-15 17:27:45 +00007236 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007237 return NULL;
7238 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7239 if (str1 == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007240 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007241 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007242 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007243 Py_DECREF(str1);
7244 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007245 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007246
7247 result = replace(self, str1, str2, maxcount);
7248
7249 Py_DECREF(str1);
7250 Py_DECREF(str2);
7251 return result;
7252}
7253
7254static
7255PyObject *unicode_repr(PyObject *unicode)
7256{
7257 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007258 PyUnicode_GET_SIZE(unicode),
7259 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007260}
7261
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007262PyDoc_STRVAR(rfind__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007263 "S.rfind(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007264\n\
7265Return the highest index in S where substring sub is found,\n\
Senthil Kumaran5e3a19d2011-07-27 23:36:51 +08007266such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007267arguments start and end are interpreted as in slice notation.\n\
7268\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007269Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007270
7271static PyObject *
7272unicode_rfind(PyUnicodeObject *self, PyObject *args)
7273{
Jesus Cea44e81682011-04-20 16:39:15 +02007274 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007275 Py_ssize_t start;
7276 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007277 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007278
Jesus Cea44e81682011-04-20 16:39:15 +02007279 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
7280 &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007281 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007282
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007283 result = stringlib_rfind_slice(
7284 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7285 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7286 start, end
7287 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007288
7289 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007290
7291 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007292}
7293
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007294PyDoc_STRVAR(rindex__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007295 "S.rindex(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007296\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007297Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007298
7299static PyObject *
7300unicode_rindex(PyUnicodeObject *self, PyObject *args)
7301{
Jesus Cea44e81682011-04-20 16:39:15 +02007302 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007303 Py_ssize_t start;
7304 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007305 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007306
Jesus Cea44e81682011-04-20 16:39:15 +02007307 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
7308 &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007309 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007310
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007311 result = stringlib_rfind_slice(
7312 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7313 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7314 start, end
7315 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007316
7317 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007318
Guido van Rossumd57fd912000-03-10 22:53:23 +00007319 if (result < 0) {
7320 PyErr_SetString(PyExc_ValueError, "substring not found");
7321 return NULL;
7322 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007323 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007324}
7325
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007326PyDoc_STRVAR(rjust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007327 "S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007328\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007329Return S right-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007330done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007331
7332static PyObject *
7333unicode_rjust(PyUnicodeObject *self, PyObject *args)
7334{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007335 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007336 Py_UNICODE fillchar = ' ';
7337
Martin v. Löwis412fb672006-04-13 06:34:32 +00007338 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007339 return NULL;
7340
Tim Peters7a29bd52001-09-12 03:03:31 +00007341 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007342 Py_INCREF(self);
7343 return (PyObject*) self;
7344 }
7345
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007346 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007347}
7348
Guido van Rossumd57fd912000-03-10 22:53:23 +00007349static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007350unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007351{
7352 /* standard clamping */
7353 if (start < 0)
7354 start = 0;
7355 if (end < 0)
7356 end = 0;
7357 if (end > self->length)
7358 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007359 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007360 /* full slice, return original string */
7361 Py_INCREF(self);
7362 return (PyObject*) self;
7363 }
7364 if (start > end)
7365 start = end;
7366 /* copy slice */
7367 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007368 end - start);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007369}
7370
7371PyObject *PyUnicode_Split(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007372 PyObject *sep,
7373 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007374{
7375 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007376
Guido van Rossumd57fd912000-03-10 22:53:23 +00007377 s = PyUnicode_FromObject(s);
7378 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007379 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007380 if (sep != NULL) {
7381 sep = PyUnicode_FromObject(sep);
7382 if (sep == NULL) {
7383 Py_DECREF(s);
7384 return NULL;
7385 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007386 }
7387
7388 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7389
7390 Py_DECREF(s);
7391 Py_XDECREF(sep);
7392 return result;
7393}
7394
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007395PyDoc_STRVAR(split__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007396 "S.split([sep [,maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007397\n\
7398Return a list of the words in S, using sep as the\n\
7399delimiter string. If maxsplit is given, at most maxsplit\n\
Georg Brandldfb77db2008-05-11 09:11:40 +00007400splits are done. If sep is not specified or is None, any\n\
Georg Brandlecbbd942008-05-11 20:53:55 +00007401whitespace string is a separator and empty strings are\n\
7402removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007403
7404static PyObject*
7405unicode_split(PyUnicodeObject *self, PyObject *args)
7406{
7407 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007408 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007409
Martin v. Löwis18e16552006-02-15 17:27:45 +00007410 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007411 return NULL;
7412
7413 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007414 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007415 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007416 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007417 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007418 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007419}
7420
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007421PyObject *
7422PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7423{
7424 PyObject* str_obj;
7425 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007426 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00007427
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007428 str_obj = PyUnicode_FromObject(str_in);
7429 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007430 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007431 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00007432 if (!sep_obj) {
7433 Py_DECREF(str_obj);
7434 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007435 }
7436
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007437 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00007438 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7439 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7440 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007441
Fredrik Lundhb9479482006-05-26 17:22:38 +00007442 Py_DECREF(sep_obj);
7443 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007444
7445 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007446}
7447
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007448
7449PyObject *
7450PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7451{
7452 PyObject* str_obj;
7453 PyObject* sep_obj;
7454 PyObject* out;
7455
7456 str_obj = PyUnicode_FromObject(str_in);
7457 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007458 return NULL;
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007459 sep_obj = PyUnicode_FromObject(sep_in);
7460 if (!sep_obj) {
7461 Py_DECREF(str_obj);
7462 return NULL;
7463 }
7464
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007465 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007466 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7467 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7468 );
7469
7470 Py_DECREF(sep_obj);
7471 Py_DECREF(str_obj);
7472
7473 return out;
7474}
7475
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007476PyDoc_STRVAR(partition__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007477 "S.partition(sep) -> (head, sep, tail)\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007478\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007479Search for the separator sep in S, and return the part before it,\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007480the separator itself, and the part after it. If the separator is not\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007481found, return S and two empty strings.");
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007482
7483static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00007484unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007485{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007486 return PyUnicode_Partition((PyObject *)self, separator);
7487}
7488
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007489PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti1fafaab2010-01-25 11:24:37 +00007490 "S.rpartition(sep) -> (head, sep, tail)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007491\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007492Search for the separator sep in S, starting at the end of S, and return\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007493the part before it, the separator itself, and the part after it. If the\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007494separator is not found, return two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007495
7496static PyObject*
7497unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7498{
7499 return PyUnicode_RPartition((PyObject *)self, separator);
7500}
7501
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007502PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007503 PyObject *sep,
7504 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007505{
7506 PyObject *result;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007507
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007508 s = PyUnicode_FromObject(s);
7509 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007510 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007511 if (sep != NULL) {
7512 sep = PyUnicode_FromObject(sep);
7513 if (sep == NULL) {
7514 Py_DECREF(s);
7515 return NULL;
7516 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007517 }
7518
7519 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7520
7521 Py_DECREF(s);
7522 Py_XDECREF(sep);
7523 return result;
7524}
7525
7526PyDoc_STRVAR(rsplit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007527 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007528\n\
7529Return a list of the words in S, using sep as the\n\
7530delimiter string, starting at the end of the string and\n\
7531working to the front. If maxsplit is given, at most maxsplit\n\
7532splits are done. If sep is not specified, any whitespace string\n\
7533is a separator.");
7534
7535static PyObject*
7536unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7537{
7538 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007539 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007540
Martin v. Löwis18e16552006-02-15 17:27:45 +00007541 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007542 return NULL;
7543
7544 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007545 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007546 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007547 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007548 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007549 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007550}
7551
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007552PyDoc_STRVAR(splitlines__doc__,
Raymond Hettingeraad5b022012-06-02 01:42:58 -04007553 "S.splitlines(keepends=False) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007554\n\
7555Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007556Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007557is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007558
7559static PyObject*
7560unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7561{
Guido van Rossum86662912000-04-11 15:38:46 +00007562 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007563
Guido van Rossum86662912000-04-11 15:38:46 +00007564 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007565 return NULL;
7566
Guido van Rossum86662912000-04-11 15:38:46 +00007567 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007568}
7569
7570static
7571PyObject *unicode_str(PyUnicodeObject *self)
7572{
Fred Drakee4315f52000-05-09 19:53:39 +00007573 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007574}
7575
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007576PyDoc_STRVAR(swapcase__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007577 "S.swapcase() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007578\n\
7579Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007580and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007581
7582static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007583unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007584{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007585 return fixup(self, fixswapcase);
7586}
7587
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007588PyDoc_STRVAR(translate__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007589 "S.translate(table) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007590\n\
7591Return a copy of the string S, where all characters have been mapped\n\
7592through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007593Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7594Unmapped characters are left untouched. Characters mapped to None\n\
7595are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007596
7597static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007598unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007599{
Tim Petersced69f82003-09-16 20:30:58 +00007600 return PyUnicode_TranslateCharmap(self->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007601 self->length,
7602 table,
7603 "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007604}
7605
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007606PyDoc_STRVAR(upper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007607 "S.upper() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007608\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007609Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007610
7611static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007612unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007613{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007614 return fixup(self, fixupper);
7615}
7616
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007617PyDoc_STRVAR(zfill__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007618 "S.zfill(width) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007619\n\
Georg Brandl98064072008-09-09 19:26:00 +00007620Pad a numeric string S with zeros on the left, to fill a field\n\
7621of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007622
7623static PyObject *
7624unicode_zfill(PyUnicodeObject *self, PyObject *args)
7625{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007626 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007627 PyUnicodeObject *u;
7628
Martin v. Löwis18e16552006-02-15 17:27:45 +00007629 Py_ssize_t width;
7630 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007631 return NULL;
7632
7633 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007634 if (PyUnicode_CheckExact(self)) {
7635 Py_INCREF(self);
7636 return (PyObject*) self;
7637 }
7638 else
7639 return PyUnicode_FromUnicode(
7640 PyUnicode_AS_UNICODE(self),
7641 PyUnicode_GET_SIZE(self)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007642 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007643 }
7644
7645 fill = width - self->length;
7646
7647 u = pad(self, fill, 0, '0');
7648
Walter Dörwald068325e2002-04-15 13:36:47 +00007649 if (u == NULL)
7650 return NULL;
7651
Guido van Rossumd57fd912000-03-10 22:53:23 +00007652 if (u->str[fill] == '+' || u->str[fill] == '-') {
7653 /* move sign to beginning of string */
7654 u->str[0] = u->str[fill];
7655 u->str[fill] = '0';
7656 }
7657
7658 return (PyObject*) u;
7659}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007660
7661#if 0
7662static PyObject*
Christian Heimes5b970ad2008-02-06 13:33:44 +00007663free_listsize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007664{
Christian Heimes5b970ad2008-02-06 13:33:44 +00007665 return PyInt_FromLong(numfree);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007666}
7667#endif
7668
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007669PyDoc_STRVAR(startswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007670 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007671\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007672Return True if S starts with the specified prefix, False otherwise.\n\
7673With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007674With optional end, stop comparing S at that position.\n\
7675prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007676
7677static PyObject *
7678unicode_startswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007679 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007680{
Georg Brandl24250812006-06-09 18:45:48 +00007681 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007682 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007683 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007684 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007685 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007686
Jesus Cea44e81682011-04-20 16:39:15 +02007687 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007688 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007689 if (PyTuple_Check(subobj)) {
7690 Py_ssize_t i;
7691 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7692 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007693 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007694 if (substring == NULL)
7695 return NULL;
7696 result = tailmatch(self, substring, start, end, -1);
7697 Py_DECREF(substring);
7698 if (result) {
7699 Py_RETURN_TRUE;
7700 }
7701 }
7702 /* nothing matched */
7703 Py_RETURN_FALSE;
7704 }
7705 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottie3685f62011-04-26 05:12:51 +03007706 if (substring == NULL) {
7707 if (PyErr_ExceptionMatches(PyExc_TypeError))
7708 PyErr_Format(PyExc_TypeError, "startswith first arg must be str, "
7709 "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007710 return NULL;
Ezio Melottie3685f62011-04-26 05:12:51 +03007711 }
Georg Brandl24250812006-06-09 18:45:48 +00007712 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007713 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007714 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007715}
7716
7717
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007718PyDoc_STRVAR(endswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007719 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007720\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007721Return True if S ends with the specified suffix, False otherwise.\n\
7722With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007723With optional end, stop comparing S at that position.\n\
7724suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007725
7726static PyObject *
7727unicode_endswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007728 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007729{
Georg Brandl24250812006-06-09 18:45:48 +00007730 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007731 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007732 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007733 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007734 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007735
Jesus Cea44e81682011-04-20 16:39:15 +02007736 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007737 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007738 if (PyTuple_Check(subobj)) {
7739 Py_ssize_t i;
7740 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7741 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007742 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007743 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007744 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007745 result = tailmatch(self, substring, start, end, +1);
7746 Py_DECREF(substring);
7747 if (result) {
7748 Py_RETURN_TRUE;
7749 }
7750 }
7751 Py_RETURN_FALSE;
7752 }
7753 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottie3685f62011-04-26 05:12:51 +03007754 if (substring == NULL) {
7755 if (PyErr_ExceptionMatches(PyExc_TypeError))
7756 PyErr_Format(PyExc_TypeError, "endswith first arg must be str, "
7757 "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007758 return NULL;
Ezio Melottie3685f62011-04-26 05:12:51 +03007759 }
Georg Brandl24250812006-06-09 18:45:48 +00007760 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007761 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007762 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007763}
7764
7765
Eric Smitha9f7d622008-02-17 19:46:49 +00007766/* Implements do_string_format, which is unicode because of stringlib */
7767#include "stringlib/string_format.h"
7768
7769PyDoc_STRVAR(format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007770 "S.format(*args, **kwargs) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007771\n\
Eric Smith6c840852010-11-06 19:43:44 +00007772Return a formatted version of S, using substitutions from args and kwargs.\n\
7773The substitutions are identified by braces ('{' and '}').");
Eric Smitha9f7d622008-02-17 19:46:49 +00007774
Eric Smithdc13b792008-05-30 18:10:04 +00007775static PyObject *
7776unicode__format__(PyObject *self, PyObject *args)
7777{
7778 PyObject *format_spec;
7779 PyObject *result = NULL;
7780 PyObject *tmp = NULL;
7781
7782 /* If 2.x, convert format_spec to the same type as value */
7783 /* This is to allow things like u''.format('') */
7784 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7785 goto done;
7786 if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7787 PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007788 "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
Eric Smithdc13b792008-05-30 18:10:04 +00007789 goto done;
7790 }
7791 tmp = PyObject_Unicode(format_spec);
7792 if (tmp == NULL)
7793 goto done;
7794 format_spec = tmp;
7795
7796 result = _PyUnicode_FormatAdvanced(self,
7797 PyUnicode_AS_UNICODE(format_spec),
7798 PyUnicode_GET_SIZE(format_spec));
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007799 done:
Eric Smithdc13b792008-05-30 18:10:04 +00007800 Py_XDECREF(tmp);
7801 return result;
7802}
7803
Eric Smitha9f7d622008-02-17 19:46:49 +00007804PyDoc_STRVAR(p_format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007805 "S.__format__(format_spec) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007806\n\
Eric Smith6c840852010-11-06 19:43:44 +00007807Return a formatted version of S as described by format_spec.");
Eric Smitha9f7d622008-02-17 19:46:49 +00007808
Robert Schuppenies901c9972008-06-10 10:10:31 +00007809static PyObject *
7810unicode__sizeof__(PyUnicodeObject *v)
7811{
Robert Schuppenies9be2ec12008-07-10 15:24:04 +00007812 return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7813 sizeof(Py_UNICODE) * (v->length + 1));
Robert Schuppenies901c9972008-06-10 10:10:31 +00007814}
7815
7816PyDoc_STRVAR(sizeof__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007817 "S.__sizeof__() -> size of S in memory, in bytes\n\
Robert Schuppenies901c9972008-06-10 10:10:31 +00007818\n\
7819");
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007820
7821static PyObject *
7822unicode_getnewargs(PyUnicodeObject *v)
7823{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007824 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007825}
7826
7827
Guido van Rossumd57fd912000-03-10 22:53:23 +00007828static PyMethodDef unicode_methods[] = {
Benjamin Peterson332d7212009-09-18 21:14:55 +00007829 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007830 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7831 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007832 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007833 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7834 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7835 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7836 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7837 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7838 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7839 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00007840 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007841 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7842 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7843 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007844 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Benjamin Peterson332d7212009-09-18 21:14:55 +00007845 {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007846/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7847 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7848 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7849 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007850 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007851 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007852 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007853 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007854 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7855 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7856 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7857 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7858 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7859 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7860 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7861 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7862 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7863 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7864 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7865 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7866 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7867 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007868 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smitha9f7d622008-02-17 19:46:49 +00007869 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
7870 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
7871 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
7872 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Robert Schuppenies901c9972008-06-10 10:10:31 +00007873 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007874#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007875 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007876#endif
7877
7878#if 0
7879 /* This one is just used for debugging the implementation. */
Christian Heimes5b970ad2008-02-06 13:33:44 +00007880 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007881#endif
7882
Benjamin Peterson857ce152009-01-31 16:29:18 +00007883 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007884 {NULL, NULL}
7885};
7886
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007887static PyObject *
7888unicode_mod(PyObject *v, PyObject *w)
7889{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007890 if (!PyUnicode_Check(v)) {
7891 Py_INCREF(Py_NotImplemented);
7892 return Py_NotImplemented;
7893 }
7894 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007895}
7896
7897static PyNumberMethods unicode_as_number = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007898 0, /*nb_add*/
7899 0, /*nb_subtract*/
7900 0, /*nb_multiply*/
7901 0, /*nb_divide*/
7902 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007903};
7904
Guido van Rossumd57fd912000-03-10 22:53:23 +00007905static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007906 (lenfunc) unicode_length, /* sq_length */
7907 PyUnicode_Concat, /* sq_concat */
7908 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7909 (ssizeargfunc) unicode_getitem, /* sq_item */
7910 (ssizessizeargfunc) unicode_slice, /* sq_slice */
7911 0, /* sq_ass_item */
7912 0, /* sq_ass_slice */
7913 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007914};
7915
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007916static PyObject*
7917unicode_subscript(PyUnicodeObject* self, PyObject* item)
7918{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00007919 if (PyIndex_Check(item)) {
7920 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007921 if (i == -1 && PyErr_Occurred())
7922 return NULL;
7923 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007924 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007925 return unicode_getitem(self, i);
7926 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007927 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007928 Py_UNICODE* source_buf;
7929 Py_UNICODE* result_buf;
7930 PyObject* result;
7931
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007932 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007933 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007934 return NULL;
7935 }
7936
7937 if (slicelength <= 0) {
7938 return PyUnicode_FromUnicode(NULL, 0);
Thomas Wouters3ccec682007-08-28 15:28:19 +00007939 } else if (start == 0 && step == 1 && slicelength == self->length &&
7940 PyUnicode_CheckExact(self)) {
7941 Py_INCREF(self);
7942 return (PyObject *)self;
7943 } else if (step == 1) {
7944 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007945 } else {
7946 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Neal Norwitz419fd492008-03-17 20:22:43 +00007947 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
7948 sizeof(Py_UNICODE));
Benjamin Peterson857ce152009-01-31 16:29:18 +00007949
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007950 if (result_buf == NULL)
7951 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007952
7953 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7954 result_buf[i] = source_buf[cur];
7955 }
Tim Petersced69f82003-09-16 20:30:58 +00007956
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007957 result = PyUnicode_FromUnicode(result_buf, slicelength);
Neal Norwitz419fd492008-03-17 20:22:43 +00007958 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007959 return result;
7960 }
7961 } else {
7962 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7963 return NULL;
7964 }
7965}
7966
7967static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007968 (lenfunc)unicode_length, /* mp_length */
7969 (binaryfunc)unicode_subscript, /* mp_subscript */
7970 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007971};
7972
Martin v. Löwis18e16552006-02-15 17:27:45 +00007973static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007974unicode_buffer_getreadbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007975 Py_ssize_t index,
7976 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007977{
7978 if (index != 0) {
7979 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007980 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007981 return -1;
7982 }
7983 *ptr = (void *) self->str;
7984 return PyUnicode_GET_DATA_SIZE(self);
7985}
7986
Martin v. Löwis18e16552006-02-15 17:27:45 +00007987static Py_ssize_t
7988unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007989 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007990{
7991 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007992 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007993 return -1;
7994}
7995
7996static int
7997unicode_buffer_getsegcount(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007998 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007999{
8000 if (lenp)
8001 *lenp = PyUnicode_GET_DATA_SIZE(self);
8002 return 1;
8003}
8004
Martin v. Löwiseb079f12006-02-16 14:32:27 +00008005static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008006unicode_buffer_getcharbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008007 Py_ssize_t index,
8008 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008009{
8010 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00008011
Guido van Rossumd57fd912000-03-10 22:53:23 +00008012 if (index != 0) {
8013 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008014 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008015 return -1;
8016 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008017 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008018 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008019 return -1;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008020 *ptr = (void *) PyString_AS_STRING(str);
8021 return PyString_GET_SIZE(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008022}
8023
8024/* Helpers for PyUnicode_Format() */
8025
8026static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008027getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008028{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008029 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008030 if (argidx < arglen) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008031 (*p_argidx)++;
8032 if (arglen < 0)
8033 return args;
8034 else
8035 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008036 }
8037 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008038 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008039 return NULL;
8040}
8041
/* Bit flags collected while parsing a %-format specifier; each name is
   set by the flag character shown. */
#define F_LJUST (1 << 0)    /* '-' */
#define F_SIGN  (1 << 1)    /* '+' */
#define F_BLANK (1 << 2)    /* ' ' */
#define F_ALT   (1 << 3)    /* '#' */
#define F_ZERO  (1 << 4)    /* '0' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008047
Martin v. Löwis18e16552006-02-15 17:27:45 +00008048static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008049strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008050{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008051 register Py_ssize_t i;
8052 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008053 for (i = len - 1; i >= 0; i--)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008054 buffer[i] = (Py_UNICODE) charbuffer[i];
Guido van Rossumd57fd912000-03-10 22:53:23 +00008055
Guido van Rossumd57fd912000-03-10 22:53:23 +00008056 return len;
8057}
8058
Neal Norwitzfc76d632006-01-10 06:03:13 +00008059static int
Neal Norwitzfc76d632006-01-10 06:03:13 +00008060longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8061{
Tim Peters15231542006-02-16 01:08:01 +00008062 Py_ssize_t result;
8063
Neal Norwitzfc76d632006-01-10 06:03:13 +00008064 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008065 result = strtounicode(buffer, (char *)buffer);
8066 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008067}
8068
Guido van Rossum078151d2002-08-11 04:24:12 +00008069/* XXX To save some code duplication, formatfloat/long/int could have been
8070 shared with stringobject.c, converting from 8-bit to Unicode after the
8071 formatting is done. */
8072
Mark Dickinson18cfada2009-11-23 18:46:41 +00008073/* Returns a new reference to a PyUnicode object, or NULL on failure. */
8074
8075static PyObject *
8076formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008077{
Mark Dickinson18cfada2009-11-23 18:46:41 +00008078 char *p;
8079 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008080 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008081
Guido van Rossumd57fd912000-03-10 22:53:23 +00008082 x = PyFloat_AsDouble(v);
8083 if (x == -1.0 && PyErr_Occurred())
Mark Dickinson18cfada2009-11-23 18:46:41 +00008084 return NULL;
8085
Guido van Rossumd57fd912000-03-10 22:53:23 +00008086 if (prec < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008087 prec = 6;
Mark Dickinsond4814bf2009-03-29 16:24:29 +00008088
Mark Dickinson18cfada2009-11-23 18:46:41 +00008089 p = PyOS_double_to_string(x, type, prec,
8090 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
8091 if (p == NULL)
8092 return NULL;
8093 result = PyUnicode_FromStringAndSize(p, strlen(p));
8094 PyMem_Free(p);
8095 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008096}
8097
Tim Peters38fd5b62000-09-21 05:43:11 +00008098static PyObject*
8099formatlong(PyObject *val, int flags, int prec, int type)
8100{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008101 char *buf;
8102 int i, len;
8103 PyObject *str; /* temporary string object. */
8104 PyUnicodeObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008105
Benjamin Peterson857ce152009-01-31 16:29:18 +00008106 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8107 if (!str)
8108 return NULL;
8109 result = _PyUnicode_New(len);
8110 if (!result) {
8111 Py_DECREF(str);
8112 return NULL;
8113 }
8114 for (i = 0; i < len; i++)
8115 result->str[i] = buf[i];
8116 result->str[len] = 0;
8117 Py_DECREF(str);
8118 return (PyObject*)result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008119}
8120
Guido van Rossumd57fd912000-03-10 22:53:23 +00008121static int
8122formatint(Py_UNICODE *buf,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008123 size_t buflen,
8124 int flags,
8125 int prec,
8126 int type,
8127 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008128{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008129 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008130 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8131 * + 1 + 1
8132 * = 24
8133 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008134 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008135 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008136 long x;
8137
8138 x = PyInt_AsLong(v);
8139 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008140 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008141 if (x < 0 && type == 'u') {
8142 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008143 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008144 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8145 sign = "-";
8146 else
8147 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008148 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008149 prec = 1;
8150
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008151 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8152 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008153 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008154 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008155 PyErr_SetString(PyExc_OverflowError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008156 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008157 return -1;
8158 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008159
8160 if ((flags & F_ALT) &&
8161 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00008162 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008163 * of issues that cause pain:
8164 * - when 0 is being converted, the C standard leaves off
8165 * the '0x' or '0X', which is inconsistent with other
8166 * %#x/%#X conversions and inconsistent with Python's
8167 * hex() function
8168 * - there are platforms that violate the standard and
8169 * convert 0 with the '0x' or '0X'
8170 * (Metrowerks, Compaq Tru64)
8171 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008172 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008173 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008174 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008175 * We can achieve the desired consistency by inserting our
8176 * own '0x' or '0X' prefix, and substituting %x/%X in place
8177 * of %#x/%#X.
8178 *
8179 * Note that this is the same approach as used in
8180 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008181 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008182 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8183 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008184 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008185 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008186 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8187 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008188 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008189 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008190 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008191 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008192 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008193 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008194}
8195
8196static int
8197formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008198 size_t buflen,
8199 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008200{
Ezio Melotti32125152010-02-25 17:36:04 +00008201 PyObject *unistr;
8202 char *str;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008203 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008204 if (PyUnicode_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008205 if (PyUnicode_GET_SIZE(v) != 1)
8206 goto onError;
8207 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008208 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008209
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008210 else if (PyString_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008211 if (PyString_GET_SIZE(v) != 1)
8212 goto onError;
Ezio Melotti32125152010-02-25 17:36:04 +00008213 /* #7649: "u'%c' % char" should behave like "u'%s' % char" and fail
8214 with a UnicodeDecodeError if 'char' is not decodable with the
8215 default encoding (usually ASCII, but it might be something else) */
8216 str = PyString_AS_STRING(v);
8217 if ((unsigned char)str[0] > 0x7F) {
8218 /* the char is not ASCII; try to decode the string using the
8219 default encoding and return -1 to let the UnicodeDecodeError
8220 be raised if the string can't be decoded */
8221 unistr = PyUnicode_Decode(str, 1, NULL, "strict");
8222 if (unistr == NULL)
8223 return -1;
8224 buf[0] = PyUnicode_AS_UNICODE(unistr)[0];
8225 Py_DECREF(unistr);
8226 }
8227 else
8228 buf[0] = (Py_UNICODE)str[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008229 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008230
8231 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008232 /* Integer input truncated to a character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008233 long x;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008234 x = PyInt_AsLong(v);
8235 if (x == -1 && PyErr_Occurred())
8236 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008237#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008238 if (x < 0 || x > 0x10ffff) {
8239 PyErr_SetString(PyExc_OverflowError,
8240 "%c arg not in range(0x110000) "
8241 "(wide Python build)");
8242 return -1;
8243 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008244#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008245 if (x < 0 || x > 0xffff) {
8246 PyErr_SetString(PyExc_OverflowError,
8247 "%c arg not in range(0x10000) "
8248 "(narrow Python build)");
8249 return -1;
8250 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008251#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008252 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008253 }
8254 buf[1] = '\0';
8255 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008256
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008257 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008258 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008259 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008260 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008261}
8262
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the ints &
   chars are formatted. XXX This is a magic number. Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format. For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the macro behaves as a
   single operand in any expression (e.g. "sizeof FORMATBUFLEN").
*/
#define FORMATBUFLEN ((size_t)120)
8272
Guido van Rossumd57fd912000-03-10 22:53:23 +00008273PyObject *PyUnicode_Format(PyObject *format,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008274 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008275{
8276 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008277 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008278 int args_owned = 0;
8279 PyUnicodeObject *result = NULL;
8280 PyObject *dict = NULL;
8281 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008282
Guido van Rossumd57fd912000-03-10 22:53:23 +00008283 if (format == NULL || args == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008284 PyErr_BadInternalCall();
8285 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008286 }
8287 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008288 if (uformat == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008289 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008290 fmt = PyUnicode_AS_UNICODE(uformat);
8291 fmtcnt = PyUnicode_GET_SIZE(uformat);
8292
8293 reslen = rescnt = fmtcnt + 100;
8294 result = _PyUnicode_New(reslen);
8295 if (result == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008296 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008297 res = PyUnicode_AS_UNICODE(result);
8298
8299 if (PyTuple_Check(args)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008300 arglen = PyTuple_Size(args);
8301 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008302 }
8303 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008304 arglen = -1;
8305 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008306 }
Benjamin Petersonda2c7eb2013-03-23 22:32:00 -05008307 if (Py_TYPE(args)->tp_as_mapping && Py_TYPE(args)->tp_as_mapping->mp_subscript &&
8308 !PyTuple_Check(args) && !PyObject_TypeCheck(args, &PyBaseString_Type))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008309 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008310
8311 while (--fmtcnt >= 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008312 if (*fmt != '%') {
8313 if (--rescnt < 0) {
8314 rescnt = fmtcnt + 100;
8315 reslen += rescnt;
8316 if (_PyUnicode_Resize(&result, reslen) < 0)
8317 goto onError;
8318 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8319 --rescnt;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008320 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008321 *res++ = *fmt++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008322 }
8323 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008324 /* Got a format specifier */
8325 int flags = 0;
8326 Py_ssize_t width = -1;
8327 int prec = -1;
8328 Py_UNICODE c = '\0';
8329 Py_UNICODE fill;
8330 int isnumok;
8331 PyObject *v = NULL;
8332 PyObject *temp = NULL;
8333 Py_UNICODE *pbuf;
8334 Py_UNICODE sign;
8335 Py_ssize_t len;
Mark Dickinson18cfada2009-11-23 18:46:41 +00008336 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008337
8338 fmt++;
8339 if (*fmt == '(') {
8340 Py_UNICODE *keystart;
8341 Py_ssize_t keylen;
8342 PyObject *key;
8343 int pcount = 1;
8344
8345 if (dict == NULL) {
8346 PyErr_SetString(PyExc_TypeError,
8347 "format requires a mapping");
8348 goto onError;
8349 }
8350 ++fmt;
8351 --fmtcnt;
8352 keystart = fmt;
8353 /* Skip over balanced parentheses */
8354 while (pcount > 0 && --fmtcnt >= 0) {
8355 if (*fmt == ')')
8356 --pcount;
8357 else if (*fmt == '(')
8358 ++pcount;
8359 fmt++;
8360 }
8361 keylen = fmt - keystart - 1;
8362 if (fmtcnt < 0 || pcount > 0) {
8363 PyErr_SetString(PyExc_ValueError,
8364 "incomplete format key");
8365 goto onError;
8366 }
8367#if 0
8368 /* keys are converted to strings using UTF-8 and
8369 then looked up since Python uses strings to hold
8370 variables names etc. in its namespaces and we
8371 wouldn't want to break common idioms. */
8372 key = PyUnicode_EncodeUTF8(keystart,
8373 keylen,
8374 NULL);
8375#else
8376 key = PyUnicode_FromUnicode(keystart, keylen);
8377#endif
8378 if (key == NULL)
8379 goto onError;
8380 if (args_owned) {
8381 Py_DECREF(args);
8382 args_owned = 0;
8383 }
8384 args = PyObject_GetItem(dict, key);
8385 Py_DECREF(key);
8386 if (args == NULL) {
8387 goto onError;
8388 }
8389 args_owned = 1;
8390 arglen = -1;
8391 argidx = -2;
8392 }
8393 while (--fmtcnt >= 0) {
8394 switch (c = *fmt++) {
8395 case '-': flags |= F_LJUST; continue;
8396 case '+': flags |= F_SIGN; continue;
8397 case ' ': flags |= F_BLANK; continue;
8398 case '#': flags |= F_ALT; continue;
8399 case '0': flags |= F_ZERO; continue;
8400 }
8401 break;
8402 }
8403 if (c == '*') {
8404 v = getnextarg(args, arglen, &argidx);
8405 if (v == NULL)
8406 goto onError;
8407 if (!PyInt_Check(v)) {
8408 PyErr_SetString(PyExc_TypeError,
8409 "* wants int");
8410 goto onError;
8411 }
Serhiy Storchaka74f49ab2013-01-19 12:55:39 +02008412 width = PyInt_AsSsize_t(v);
8413 if (width == -1 && PyErr_Occurred())
8414 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008415 if (width < 0) {
8416 flags |= F_LJUST;
8417 width = -width;
8418 }
8419 if (--fmtcnt >= 0)
8420 c = *fmt++;
8421 }
8422 else if (c >= '0' && c <= '9') {
8423 width = c - '0';
8424 while (--fmtcnt >= 0) {
8425 c = *fmt++;
8426 if (c < '0' || c > '9')
8427 break;
Mark Dickinson75d36002012-10-28 10:00:46 +00008428 if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008429 PyErr_SetString(PyExc_ValueError,
8430 "width too big");
8431 goto onError;
8432 }
8433 width = width*10 + (c - '0');
8434 }
8435 }
8436 if (c == '.') {
8437 prec = 0;
8438 if (--fmtcnt >= 0)
8439 c = *fmt++;
8440 if (c == '*') {
8441 v = getnextarg(args, arglen, &argidx);
8442 if (v == NULL)
8443 goto onError;
8444 if (!PyInt_Check(v)) {
8445 PyErr_SetString(PyExc_TypeError,
8446 "* wants int");
8447 goto onError;
8448 }
Serhiy Storchaka74f49ab2013-01-19 12:55:39 +02008449 prec = _PyInt_AsInt(v);
8450 if (prec == -1 && PyErr_Occurred())
8451 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008452 if (prec < 0)
8453 prec = 0;
8454 if (--fmtcnt >= 0)
8455 c = *fmt++;
8456 }
8457 else if (c >= '0' && c <= '9') {
8458 prec = c - '0';
8459 while (--fmtcnt >= 0) {
Stefan Krah0b9201f2010-07-19 18:06:46 +00008460 c = *fmt++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008461 if (c < '0' || c > '9')
8462 break;
Mark Dickinson75d36002012-10-28 10:00:46 +00008463 if (prec > (INT_MAX - ((int)c - '0')) / 10) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008464 PyErr_SetString(PyExc_ValueError,
8465 "prec too big");
8466 goto onError;
8467 }
8468 prec = prec*10 + (c - '0');
8469 }
8470 }
8471 } /* prec */
8472 if (fmtcnt >= 0) {
8473 if (c == 'h' || c == 'l' || c == 'L') {
8474 if (--fmtcnt >= 0)
8475 c = *fmt++;
8476 }
8477 }
8478 if (fmtcnt < 0) {
8479 PyErr_SetString(PyExc_ValueError,
8480 "incomplete format");
8481 goto onError;
8482 }
8483 if (c != '%') {
8484 v = getnextarg(args, arglen, &argidx);
8485 if (v == NULL)
8486 goto onError;
8487 }
8488 sign = 0;
8489 fill = ' ';
8490 switch (c) {
8491
8492 case '%':
8493 pbuf = formatbuf;
8494 /* presume that buffer length is at least 1 */
8495 pbuf[0] = '%';
8496 len = 1;
8497 break;
8498
8499 case 's':
8500 case 'r':
Victor Stinner95affc42010-03-22 12:24:37 +00008501 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008502 temp = v;
8503 Py_INCREF(temp);
8504 }
8505 else {
8506 PyObject *unicode;
8507 if (c == 's')
8508 temp = PyObject_Unicode(v);
8509 else
8510 temp = PyObject_Repr(v);
8511 if (temp == NULL)
8512 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008513 if (PyUnicode_Check(temp))
8514 /* nothing to do */;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008515 else if (PyString_Check(temp)) {
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008516 /* convert to string to Unicode */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008517 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8518 PyString_GET_SIZE(temp),
8519 NULL,
8520 "strict");
8521 Py_DECREF(temp);
8522 temp = unicode;
8523 if (temp == NULL)
8524 goto onError;
8525 }
8526 else {
8527 Py_DECREF(temp);
8528 PyErr_SetString(PyExc_TypeError,
8529 "%s argument has non-string str()");
8530 goto onError;
8531 }
8532 }
8533 pbuf = PyUnicode_AS_UNICODE(temp);
8534 len = PyUnicode_GET_SIZE(temp);
8535 if (prec >= 0 && len > prec)
8536 len = prec;
8537 break;
8538
8539 case 'i':
8540 case 'd':
8541 case 'u':
8542 case 'o':
8543 case 'x':
8544 case 'X':
8545 if (c == 'i')
8546 c = 'd';
8547 isnumok = 0;
8548 if (PyNumber_Check(v)) {
8549 PyObject *iobj=NULL;
8550
8551 if (PyInt_Check(v) || (PyLong_Check(v))) {
8552 iobj = v;
8553 Py_INCREF(iobj);
8554 }
8555 else {
8556 iobj = PyNumber_Int(v);
8557 if (iobj==NULL) iobj = PyNumber_Long(v);
8558 }
8559 if (iobj!=NULL) {
8560 if (PyInt_Check(iobj)) {
8561 isnumok = 1;
8562 pbuf = formatbuf;
8563 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8564 flags, prec, c, iobj);
8565 Py_DECREF(iobj);
8566 if (len < 0)
8567 goto onError;
8568 sign = 1;
8569 }
8570 else if (PyLong_Check(iobj)) {
8571 isnumok = 1;
8572 temp = formatlong(iobj, flags, prec, c);
8573 Py_DECREF(iobj);
8574 if (!temp)
8575 goto onError;
8576 pbuf = PyUnicode_AS_UNICODE(temp);
8577 len = PyUnicode_GET_SIZE(temp);
8578 sign = 1;
8579 }
8580 else {
8581 Py_DECREF(iobj);
8582 }
8583 }
8584 }
8585 if (!isnumok) {
8586 PyErr_Format(PyExc_TypeError,
8587 "%%%c format: a number is required, "
8588 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8589 goto onError;
8590 }
8591 if (flags & F_ZERO)
8592 fill = '0';
8593 break;
8594
8595 case 'e':
8596 case 'E':
8597 case 'f':
8598 case 'F':
8599 case 'g':
8600 case 'G':
Mark Dickinson18cfada2009-11-23 18:46:41 +00008601 temp = formatfloat(v, flags, prec, c);
8602 if (temp == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008603 goto onError;
Mark Dickinson18cfada2009-11-23 18:46:41 +00008604 pbuf = PyUnicode_AS_UNICODE(temp);
8605 len = PyUnicode_GET_SIZE(temp);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008606 sign = 1;
8607 if (flags & F_ZERO)
8608 fill = '0';
8609 break;
8610
8611 case 'c':
8612 pbuf = formatbuf;
8613 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8614 if (len < 0)
8615 goto onError;
8616 break;
8617
8618 default:
8619 PyErr_Format(PyExc_ValueError,
8620 "unsupported format character '%c' (0x%x) "
8621 "at index %zd",
8622 (31<=c && c<=126) ? (char)c : '?',
8623 (int)c,
8624 (Py_ssize_t)(fmt - 1 -
8625 PyUnicode_AS_UNICODE(uformat)));
Benjamin Peterson857ce152009-01-31 16:29:18 +00008626 goto onError;
8627 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008628 if (sign) {
8629 if (*pbuf == '-' || *pbuf == '+') {
8630 sign = *pbuf++;
8631 len--;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008632 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008633 else if (flags & F_SIGN)
8634 sign = '+';
8635 else if (flags & F_BLANK)
8636 sign = ' ';
8637 else
8638 sign = 0;
8639 }
8640 if (width < len)
8641 width = len;
8642 if (rescnt - (sign != 0) < width) {
8643 reslen -= rescnt;
8644 rescnt = width + fmtcnt + 100;
8645 reslen += rescnt;
8646 if (reslen < 0) {
8647 Py_XDECREF(temp);
8648 PyErr_NoMemory();
8649 goto onError;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008650 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008651 if (_PyUnicode_Resize(&result, reslen) < 0) {
8652 Py_XDECREF(temp);
8653 goto onError;
8654 }
8655 res = PyUnicode_AS_UNICODE(result)
8656 + reslen - rescnt;
8657 }
8658 if (sign) {
8659 if (fill != ' ')
8660 *res++ = sign;
8661 rescnt--;
8662 if (width > len)
8663 width--;
8664 }
8665 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8666 assert(pbuf[0] == '0');
8667 assert(pbuf[1] == c);
8668 if (fill != ' ') {
8669 *res++ = *pbuf++;
8670 *res++ = *pbuf++;
8671 }
8672 rescnt -= 2;
8673 width -= 2;
8674 if (width < 0)
8675 width = 0;
8676 len -= 2;
8677 }
8678 if (width > len && !(flags & F_LJUST)) {
8679 do {
8680 --rescnt;
8681 *res++ = fill;
8682 } while (--width > len);
8683 }
8684 if (fill == ' ') {
8685 if (sign)
8686 *res++ = sign;
8687 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8688 assert(pbuf[0] == '0');
8689 assert(pbuf[1] == c);
8690 *res++ = *pbuf++;
8691 *res++ = *pbuf++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008692 }
8693 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008694 Py_UNICODE_COPY(res, pbuf, len);
8695 res += len;
8696 rescnt -= len;
8697 while (--width >= len) {
8698 --rescnt;
8699 *res++ = ' ';
8700 }
8701 if (dict && (argidx < arglen) && c != '%') {
8702 PyErr_SetString(PyExc_TypeError,
8703 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008704 Py_XDECREF(temp);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008705 goto onError;
8706 }
8707 Py_XDECREF(temp);
8708 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008709 } /* until end */
8710 if (argidx < arglen && !dict) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008711 PyErr_SetString(PyExc_TypeError,
8712 "not all arguments converted during string formatting");
8713 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008714 }
8715
Thomas Woutersa96affe2006-03-12 00:29:36 +00008716 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008717 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008718 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008719 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008720 }
8721 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008722 return (PyObject *)result;
8723
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008724 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008725 Py_XDECREF(result);
8726 Py_DECREF(uformat);
8727 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008728 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008729 }
8730 return NULL;
8731}
8732
8733static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008734 (readbufferproc) unicode_buffer_getreadbuf,
8735 (writebufferproc) unicode_buffer_getwritebuf,
8736 (segcountproc) unicode_buffer_getsegcount,
8737 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008738};
8739
Jeremy Hylton938ace62002-07-17 16:30:39 +00008740static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008741unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8742
Tim Peters6d6c1a32001-08-02 04:15:00 +00008743static PyObject *
8744unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8745{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008746 PyObject *x = NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008747 static char *kwlist[] = {"string", "encoding", "errors", 0};
8748 char *encoding = NULL;
8749 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00008750
Benjamin Peterson857ce152009-01-31 16:29:18 +00008751 if (type != &PyUnicode_Type)
8752 return unicode_subtype_new(type, args, kwds);
8753 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008754 kwlist, &x, &encoding, &errors))
Benjamin Peterson857ce152009-01-31 16:29:18 +00008755 return NULL;
8756 if (x == NULL)
8757 return (PyObject *)_PyUnicode_New(0);
8758 if (encoding == NULL && errors == NULL)
8759 return PyObject_Unicode(x);
8760 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008761 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008762}
8763
Guido van Rossume023fe02001-08-30 03:12:59 +00008764static PyObject *
8765unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8766{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008767 PyUnicodeObject *tmp, *pnew;
8768 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008769
Benjamin Peterson857ce152009-01-31 16:29:18 +00008770 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8771 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8772 if (tmp == NULL)
8773 return NULL;
8774 assert(PyUnicode_Check(tmp));
8775 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8776 if (pnew == NULL) {
8777 Py_DECREF(tmp);
8778 return NULL;
8779 }
8780 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8781 if (pnew->str == NULL) {
8782 _Py_ForgetReference((PyObject *)pnew);
8783 PyObject_Del(pnew);
8784 Py_DECREF(tmp);
8785 return PyErr_NoMemory();
8786 }
8787 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8788 pnew->length = n;
8789 pnew->hash = tmp->hash;
8790 Py_DECREF(tmp);
8791 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008792}
8793
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008794PyDoc_STRVAR(unicode_doc,
Chris Jerdonekad4b0002012-10-07 20:37:54 -07008795 "unicode(object='') -> unicode object\n\
8796unicode(string[, encoding[, errors]]) -> unicode object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00008797\n\
8798Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008799encoding defaults to the current default string encoding.\n\
8800errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008801
Guido van Rossumd57fd912000-03-10 22:53:23 +00008802PyTypeObject PyUnicode_Type = {
Martin v. Löwis68192102007-07-21 06:55:02 +00008803 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00008804 "unicode", /* tp_name */
8805 sizeof(PyUnicodeObject), /* tp_size */
8806 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008807 /* Slots */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008808 (destructor)unicode_dealloc, /* tp_dealloc */
8809 0, /* tp_print */
8810 0, /* tp_getattr */
8811 0, /* tp_setattr */
8812 0, /* tp_compare */
8813 unicode_repr, /* tp_repr */
8814 &unicode_as_number, /* tp_as_number */
8815 &unicode_as_sequence, /* tp_as_sequence */
8816 &unicode_as_mapping, /* tp_as_mapping */
8817 (hashfunc) unicode_hash, /* tp_hash*/
8818 0, /* tp_call*/
8819 (reprfunc) unicode_str, /* tp_str */
8820 PyObject_GenericGetAttr, /* tp_getattro */
8821 0, /* tp_setattro */
8822 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008823 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008824 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008825 unicode_doc, /* tp_doc */
8826 0, /* tp_traverse */
8827 0, /* tp_clear */
8828 PyUnicode_RichCompare, /* tp_richcompare */
8829 0, /* tp_weaklistoffset */
8830 0, /* tp_iter */
8831 0, /* tp_iternext */
8832 unicode_methods, /* tp_methods */
8833 0, /* tp_members */
8834 0, /* tp_getset */
8835 &PyBaseString_Type, /* tp_base */
8836 0, /* tp_dict */
8837 0, /* tp_descr_get */
8838 0, /* tp_descr_set */
8839 0, /* tp_dictoffset */
8840 0, /* tp_init */
8841 0, /* tp_alloc */
8842 unicode_new, /* tp_new */
8843 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008844};
8845
8846/* Initialize the Unicode implementation */
8847
Thomas Wouters78890102000-07-22 19:25:51 +00008848void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008849{
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008850 /* XXX - move this array to unicodectype.c ? */
8851 Py_UNICODE linebreak[] = {
8852 0x000A, /* LINE FEED */
8853 0x000D, /* CARRIAGE RETURN */
8854 0x001C, /* FILE SEPARATOR */
8855 0x001D, /* GROUP SEPARATOR */
8856 0x001E, /* RECORD SEPARATOR */
8857 0x0085, /* NEXT LINE */
8858 0x2028, /* LINE SEPARATOR */
8859 0x2029, /* PARAGRAPH SEPARATOR */
8860 };
8861
Fred Drakee4315f52000-05-09 19:53:39 +00008862 /* Init the implementation */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02008863 if (!unicode_empty) {
8864 unicode_empty = _PyUnicode_New(0);
8865 if (!unicode_empty)
8866 return;
8867 }
Neal Norwitze1fdb322006-07-21 05:32:28 +00008868
Guido van Rossumcacfc072002-05-24 19:01:59 +00008869 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008870 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008871
8872 /* initialize the linebreak bloom filter */
8873 bloom_linebreak = make_bloom_mask(
8874 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8875 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00008876
8877 PyType_Ready(&EncodingMapType);
Benjamin Peterson6da3ed62012-10-30 23:21:10 -04008878
8879 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
8880 Py_FatalError("Can't initialize field name iterator type");
8881
8882 if (PyType_Ready(&PyFormatterIter_Type) < 0)
8883 Py_FatalError("Can't initialize formatter iter type");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008884}
8885
8886/* Finalize the Unicode implementation */
8887
Christian Heimes3b718a72008-02-14 12:47:33 +00008888int
8889PyUnicode_ClearFreeList(void)
8890{
8891 int freelist_size = numfree;
8892 PyUnicodeObject *u;
8893
8894 for (u = free_list; u != NULL;) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008895 PyUnicodeObject *v = u;
8896 u = *(PyUnicodeObject **)u;
8897 if (v->str)
8898 PyObject_DEL(v->str);
8899 Py_XDECREF(v->defenc);
8900 PyObject_Del(v);
8901 numfree--;
Christian Heimes3b718a72008-02-14 12:47:33 +00008902 }
8903 free_list = NULL;
8904 assert(numfree == 0);
8905 return freelist_size;
8906}
8907
Guido van Rossumd57fd912000-03-10 22:53:23 +00008908void
Thomas Wouters78890102000-07-22 19:25:51 +00008909_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008910{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008911 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008912
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02008913 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008914
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02008915 for (i = 0; i < 256; i++)
8916 Py_CLEAR(unicode_latin1[i]);
8917
Christian Heimes3b718a72008-02-14 12:47:33 +00008918 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00008919}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008920
Anthony Baxterac6bd462006-04-13 02:06:09 +00008921#ifdef __cplusplus
8922}
8923#endif