blob: d44a298557ea697bfe3a3e28f4e92a196ff4ab86 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson1c5d21d2009-01-31 22:33:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Guido van Rossumd57fd912000-03-10 22:53:23 +000052/* Limit for the Unicode object free list */
53
Christian Heimes5b970ad2008-02-06 13:33:44 +000054#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000055
56/* Limit for the Unicode object free list stay alive optimization.
57
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000060 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000061
Christian Heimes5b970ad2008-02-06 13:33:44 +000062 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000063 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000064 malloc()-overhead) bytes of unused garbage.
65
66 Setting the limit to 0 effectively turns the feature off.
67
Guido van Rossumfd4b9572000-04-10 13:51:10 +000068 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71*/
72
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
75/* Endianness switches; defaults to little endian */
76
77#ifdef WORDS_BIGENDIAN
78# define BYTEORDER_IS_BIG_ENDIAN
79#else
80# define BYTEORDER_IS_LITTLE_ENDIAN
81#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
Serhiy Storchakac59c85c2013-01-26 12:13:40 +020085NOTE: In the interpreter's initialization phase, some globals are currently
86 initialized dynamically as needed. In the process Unicode objects may
87 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000088
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Anthony Baxterac6bd462006-04-13 02:06:09 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Guido van Rossumd57fd912000-03-10 22:53:23 +000096/* Free list for Unicode objects */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +020097static PyUnicodeObject *free_list = NULL;
98static int numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000099
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000100/* The empty Unicode object is shared to improve performance. */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200101static PyUnicodeObject *unicode_empty = NULL;
102
/* Return a new reference to the shared empty Unicode object from the
   enclosing function, creating the object first if it does not exist yet.
   If the creation fails, the enclosing function returns NULL. */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        if (unicode_empty == NULL)                      \
            unicode_empty = _PyUnicode_New(0);          \
        if (unicode_empty != NULL)                      \
            Py_INCREF(unicode_empty);                   \
        return (PyObject *)unicode_empty;               \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000114
115/* Single character Unicode strings in the Latin-1 range are being
116 shared as well. */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200117static PyUnicodeObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000118
Fred Drakee4315f52000-05-09 19:53:39 +0000119/* Default encoding to use and assume when NULL is passed as encoding
120 parameter; it is initialized by _PyUnicode_Init().
121
122 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000123 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000124
125*/
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200126static char unicode_default_encoding[100 + 1] = "ascii";
Fred Drakee4315f52000-05-09 19:53:39 +0000127
Christian Heimes4d4f2702008-01-30 11:32:37 +0000128/* Fast detection of the most frequent whitespace characters */
/* Lookup table indexed by code point 0x00..0x7F; an entry is 1 when the
   character is whitespace:
     0x09 CHARACTER TABULATION, 0x0A LINE FEED, 0x0B LINE TABULATION,
     0x0C FORM FEED, 0x0D CARRIAGE RETURN,
     0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR,
     0x1F UNIT SEPARATOR, 0x20 SPACE */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
158
159/* Same for linebreaks */
/* Lookup table indexed by code point 0x00..0x7F; an entry is 1 when the
   character is a line break:
     0x0A LINE FEED, 0x0B LINE TABULATION, 0x0C FORM FEED,
     0x0D CARRIAGE RETURN,
     0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR */
static unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x0F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x2F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
186
187
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000188Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000189PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000190{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000191#ifdef Py_UNICODE_WIDE
Benjamin Peterson857ce152009-01-31 16:29:18 +0000192 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000193#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000194 /* This is actually an illegal character, so it should
195 not be passed to unichr. */
196 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000197#endif
198}
199
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000200/* --- Bloom Filters ----------------------------------------------------- */
201
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the least
   significant bits of each Unicode character (5, 6 or 7 of them,
   depending on BLOOM_WIDTH) as the bit index. */
205
206/* the linebreak mask is set up by Unicode_Init below */
207
Antoine Pitrou10042922010-01-13 14:01:26 +0000208#if LONG_BIT >= 128
209#define BLOOM_WIDTH 128
210#elif LONG_BIT >= 64
211#define BLOOM_WIDTH 64
212#elif LONG_BIT >= 32
213#define BLOOM_WIDTH 32
214#else
215#error "LONG_BIT is smaller than 32"
216#endif
217
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000218#define BLOOM_MASK unsigned long
219
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200220static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000221
/* BLOOM_ADD(mask, ch): set the filter bit selected by the low bits of ch
   in the lvalue mask.
   BLOOM(mask, ch): nonzero if the filter bit selected by ch is set in
   mask, i.e. ch *may* belong to the set the mask was built from; zero
   means it definitely does not.
   The mask argument is parenthesized so that expression arguments such as
   `a | b` or `*p` bind as intended. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Nonzero if ch is a line break: characters below 128 are looked up in
   ascii_linebreak; for the others bloom_linebreak is consulted first and
   Py_UNICODE_ISLINEBREAK() is only evaluated when the filter bit is set.
   Note that ch is evaluated more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000228
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000229Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000230{
231 /* calculate simple bloom-style bitmask for a given unicode string */
232
Antoine Pitrou10042922010-01-13 14:01:26 +0000233 BLOOM_MASK mask;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000234 Py_ssize_t i;
235
236 mask = 0;
237 for (i = 0; i < len; i++)
Antoine Pitrou64672132010-01-13 07:55:48 +0000238 BLOOM_ADD(mask, ptr[i]);
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000239
240 return mask;
241}
242
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000243Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000244{
245 Py_ssize_t i;
246
247 for (i = 0; i < setlen; i++)
248 if (set[i] == chr)
249 return 1;
250
Fredrik Lundh77633512006-05-23 19:47:35 +0000251 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000252}
253
/* Nonzero if chr is one of the setlen characters in set.  The cheap
   BLOOM() test on mask runs first; unicode_member() is only called when
   the filter bit is set.  The whole expansion is parenthesized so the
   macro can be negated or combined with other operators safely. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
256
Guido van Rossumd57fd912000-03-10 22:53:23 +0000257/* --- Unicode Object ----------------------------------------------------- */
258
259static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000260int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000261 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262{
263 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000264
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000265 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 if (unicode->length == length)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000267 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000268
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000269 /* Resizing shared object (unicode_empty or single character
270 objects) in-place is not allowed. Use PyUnicode_Resize()
271 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000272
Benjamin Peterson857ce152009-01-31 16:29:18 +0000273 if (unicode == unicode_empty ||
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000274 (unicode->length == 1 &&
275 unicode->str[0] < 256U &&
276 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000278 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279 return -1;
280 }
281
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000282 /* We allocate one more byte to make sure the string is Ux0000 terminated.
283 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000284 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000285 it contains). */
286
Guido van Rossumd57fd912000-03-10 22:53:23 +0000287 oldstr = unicode->str;
Neal Norwitz419fd492008-03-17 20:22:43 +0000288 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000289 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000291 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000292 PyErr_NoMemory();
293 return -1;
294 }
295 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000296 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000298 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000300 if (unicode->defenc) {
Georg Brandld070cc52010-08-01 21:06:46 +0000301 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000302 }
303 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000304
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305 return 0;
306}
307
/* We allocate one more Py_UNICODE element to make sure the string is
   Ux0000 terminated; some code relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000310
311 XXX This allocator could further be enhanced by assuring that the
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000312 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000313
314*/
315
316static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000317PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000318{
319 register PyUnicodeObject *unicode;
320
Andrew Dalkee0df7622006-05-27 11:04:36 +0000321 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000322 if (length == 0 && unicode_empty != NULL) {
323 Py_INCREF(unicode_empty);
324 return unicode_empty;
325 }
326
Neal Norwitze7d8be82008-07-31 17:17:14 +0000327 /* Ensure we won't overflow the size. */
328 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
329 return (PyUnicodeObject *)PyErr_NoMemory();
330 }
331
Guido van Rossumd57fd912000-03-10 22:53:23 +0000332 /* Unicode freelist & memory allocation */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000333 if (free_list) {
334 unicode = free_list;
335 free_list = *(PyUnicodeObject **)unicode;
336 numfree--;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000337 if (unicode->str) {
338 /* Keep-Alive optimization: we only upsize the buffer,
339 never downsize it. */
340 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000341 unicode_resize(unicode, length) < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000342 PyObject_DEL(unicode->str);
343 unicode->str = NULL;
344 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000345 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000346 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000347 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
348 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000349 }
350 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000351 }
352 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000353 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000354 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000355 if (unicode == NULL)
356 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000357 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
358 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000359 }
360
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000361 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000362 PyErr_NoMemory();
363 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000364 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000365 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000366 * the caller fails before initializing str -- unicode_resize()
367 * reads str[0], and the Keep-Alive optimization can keep memory
368 * allocated for str alive across a call to unicode_dealloc(unicode).
369 * We don't want unicode_resize to read uninitialized memory in
370 * that case.
371 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000372 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000373 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000374 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000375 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000376 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000377 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000378
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000379 onError:
Amaury Forgeot d'Arc06847b12008-07-31 23:39:05 +0000380 /* XXX UNREF/NEWREF interface should be more symmetrical */
381 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000382 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000383 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000384 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000385}
386
387static
Guido van Rossum9475a232001-10-05 20:51:39 +0000388void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000390 if (PyUnicode_CheckExact(unicode) &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000391 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000392 /* Keep-Alive optimization */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000393 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
394 PyObject_DEL(unicode->str);
395 unicode->str = NULL;
396 unicode->length = 0;
397 }
398 if (unicode->defenc) {
Georg Brandld070cc52010-08-01 21:06:46 +0000399 Py_CLEAR(unicode->defenc);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000400 }
401 /* Add to free list */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000402 *(PyUnicodeObject **)unicode = free_list;
403 free_list = unicode;
404 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000405 }
406 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000407 PyObject_DEL(unicode->str);
408 Py_XDECREF(unicode->defenc);
409 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000410 }
411}
412
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000413static
414int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000415{
416 register PyUnicodeObject *v;
417
418 /* Argument checks */
419 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000420 PyErr_BadInternalCall();
421 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000422 }
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000423 v = *unicode;
Christian Heimese93237d2007-12-19 02:37:44 +0000424 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000425 PyErr_BadInternalCall();
426 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000427 }
428
429 /* Resizing unicode_empty and single character objects is not
430 possible since these are being shared. We simply return a fresh
431 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000432 if (v->length != length &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000433 (v == unicode_empty || v->length == 1)) {
434 PyUnicodeObject *w = _PyUnicode_New(length);
435 if (w == NULL)
436 return -1;
437 Py_UNICODE_COPY(w->str, v->str,
438 length < v->length ? length : v->length);
439 Py_DECREF(*unicode);
440 *unicode = w;
441 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000442 }
443
444 /* Note that we don't have to modify *unicode for unshared Unicode
445 objects, since we can modify them in-place. */
446 return unicode_resize(v, length);
447}
448
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000449int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
450{
451 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
452}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000453
Guido van Rossumd57fd912000-03-10 22:53:23 +0000454PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000455 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000456{
457 PyUnicodeObject *unicode;
458
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000459 /* If the Unicode data is known at construction time, we can apply
460 some optimizations which share commonly used objects. */
461 if (u != NULL) {
462
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000463 /* Optimization for empty strings */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200464 if (size == 0)
465 _Py_RETURN_UNICODE_EMPTY();
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000466
467 /* Single character Unicode objects in the Latin-1 range are
468 shared when using this constructor */
469 if (size == 1 && *u < 256) {
470 unicode = unicode_latin1[*u];
471 if (!unicode) {
472 unicode = _PyUnicode_New(1);
473 if (!unicode)
474 return NULL;
475 unicode->str[0] = *u;
476 unicode_latin1[*u] = unicode;
477 }
478 Py_INCREF(unicode);
479 return (PyObject *)unicode;
480 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000481 }
Tim Petersced69f82003-09-16 20:30:58 +0000482
Guido van Rossumd57fd912000-03-10 22:53:23 +0000483 unicode = _PyUnicode_New(size);
484 if (!unicode)
485 return NULL;
486
487 /* Copy the Unicode data into the new object */
488 if (u != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000489 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000490
491 return (PyObject *)unicode;
492}
493
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000494PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
495{
496 PyUnicodeObject *unicode;
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000497
Benjamin Peterson857ce152009-01-31 16:29:18 +0000498 if (size < 0) {
499 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000500 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson857ce152009-01-31 16:29:18 +0000501 return NULL;
502 }
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000503
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000504 /* If the Unicode data is known at construction time, we can apply
505 some optimizations which share commonly used objects.
506 Also, this means the input must be UTF-8, so fall back to the
507 UTF-8 decoder at the end. */
508 if (u != NULL) {
509
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000510 /* Optimization for empty strings */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200511 if (size == 0)
512 _Py_RETURN_UNICODE_EMPTY();
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000513
514 /* Single characters are shared when using this constructor.
515 Restrict to ASCII, since the input must be UTF-8. */
516 if (size == 1 && Py_CHARMASK(*u) < 128) {
517 unicode = unicode_latin1[Py_CHARMASK(*u)];
518 if (!unicode) {
519 unicode = _PyUnicode_New(1);
520 if (!unicode)
521 return NULL;
522 unicode->str[0] = Py_CHARMASK(*u);
523 unicode_latin1[Py_CHARMASK(*u)] = unicode;
524 }
525 Py_INCREF(unicode);
526 return (PyObject *)unicode;
527 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000528
529 return PyUnicode_DecodeUTF8(u, size, NULL);
530 }
531
532 unicode = _PyUnicode_New(size);
533 if (!unicode)
534 return NULL;
535
536 return (PyObject *)unicode;
537}
538
539PyObject *PyUnicode_FromString(const char *u)
540{
541 size_t size = strlen(u);
542 if (size > PY_SSIZE_T_MAX) {
543 PyErr_SetString(PyExc_OverflowError, "input too long");
544 return NULL;
545 }
546
547 return PyUnicode_FromStringAndSize(u, size);
548}
549
Guido van Rossumd57fd912000-03-10 22:53:23 +0000550#ifdef HAVE_WCHAR_H
551
Mark Dickinson6b265f12009-03-18 16:07:26 +0000552#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
553# define CONVERT_WCHAR_TO_SURROGATES
554#endif
555
556#ifdef CONVERT_WCHAR_TO_SURROGATES
557
558/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
559 to convert from UTF32 to UTF16. */
560
561PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
562 Py_ssize_t size)
563{
564 PyUnicodeObject *unicode;
565 register Py_ssize_t i;
566 Py_ssize_t alloc;
567 const wchar_t *orig_w;
568
569 if (w == NULL) {
570 PyErr_BadInternalCall();
571 return NULL;
572 }
573
574 alloc = size;
575 orig_w = w;
576 for (i = size; i > 0; i--) {
577 if (*w > 0xFFFF)
578 alloc++;
579 w++;
580 }
581 w = orig_w;
582 unicode = _PyUnicode_New(alloc);
583 if (!unicode)
584 return NULL;
585
586 /* Copy the wchar_t data into the new object */
587 {
588 register Py_UNICODE *u;
589 u = PyUnicode_AS_UNICODE(unicode);
590 for (i = size; i > 0; i--) {
591 if (*w > 0xFFFF) {
592 wchar_t ordinal = *w++;
593 ordinal -= 0x10000;
594 *u++ = 0xD800 | (ordinal >> 10);
595 *u++ = 0xDC00 | (ordinal & 0x3FF);
596 }
597 else
598 *u++ = *w++;
599 }
600 }
601 return (PyObject *)unicode;
602}
603
604#else
605
Guido van Rossumd57fd912000-03-10 22:53:23 +0000606PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000607 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000608{
609 PyUnicodeObject *unicode;
610
611 if (w == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000612 PyErr_BadInternalCall();
613 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000614 }
615
616 unicode = _PyUnicode_New(size);
617 if (!unicode)
618 return NULL;
619
620 /* Copy the wchar_t data into the new object */
621#ifdef HAVE_USABLE_WCHAR_T
622 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000623#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000624 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000625 register Py_UNICODE *u;
626 register Py_ssize_t i;
627 u = PyUnicode_AS_UNICODE(unicode);
628 for (i = size; i > 0; i--)
629 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000630 }
631#endif
632
633 return (PyObject *)unicode;
634}
635
Mark Dickinson6b265f12009-03-18 16:07:26 +0000636#endif /* CONVERT_WCHAR_TO_SURROGATES */
637
638#undef CONVERT_WCHAR_TO_SURROGATES
639
/* Write a printf-style conversion specification into fmt:

     '%' ['0'] [width] ['.' precision] ['l' | PY_FORMAT_SIZE_T] c

   The '0' flag and the width are emitted only when width is nonzero (and
   the flag only when zeropad is set); the precision only when it is
   nonzero.  longflag takes priority over size_tflag.  The result is
   null-terminated.

   No bounds checking is done: fmt must be large enough for the longest
   specification the caller can request. */
static void
makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
{
    *fmt++ = '%';
    if (width) {
        if (zeropad)
            *fmt++ = '0';
        fmt += sprintf(fmt, "%d", width);
    }
    if (precision)
        fmt += sprintf(fmt, ".%d", precision);
    if (longflag)
        *fmt++ = 'l';
    else if (size_tflag) {
        /* PY_FORMAT_SIZE_T is a string literal; only read through it */
        const char *f = PY_FORMAT_SIZE_T;
        while (*f)
            *fmt++ = *f++;
    }
    *fmt++ = c;
    *fmt = '\0';
}
661
/* Copy the null-terminated string to the output position s, advancing s
   past the copied characters; uses the enclosing scope's `copy` and `s`. */
#define appendstring(string) {copy = (string); while (*copy) *s++ = *copy++;}
663
664PyObject *
665PyUnicode_FromFormatV(const char *format, va_list vargs)
666{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000667 va_list count;
668 Py_ssize_t callcount = 0;
669 PyObject **callresults = NULL;
670 PyObject **callresult = NULL;
671 Py_ssize_t n = 0;
672 int width = 0;
673 int precision = 0;
674 int zeropad;
675 const char* f;
676 Py_UNICODE *s;
677 PyObject *string;
678 /* used by sprintf */
679 char buffer[21];
680 /* use abuffer instead of buffer, if we need more space
681 * (which can happen if there's a format specifier with width). */
682 char *abuffer = NULL;
683 char *realbuffer;
684 Py_ssize_t abuffersize = 0;
685 char fmt[60]; /* should be enough for %0width.precisionld */
686 const char *copy;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000687
688#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson857ce152009-01-31 16:29:18 +0000689 Py_MEMCPY(count, vargs, sizeof(va_list));
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000690#else
691#ifdef __va_copy
Benjamin Peterson857ce152009-01-31 16:29:18 +0000692 __va_copy(count, vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000693#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000694 count = vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000695#endif
696#endif
Walter Dörwalded960ac2009-05-03 22:36:33 +0000697 /* step 1: count the number of %S/%R/%s format specifications
698 * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
699 * objects once during step 3 and put the result in an array) */
Benjamin Peterson857ce152009-01-31 16:29:18 +0000700 for (f = format; *f; f++) {
Walter Dörwalded960ac2009-05-03 22:36:33 +0000701 if (*f == '%') {
702 if (*(f+1)=='%')
703 continue;
Walter Dörwald342c8db2009-05-03 22:46:07 +0000704 if (*(f+1)=='S' || *(f+1)=='R')
Walter Dörwalded960ac2009-05-03 22:36:33 +0000705 ++callcount;
706 while (isdigit((unsigned)*f))
707 width = (width*10) + *f++ - '0';
708 while (*++f && *f != '%' && !isalpha((unsigned)*f))
709 ;
710 if (*f == 's')
711 ++callcount;
712 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000713 }
714 /* step 2: allocate memory for the results of
Walter Dörwalded960ac2009-05-03 22:36:33 +0000715 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson857ce152009-01-31 16:29:18 +0000716 if (callcount) {
717 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
718 if (!callresults) {
719 PyErr_NoMemory();
720 return NULL;
721 }
722 callresult = callresults;
723 }
724 /* step 3: figure out how large a buffer we need */
725 for (f = format; *f; f++) {
726 if (*f == '%') {
727 const char* p = f;
728 width = 0;
729 while (isdigit((unsigned)*f))
730 width = (width*10) + *f++ - '0';
731 while (*++f && *f != '%' && !isalpha((unsigned)*f))
732 ;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000733
Benjamin Peterson857ce152009-01-31 16:29:18 +0000734 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
735 * they don't affect the amount of space we reserve.
736 */
737 if ((*f == 'l' || *f == 'z') &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000738 (f[1] == 'd' || f[1] == 'u'))
739 ++f;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000740
Benjamin Peterson857ce152009-01-31 16:29:18 +0000741 switch (*f) {
742 case 'c':
743 (void)va_arg(count, int);
744 /* fall through... */
745 case '%':
746 n++;
747 break;
748 case 'd': case 'u': case 'i': case 'x':
749 (void) va_arg(count, int);
750 /* 20 bytes is enough to hold a 64-bit
751 integer. Decimal takes the most space.
752 This isn't enough for octal.
753 If a width is specified we need more
754 (which we allocate later). */
755 if (width < 20)
756 width = 20;
757 n += width;
758 if (abuffersize < width)
759 abuffersize = width;
760 break;
761 case 's':
762 {
763 /* UTF-8 */
Georg Brandlba68a992009-05-05 09:19:43 +0000764 const char *s = va_arg(count, const char*);
Walter Dörwalded960ac2009-05-03 22:36:33 +0000765 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
766 if (!str)
767 goto fail;
768 n += PyUnicode_GET_SIZE(str);
769 /* Remember the str and switch to the next slot */
770 *callresult++ = str;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000771 break;
772 }
773 case 'U':
774 {
775 PyObject *obj = va_arg(count, PyObject *);
776 assert(obj && PyUnicode_Check(obj));
777 n += PyUnicode_GET_SIZE(obj);
778 break;
779 }
780 case 'V':
781 {
782 PyObject *obj = va_arg(count, PyObject *);
783 const char *str = va_arg(count, const char *);
784 assert(obj || str);
785 assert(!obj || PyUnicode_Check(obj));
786 if (obj)
787 n += PyUnicode_GET_SIZE(obj);
788 else
789 n += strlen(str);
790 break;
791 }
792 case 'S':
793 {
794 PyObject *obj = va_arg(count, PyObject *);
795 PyObject *str;
796 assert(obj);
797 str = PyObject_Str(obj);
798 if (!str)
799 goto fail;
800 n += PyUnicode_GET_SIZE(str);
801 /* Remember the str and switch to the next slot */
802 *callresult++ = str;
803 break;
804 }
805 case 'R':
806 {
807 PyObject *obj = va_arg(count, PyObject *);
808 PyObject *repr;
809 assert(obj);
810 repr = PyObject_Repr(obj);
811 if (!repr)
812 goto fail;
813 n += PyUnicode_GET_SIZE(repr);
814 /* Remember the repr and switch to the next slot */
815 *callresult++ = repr;
816 break;
817 }
818 case 'p':
819 (void) va_arg(count, int);
820 /* maximum 64-bit pointer representation:
821 * 0xffffffffffffffff
822 * so 19 characters is enough.
823 * XXX I count 18 -- what's the extra for?
824 */
825 n += 19;
826 break;
827 default:
828 /* if we stumble upon an unknown
829 formatting code, copy the rest of
830 the format string to the output
831 string. (we cannot just skip the
832 code, since there's no way to know
833 what's in the argument list) */
834 n += strlen(p);
835 goto expand;
836 }
837 } else
838 n++;
839 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000840 expand:
Benjamin Peterson857ce152009-01-31 16:29:18 +0000841 if (abuffersize > 20) {
842 abuffer = PyObject_Malloc(abuffersize);
843 if (!abuffer) {
844 PyErr_NoMemory();
845 goto fail;
846 }
847 realbuffer = abuffer;
848 }
849 else
850 realbuffer = buffer;
851 /* step 4: fill the buffer */
852 /* Since we've analyzed how much space we need for the worst case,
853 we don't have to resize the string.
854 There can be no errors beyond this point. */
855 string = PyUnicode_FromUnicode(NULL, n);
856 if (!string)
857 goto fail;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000858
Benjamin Peterson857ce152009-01-31 16:29:18 +0000859 s = PyUnicode_AS_UNICODE(string);
860 callresult = callresults;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000861
Benjamin Peterson857ce152009-01-31 16:29:18 +0000862 for (f = format; *f; f++) {
863 if (*f == '%') {
864 const char* p = f++;
865 int longflag = 0;
866 int size_tflag = 0;
867 zeropad = (*f == '0');
868 /* parse the width.precision part */
869 width = 0;
870 while (isdigit((unsigned)*f))
871 width = (width*10) + *f++ - '0';
872 precision = 0;
873 if (*f == '.') {
874 f++;
875 while (isdigit((unsigned)*f))
876 precision = (precision*10) + *f++ - '0';
877 }
878 /* handle the long flag, but only for %ld and %lu.
879 others can be added when necessary. */
880 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
881 longflag = 1;
882 ++f;
883 }
884 /* handle the size_t flag. */
885 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
886 size_tflag = 1;
887 ++f;
888 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000889
Benjamin Peterson857ce152009-01-31 16:29:18 +0000890 switch (*f) {
891 case 'c':
892 *s++ = va_arg(vargs, int);
893 break;
894 case 'd':
895 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
896 if (longflag)
897 sprintf(realbuffer, fmt, va_arg(vargs, long));
898 else if (size_tflag)
899 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
900 else
901 sprintf(realbuffer, fmt, va_arg(vargs, int));
902 appendstring(realbuffer);
903 break;
904 case 'u':
905 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
906 if (longflag)
907 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
908 else if (size_tflag)
909 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
910 else
911 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
912 appendstring(realbuffer);
913 break;
914 case 'i':
915 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
916 sprintf(realbuffer, fmt, va_arg(vargs, int));
917 appendstring(realbuffer);
918 break;
919 case 'x':
920 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
921 sprintf(realbuffer, fmt, va_arg(vargs, int));
922 appendstring(realbuffer);
923 break;
924 case 's':
925 {
Walter Dörwalded960ac2009-05-03 22:36:33 +0000926 /* unused, since we already have the result */
927 (void) va_arg(vargs, char *);
928 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
929 PyUnicode_GET_SIZE(*callresult));
930 s += PyUnicode_GET_SIZE(*callresult);
931 /* We're done with the unicode()/repr() => forget it */
932 Py_DECREF(*callresult);
933 /* switch to next unicode()/repr() result */
934 ++callresult;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000935 break;
936 }
937 case 'U':
938 {
939 PyObject *obj = va_arg(vargs, PyObject *);
940 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
941 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
942 s += size;
943 break;
944 }
945 case 'V':
946 {
947 PyObject *obj = va_arg(vargs, PyObject *);
948 const char *str = va_arg(vargs, const char *);
949 if (obj) {
950 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
951 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
952 s += size;
953 } else {
954 appendstring(str);
955 }
956 break;
957 }
958 case 'S':
959 case 'R':
960 {
961 Py_UNICODE *ucopy;
962 Py_ssize_t usize;
963 Py_ssize_t upos;
964 /* unused, since we already have the result */
965 (void) va_arg(vargs, PyObject *);
966 ucopy = PyUnicode_AS_UNICODE(*callresult);
967 usize = PyUnicode_GET_SIZE(*callresult);
968 for (upos = 0; upos<usize;)
969 *s++ = ucopy[upos++];
970 /* We're done with the unicode()/repr() => forget it */
971 Py_DECREF(*callresult);
972 /* switch to next unicode()/repr() result */
973 ++callresult;
974 break;
975 }
976 case 'p':
977 sprintf(buffer, "%p", va_arg(vargs, void*));
978 /* %p is ill-defined: ensure leading 0x. */
979 if (buffer[1] == 'X')
980 buffer[1] = 'x';
981 else if (buffer[1] != 'x') {
982 memmove(buffer+2, buffer, strlen(buffer)+1);
983 buffer[0] = '0';
984 buffer[1] = 'x';
985 }
986 appendstring(buffer);
987 break;
988 case '%':
989 *s++ = '%';
990 break;
991 default:
992 appendstring(p);
993 goto end;
994 }
995 } else
996 *s++ = *f;
997 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000998
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000999 end:
Benjamin Peterson857ce152009-01-31 16:29:18 +00001000 if (callresults)
1001 PyObject_Free(callresults);
1002 if (abuffer)
1003 PyObject_Free(abuffer);
1004 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1005 return string;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001006 fail:
Benjamin Peterson857ce152009-01-31 16:29:18 +00001007 if (callresults) {
1008 PyObject **callresult2 = callresults;
1009 while (callresult2 < callresult) {
1010 Py_DECREF(*callresult2);
1011 ++callresult2;
1012 }
1013 PyObject_Free(callresults);
1014 }
1015 if (abuffer)
1016 PyObject_Free(abuffer);
1017 return NULL;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001018}
1019
1020#undef appendstring
1021
1022PyObject *
1023PyUnicode_FromFormat(const char *format, ...)
1024{
Benjamin Peterson857ce152009-01-31 16:29:18 +00001025 PyObject* ret;
1026 va_list vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001027
1028#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson857ce152009-01-31 16:29:18 +00001029 va_start(vargs, format);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001030#else
Benjamin Peterson857ce152009-01-31 16:29:18 +00001031 va_start(vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001032#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00001033 ret = PyUnicode_FromFormatV(format, vargs);
1034 va_end(vargs);
1035 return ret;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001036}
1037
Martin v. Löwis18e16552006-02-15 17:27:45 +00001038Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001039 wchar_t *w,
1040 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001041{
1042 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001043 PyErr_BadInternalCall();
1044 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001045 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001046
1047 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001048 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001049 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001050
Guido van Rossumd57fd912000-03-10 22:53:23 +00001051#ifdef HAVE_USABLE_WCHAR_T
1052 memcpy(w, unicode->str, size * sizeof(wchar_t));
1053#else
1054 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001055 register Py_UNICODE *u;
1056 register Py_ssize_t i;
1057 u = PyUnicode_AS_UNICODE(unicode);
1058 for (i = size; i > 0; i--)
1059 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001060 }
1061#endif
1062
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001063 if (size > PyUnicode_GET_SIZE(unicode))
1064 return PyUnicode_GET_SIZE(unicode);
1065 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001066 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001067}
1068
1069#endif
1070
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001071PyObject *PyUnicode_FromOrdinal(int ordinal)
1072{
Hye-Shik Chang40574832004-04-06 07:24:51 +00001073 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001074
1075#ifdef Py_UNICODE_WIDE
1076 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001077 PyErr_SetString(PyExc_ValueError,
1078 "unichr() arg not in range(0x110000) "
1079 "(wide Python build)");
1080 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001081 }
1082#else
1083 if (ordinal < 0 || ordinal > 0xffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001084 PyErr_SetString(PyExc_ValueError,
1085 "unichr() arg not in range(0x10000) "
1086 "(narrow Python build)");
1087 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001088 }
1089#endif
1090
Hye-Shik Chang40574832004-04-06 07:24:51 +00001091 s[0] = (Py_UNICODE)ordinal;
1092 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001093}
1094
Guido van Rossumd57fd912000-03-10 22:53:23 +00001095PyObject *PyUnicode_FromObject(register PyObject *obj)
1096{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001097 /* XXX Perhaps we should make this API an alias of
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001098 PyObject_Unicode() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001099 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001100 Py_INCREF(obj);
1101 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001102 }
1103 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001104 /* For a Unicode subtype that's not a Unicode object,
1105 return a true Unicode object with the same data. */
1106 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1107 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001108 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001109 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1110}
1111
1112PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001113 const char *encoding,
1114 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001115{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001116 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001117 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001118 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001119
Guido van Rossumd57fd912000-03-10 22:53:23 +00001120 if (obj == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001121 PyErr_BadInternalCall();
1122 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001123 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001124
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001125#if 0
1126 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001127 that no encodings is given and then redirect to
1128 PyObject_Unicode() which then applies the additional logic for
1129 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001130
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001131 NOTE: This API should really only be used for object which
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001132 represent *encoded* Unicode !
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001133
1134 */
Benjamin Peterson857ce152009-01-31 16:29:18 +00001135 if (PyUnicode_Check(obj)) {
1136 if (encoding) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001137 PyErr_SetString(PyExc_TypeError,
1138 "decoding Unicode is not supported");
1139 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001140 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001141 return PyObject_Unicode(obj);
1142 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001143#else
1144 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001145 PyErr_SetString(PyExc_TypeError,
1146 "decoding Unicode is not supported");
1147 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001148 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001149#endif
1150
1151 /* Coerce object */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001152 if (PyString_Check(obj)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001153 s = PyString_AS_STRING(obj);
1154 len = PyString_GET_SIZE(obj);
Christian Heimes1a6387e2008-03-26 12:49:49 +00001155 }
Christian Heimes3497f942008-05-26 12:29:14 +00001156 else if (PyByteArray_Check(obj)) {
Christian Heimes1a6387e2008-03-26 12:49:49 +00001157 /* Python 2.x specific */
1158 PyErr_Format(PyExc_TypeError,
1159 "decoding bytearray is not supported");
1160 return NULL;
1161 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001162 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001163 /* Overwrite the error message with something more useful in
1164 case of a TypeError. */
1165 if (PyErr_ExceptionMatches(PyExc_TypeError))
1166 PyErr_Format(PyExc_TypeError,
1167 "coercing to Unicode: need string or buffer, "
1168 "%.80s found",
1169 Py_TYPE(obj)->tp_name);
1170 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001171 }
Tim Petersced69f82003-09-16 20:30:58 +00001172
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001173 /* Convert to Unicode */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02001174 if (len == 0)
1175 _Py_RETURN_UNICODE_EMPTY();
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001176
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02001177 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001178 return v;
1179
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001180 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001181 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001182}
1183
1184PyObject *PyUnicode_Decode(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001185 Py_ssize_t size,
1186 const char *encoding,
1187 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001188{
1189 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001190
1191 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001192 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001193
1194 /* Shortcuts for common default encodings */
1195 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001196 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001197 else if (strcmp(encoding, "latin-1") == 0)
1198 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001199#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1200 else if (strcmp(encoding, "mbcs") == 0)
1201 return PyUnicode_DecodeMBCS(s, size, errors);
1202#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001203 else if (strcmp(encoding, "ascii") == 0)
1204 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001205
1206 /* Decode via the codec registry */
1207 buffer = PyBuffer_FromMemory((void *)s, size);
1208 if (buffer == NULL)
1209 goto onError;
1210 unicode = PyCodec_Decode(buffer, encoding, errors);
1211 if (unicode == NULL)
1212 goto onError;
1213 if (!PyUnicode_Check(unicode)) {
1214 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001215 "decoder did not return an unicode object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001216 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001217 Py_DECREF(unicode);
1218 goto onError;
1219 }
1220 Py_DECREF(buffer);
1221 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001222
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001223 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001224 Py_XDECREF(buffer);
1225 return NULL;
1226}
1227
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001228PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1229 const char *encoding,
1230 const char *errors)
1231{
1232 PyObject *v;
1233
1234 if (!PyUnicode_Check(unicode)) {
1235 PyErr_BadArgument();
1236 goto onError;
1237 }
1238
1239 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001240 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001241
1242 /* Decode via the codec registry */
1243 v = PyCodec_Decode(unicode, encoding, errors);
1244 if (v == NULL)
1245 goto onError;
1246 return v;
1247
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001248 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001249 return NULL;
1250}
1251
Guido van Rossumd57fd912000-03-10 22:53:23 +00001252PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001253 Py_ssize_t size,
1254 const char *encoding,
1255 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001256{
1257 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001258
Guido van Rossumd57fd912000-03-10 22:53:23 +00001259 unicode = PyUnicode_FromUnicode(s, size);
1260 if (unicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001261 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001262 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1263 Py_DECREF(unicode);
1264 return v;
1265}
1266
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001267PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1268 const char *encoding,
1269 const char *errors)
1270{
1271 PyObject *v;
1272
1273 if (!PyUnicode_Check(unicode)) {
1274 PyErr_BadArgument();
1275 goto onError;
1276 }
1277
1278 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001279 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001280
1281 /* Encode via the codec registry */
1282 v = PyCodec_Encode(unicode, encoding, errors);
1283 if (v == NULL)
1284 goto onError;
1285 return v;
1286
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001287 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001288 return NULL;
1289}
1290
Guido van Rossumd57fd912000-03-10 22:53:23 +00001291PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1292 const char *encoding,
1293 const char *errors)
1294{
1295 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001296
Guido van Rossumd57fd912000-03-10 22:53:23 +00001297 if (!PyUnicode_Check(unicode)) {
1298 PyErr_BadArgument();
1299 goto onError;
1300 }
Fred Drakee4315f52000-05-09 19:53:39 +00001301
Tim Petersced69f82003-09-16 20:30:58 +00001302 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001303 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001304
1305 /* Shortcuts for common default encodings */
1306 if (errors == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001307 if (strcmp(encoding, "utf-8") == 0)
1308 return PyUnicode_AsUTF8String(unicode);
1309 else if (strcmp(encoding, "latin-1") == 0)
1310 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001311#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001312 else if (strcmp(encoding, "mbcs") == 0)
1313 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001314#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001315 else if (strcmp(encoding, "ascii") == 0)
1316 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001317 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001318
1319 /* Encode via the codec registry */
1320 v = PyCodec_Encode(unicode, encoding, errors);
1321 if (v == NULL)
1322 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001323 if (!PyString_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001324 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001325 "encoder did not return a string object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001326 Py_TYPE(v)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001327 Py_DECREF(v);
1328 goto onError;
1329 }
1330 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001331
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001332 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001333 return NULL;
1334}
1335
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001336PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001337 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001338{
1339 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1340
1341 if (v)
1342 return v;
1343 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1344 if (v && errors == NULL)
1345 ((PyUnicodeObject *)unicode)->defenc = v;
1346 return v;
1347}
1348
Guido van Rossumd57fd912000-03-10 22:53:23 +00001349Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1350{
1351 if (!PyUnicode_Check(unicode)) {
1352 PyErr_BadArgument();
1353 goto onError;
1354 }
1355 return PyUnicode_AS_UNICODE(unicode);
1356
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001357 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001358 return NULL;
1359}
1360
Martin v. Löwis18e16552006-02-15 17:27:45 +00001361Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001362{
1363 if (!PyUnicode_Check(unicode)) {
1364 PyErr_BadArgument();
1365 goto onError;
1366 }
1367 return PyUnicode_GET_SIZE(unicode);
1368
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001369 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001370 return -1;
1371}
1372
Thomas Wouters78890102000-07-22 19:25:51 +00001373const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001374{
1375 return unicode_default_encoding;
1376}
1377
1378int PyUnicode_SetDefaultEncoding(const char *encoding)
1379{
1380 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001381
Fred Drakee4315f52000-05-09 19:53:39 +00001382 /* Make sure the encoding is valid. As side effect, this also
1383 loads the encoding into the codec registry cache. */
1384 v = _PyCodec_Lookup(encoding);
1385 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001386 goto onError;
Fred Drakee4315f52000-05-09 19:53:39 +00001387 Py_DECREF(v);
1388 strncpy(unicode_default_encoding,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001389 encoding,
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02001390 sizeof(unicode_default_encoding) - 1);
Fred Drakee4315f52000-05-09 19:53:39 +00001391 return 0;
1392
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001393 onError:
Fred Drakee4315f52000-05-09 19:53:39 +00001394 return -1;
1395}
1396
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001397/* error handling callback helper:
1398 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001399 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001400 and adjust various state variables.
1401 return 0 on success, -1 on error
1402*/
1403
1404static
1405int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001406 const char *encoding, const char *reason,
1407 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1408 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1409 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001410{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001411 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001412
1413 PyObject *restuple = NULL;
1414 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001415 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1416 Py_ssize_t requiredsize;
1417 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001418 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001419 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001420 int res = -1;
1421
1422 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001423 *errorHandler = PyCodec_LookupError(errors);
1424 if (*errorHandler == NULL)
1425 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001426 }
1427
1428 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001429 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001430 encoding, input, insize, *startinpos, *endinpos, reason);
1431 if (*exceptionObject == NULL)
1432 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001433 }
1434 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001435 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1436 goto onError;
1437 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1438 goto onError;
1439 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1440 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001441 }
1442
1443 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1444 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001445 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001446 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00001447 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001448 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001449 }
1450 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001451 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001452 if (newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001453 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001454 if (newpos<0 || newpos>insize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001455 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1456 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001457 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001458
1459 /* need more space? (at least enough for what we
1460 have+the replacement+the rest of the string (starting
1461 at the new input position), so we won't have to check space
1462 when there are no errors in the rest of the string) */
1463 repptr = PyUnicode_AS_UNICODE(repunicode);
1464 repsize = PyUnicode_GET_SIZE(repunicode);
1465 requiredsize = *outpos + repsize + insize-newpos;
1466 if (requiredsize > outsize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001467 if (requiredsize<2*outsize)
1468 requiredsize = 2*outsize;
1469 if (_PyUnicode_Resize(output, requiredsize) < 0)
1470 goto onError;
1471 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001472 }
1473 *endinpos = newpos;
1474 *inptr = input + newpos;
1475 Py_UNICODE_COPY(*outptr, repptr, repsize);
1476 *outptr += repsize;
1477 *outpos += repsize;
1478 /* we made it! */
1479 res = 0;
1480
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001481 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001482 Py_XDECREF(restuple);
1483 return res;
1484}
1485
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64. */

/* Is c a base-64 character?
 *
 * The test is spelled out with explicit ASCII ranges instead of isalnum():
 * isalnum() depends on the current locale and is only defined for
 * arguments representable as unsigned char (or EOF). */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c) \
    ((c) <= 127 && (c) != '+')
1517
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is only ever read,
 * hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
1551
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 *   c        - the character to classify; only 1..127 can ever be direct
 *              (NUL and everything >= 128 are rejected before the table
 *              is indexed)
 *   directO  - non-zero if "Set O" characters may be written directly
 *   directWS - non-zero if whitespace may be written directly
 *
 * Every argument is parenthesised in the expansion so that compound
 * expressions such as `a || b` keep their meaning.  c is evaluated more
 * than once. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001563
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001564PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001565 Py_ssize_t size,
1566 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001567{
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001568 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1569}
1570
Antoine Pitrou653dece2009-05-04 18:32:32 +00001571/* The decoder. The only state we preserve is our read position,
1572 * i.e. how many characters we have consumed. So if we end in the
1573 * middle of a shift sequence we have to back off the read position
1574 * and the output to the beginning of the sequence, otherwise we lose
1575 * all the shift state (seen bits, number of bits seen, high
1576 * surrogate). */
1577
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001578PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001579 Py_ssize_t size,
1580 const char *errors,
1581 Py_ssize_t *consumed)
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001582{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001583 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001584 Py_ssize_t startinpos;
1585 Py_ssize_t endinpos;
1586 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001587 const char *e;
1588 PyUnicodeObject *unicode;
1589 Py_UNICODE *p;
1590 const char *errmsg = "";
1591 int inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001592 Py_UNICODE *shiftOutStart;
1593 unsigned int base64bits = 0;
1594 unsigned long base64buffer = 0;
1595 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001596 PyObject *errorHandler = NULL;
1597 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001598
1599 unicode = _PyUnicode_New(size);
1600 if (!unicode)
1601 return NULL;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001602 if (size == 0) {
1603 if (consumed)
1604 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001605 return (PyObject *)unicode;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001606 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001607
1608 p = unicode->str;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001609 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001610 e = s + size;
1611
1612 while (s < e) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001613 Py_UNICODE ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001614
Antoine Pitrou653dece2009-05-04 18:32:32 +00001615 if (inShift) { /* in a base-64 section */
1616 if (IS_BASE64(ch)) { /* consume a base-64 character */
1617 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1618 base64bits += 6;
1619 s++;
1620 if (base64bits >= 16) {
1621 /* we have enough bits for a UTF-16 value */
1622 Py_UNICODE outCh = (Py_UNICODE)
1623 (base64buffer >> (base64bits-16));
1624 base64bits -= 16;
1625 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1626 if (surrogate) {
1627 /* expecting a second surrogate */
1628 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1629#ifdef Py_UNICODE_WIDE
1630 *p++ = (((surrogate & 0x3FF)<<10)
1631 | (outCh & 0x3FF)) + 0x10000;
1632#else
1633 *p++ = surrogate;
1634 *p++ = outCh;
1635#endif
1636 surrogate = 0;
Antoine Pitrou30402542011-11-15 01:49:40 +01001637 continue;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001638 }
1639 else {
Antoine Pitrou30402542011-11-15 01:49:40 +01001640 *p++ = surrogate;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001641 surrogate = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001642 }
1643 }
Antoine Pitrou30402542011-11-15 01:49:40 +01001644 if (outCh >= 0xD800 && outCh <= 0xDBFF) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001645 /* first surrogate */
1646 surrogate = outCh;
1647 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001648 else {
1649 *p++ = outCh;
1650 }
1651 }
1652 }
1653 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001654 inShift = 0;
1655 s++;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001656 if (surrogate) {
Antoine Pitrou30402542011-11-15 01:49:40 +01001657 *p++ = surrogate;
1658 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001659 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001660 if (base64bits > 0) { /* left-over bits */
1661 if (base64bits >= 6) {
1662 /* We've seen at least one base-64 character */
1663 errmsg = "partial character in shift sequence";
1664 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001665 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001666 else {
1667 /* Some bits remain; they should be zero */
1668 if (base64buffer != 0) {
1669 errmsg = "non-zero padding bits in shift sequence";
1670 goto utf7Error;
1671 }
1672 }
1673 }
1674 if (ch != '-') {
1675 /* '-' is absorbed; other terminating
1676 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001677 *p++ = ch;
1678 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001679 }
1680 }
1681 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001682 startinpos = s-starts;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001683 s++; /* consume '+' */
1684 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001685 s++;
1686 *p++ = '+';
Antoine Pitrou653dece2009-05-04 18:32:32 +00001687 }
1688 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001689 inShift = 1;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001690 shiftOutStart = p;
1691 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001692 }
1693 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001694 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001695 *p++ = ch;
1696 s++;
1697 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001698 else {
1699 startinpos = s-starts;
1700 s++;
1701 errmsg = "unexpected special character";
1702 goto utf7Error;
1703 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001704 continue;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001705utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001706 outpos = p-PyUnicode_AS_UNICODE(unicode);
1707 endinpos = s-starts;
1708 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001709 errors, &errorHandler,
1710 "utf7", errmsg,
1711 starts, size, &startinpos, &endinpos, &exc, &s,
1712 &unicode, &outpos, &p))
1713 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001714 }
1715
Antoine Pitrou653dece2009-05-04 18:32:32 +00001716 /* end of string */
1717
1718 if (inShift && !consumed) { /* in shift sequence, no more to follow */
1719 /* if we're in an inconsistent state, that's an error */
1720 if (surrogate ||
1721 (base64bits >= 6) ||
1722 (base64bits > 0 && base64buffer != 0)) {
1723 outpos = p-PyUnicode_AS_UNICODE(unicode);
1724 endinpos = size;
1725 if (unicode_decode_call_errorhandler(
1726 errors, &errorHandler,
1727 "utf7", "unterminated shift sequence",
1728 starts, size, &startinpos, &endinpos, &exc, &s,
1729 &unicode, &outpos, &p))
1730 goto onError;
1731 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001732 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001733
1734 /* return state */
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001735 if (consumed) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001736 if (inShift) {
1737 p = shiftOutStart; /* back off output */
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001738 *consumed = startinpos;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001739 }
1740 else {
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001741 *consumed = s-starts;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001742 }
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001743 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001744
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001745 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001746 goto onError;
1747
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001748 Py_XDECREF(errorHandler);
1749 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001750 return (PyObject *)unicode;
1751
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001752 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001753 Py_XDECREF(errorHandler);
1754 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001755 Py_DECREF(unicode);
1756 return NULL;
1757}
1758
1759
1760PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001761 Py_ssize_t size,
Antoine Pitrou653dece2009-05-04 18:32:32 +00001762 int base64SetO,
1763 int base64WhiteSpace,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001764 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001765{
1766 PyObject *v;
1767 /* It might be possible to tighten this worst case */
Alexandre Vassalottifd009162009-07-07 02:17:30 +00001768 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001769 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001770 Py_ssize_t i = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001771 unsigned int base64bits = 0;
1772 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001773 char * out;
1774 char * start;
1775
Alexandre Vassalottifd009162009-07-07 02:17:30 +00001776 if (allocated / 8 != size)
Neal Norwitze7d8be82008-07-31 17:17:14 +00001777 return PyErr_NoMemory();
1778
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001779 if (size == 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00001780 return PyString_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001781
Antoine Pitrou653dece2009-05-04 18:32:32 +00001782 v = PyString_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001783 if (v == NULL)
1784 return NULL;
1785
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001786 start = out = PyString_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001787 for (;i < size; ++i) {
1788 Py_UNICODE ch = s[i];
1789
Antoine Pitrou653dece2009-05-04 18:32:32 +00001790 if (inShift) {
1791 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1792 /* shifting out */
1793 if (base64bits) { /* output remaining bits */
1794 *out++ = TO_BASE64(base64buffer << (6-base64bits));
1795 base64buffer = 0;
1796 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001797 }
1798 inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001799 /* Characters not in the BASE64 set implicitly unshift the sequence
1800 so no '-' is required, except if the character is itself a '-' */
1801 if (IS_BASE64(ch) || ch == '-') {
1802 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001803 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001804 *out++ = (char) ch;
1805 }
1806 else {
1807 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00001808 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001809 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001810 else { /* not in a shift sequence */
1811 if (ch == '+') {
1812 *out++ = '+';
1813 *out++ = '-';
1814 }
1815 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1816 *out++ = (char) ch;
1817 }
1818 else {
1819 *out++ = '+';
1820 inShift = 1;
1821 goto encode_char;
1822 }
1823 }
1824 continue;
1825encode_char:
1826#ifdef Py_UNICODE_WIDE
1827 if (ch >= 0x10000) {
1828 /* code first surrogate */
1829 base64bits += 16;
1830 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
1831 while (base64bits >= 6) {
1832 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1833 base64bits -= 6;
1834 }
1835 /* prepare second surrogate */
1836 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
1837 }
1838#endif
1839 base64bits += 16;
1840 base64buffer = (base64buffer << 16) | ch;
1841 while (base64bits >= 6) {
1842 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1843 base64bits -= 6;
1844 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001845 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001846 if (base64bits)
1847 *out++= TO_BASE64(base64buffer << (6-base64bits) );
1848 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001849 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001850
Benjamin Petersonbea424a2010-04-03 00:57:33 +00001851 if (_PyString_Resize(&v, out - start))
1852 return NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001853 return v;
1854}
1855
Antoine Pitrou653dece2009-05-04 18:32:32 +00001856#undef IS_BASE64
1857#undef FROM_BASE64
1858#undef TO_BASE64
1859#undef DECODE_DIRECT
1860#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001861
Guido van Rossumd57fd912000-03-10 22:53:23 +00001862/* --- UTF-8 Codec -------------------------------------------------------- */
1863
/* Lookup table indexed by the first byte of a UTF-8 sequence.  It is only
   ever read, hence const. */
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
1885
Guido van Rossumd57fd912000-03-10 22:53:23 +00001886PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001887 Py_ssize_t size,
1888 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001889{
Walter Dörwald69652032004-09-07 20:24:22 +00001890 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1891}
1892
1893PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001894 Py_ssize_t size,
1895 const char *errors,
1896 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001897{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001898 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001899 int n;
Ezio Melottie57e50c2010-06-05 17:51:07 +00001900 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001901 Py_ssize_t startinpos;
1902 Py_ssize_t endinpos;
1903 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001904 const char *e;
1905 PyUnicodeObject *unicode;
1906 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001907 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001908 PyObject *errorHandler = NULL;
1909 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001910
1911 /* Note: size will always be longer than the resulting Unicode
1912 character count */
1913 unicode = _PyUnicode_New(size);
1914 if (!unicode)
1915 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001916 if (size == 0) {
1917 if (consumed)
1918 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001919 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001920 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001921
1922 /* Unpack UTF-8 encoded data */
1923 p = unicode->str;
1924 e = s + size;
1925
1926 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001927 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001928
1929 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001930 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001931 s++;
1932 continue;
1933 }
1934
1935 n = utf8_code_length[ch];
1936
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001937 if (s + n > e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001938 if (consumed)
1939 break;
1940 else {
1941 errmsg = "unexpected end of data";
1942 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00001943 endinpos = startinpos+1;
1944 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
1945 endinpos++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001946 goto utf8Error;
1947 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00001948 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001949
1950 switch (n) {
1951
1952 case 0:
Ezio Melottie57e50c2010-06-05 17:51:07 +00001953 errmsg = "invalid start byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001954 startinpos = s-starts;
1955 endinpos = startinpos+1;
1956 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001957
1958 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001959 errmsg = "internal error";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001960 startinpos = s-starts;
1961 endinpos = startinpos+1;
1962 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001963
1964 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001965 if ((s[1] & 0xc0) != 0x80) {
Ezio Melottie57e50c2010-06-05 17:51:07 +00001966 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001967 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00001968 endinpos = startinpos + 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001969 goto utf8Error;
1970 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001971 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melottie57e50c2010-06-05 17:51:07 +00001972 assert ((ch > 0x007F) && (ch <= 0x07FF));
1973 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001974 break;
1975
1976 case 3:
Ezio Melottie57e50c2010-06-05 17:51:07 +00001977 /* XXX: surrogates shouldn't be valid UTF-8!
1978 see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
1979 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
1980 Uncomment the 2 lines below to make them invalid,
1981 codepoints: d800-dfff; UTF-8: \xed\xa0\x80-\xed\xbf\xbf. */
Tim Petersced69f82003-09-16 20:30:58 +00001982 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melottie57e50c2010-06-05 17:51:07 +00001983 (s[2] & 0xc0) != 0x80 ||
1984 ((unsigned char)s[0] == 0xE0 &&
1985 (unsigned char)s[1] < 0xA0)/* ||
1986 ((unsigned char)s[0] == 0xED &&
1987 (unsigned char)s[1] > 0x9F)*/) {
1988 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001989 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00001990 endinpos = startinpos + 1;
1991
1992 /* if s[1] first two bits are 1 and 0, then the invalid
1993 continuation byte is s[2], so increment endinpos by 1,
1994 if not, s[1] is invalid and endinpos doesn't need to
1995 be incremented. */
1996 if ((s[1] & 0xC0) == 0x80)
1997 endinpos++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001998 goto utf8Error;
1999 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002000 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melottie57e50c2010-06-05 17:51:07 +00002001 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2002 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002003 break;
2004
2005 case 4:
2006 if ((s[1] & 0xc0) != 0x80 ||
2007 (s[2] & 0xc0) != 0x80 ||
Ezio Melottie57e50c2010-06-05 17:51:07 +00002008 (s[3] & 0xc0) != 0x80 ||
2009 ((unsigned char)s[0] == 0xF0 &&
2010 (unsigned char)s[1] < 0x90) ||
2011 ((unsigned char)s[0] == 0xF4 &&
2012 (unsigned char)s[1] > 0x8F)) {
2013 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002014 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00002015 endinpos = startinpos + 1;
2016 if ((s[1] & 0xC0) == 0x80) {
2017 endinpos++;
2018 if ((s[2] & 0xC0) == 0x80)
2019 endinpos++;
2020 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002021 goto utf8Error;
2022 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002023 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melottie57e50c2010-06-05 17:51:07 +00002024 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2025 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2026
Fredrik Lundh8f455852001-06-27 18:59:43 +00002027#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002028 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002029#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002030 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002031
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002032 /* translate from 10000..10FFFF to 0..FFFF */
2033 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002034
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002035 /* high surrogate = top 10 bits added to D800 */
2036 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002037
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002038 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002039 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002040#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002041 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002042 }
2043 s += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002044 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002045
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002046 utf8Error:
2047 outpos = p-PyUnicode_AS_UNICODE(unicode);
2048 if (unicode_decode_call_errorhandler(
2049 errors, &errorHandler,
2050 "utf8", errmsg,
2051 starts, size, &startinpos, &endinpos, &exc, &s,
2052 &unicode, &outpos, &p))
2053 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002054 }
Walter Dörwald69652032004-09-07 20:24:22 +00002055 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002056 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002057
2058 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002059 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002060 goto onError;
2061
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002062 Py_XDECREF(errorHandler);
2063 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002064 return (PyObject *)unicode;
2065
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002066 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002067 Py_XDECREF(errorHandler);
2068 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002069 Py_DECREF(unicode);
2070 return NULL;
2071}
2072
Tim Peters602f7402002-04-27 18:03:26 +00002073/* Allocation strategy: if the string is short, convert into a stack buffer
2074 and allocate exactly as much space needed at the end. Else allocate the
2075 maximum possible needed (4 result bytes per Unicode character), and return
2076 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002077*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002078PyObject *
2079PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002080 Py_ssize_t size,
2081 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002082{
Tim Peters602f7402002-04-27 18:03:26 +00002083#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002084
Martin v. Löwis18e16552006-02-15 17:27:45 +00002085 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00002086 PyObject *v; /* result string object */
2087 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002088 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002089 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002090 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002091
Tim Peters602f7402002-04-27 18:03:26 +00002092 assert(s != NULL);
2093 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002094
Tim Peters602f7402002-04-27 18:03:26 +00002095 if (size <= MAX_SHORT_UNICHARS) {
2096 /* Write into the stack buffer; nallocated can't overflow.
2097 * At the end, we'll allocate exactly as much heap space as it
2098 * turns out we need.
2099 */
2100 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2101 v = NULL; /* will allocate after we're done */
2102 p = stackbuf;
2103 }
2104 else {
2105 /* Overallocate on the heap, and give the excess back at the end. */
2106 nallocated = size * 4;
2107 if (nallocated / 4 != size) /* overflow! */
2108 return PyErr_NoMemory();
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002109 v = PyString_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00002110 if (v == NULL)
2111 return NULL;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002112 p = PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002113 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002114
Tim Peters602f7402002-04-27 18:03:26 +00002115 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002116 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002117
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002118 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002119 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002120 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002121
Guido van Rossumd57fd912000-03-10 22:53:23 +00002122 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002123 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002124 *p++ = (char)(0xc0 | (ch >> 6));
2125 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002126 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002127 else {
Tim Peters602f7402002-04-27 18:03:26 +00002128 /* Encode UCS2 Unicode ordinals */
2129 if (ch < 0x10000) {
2130 /* Special case: check for high surrogate */
2131 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2132 Py_UCS4 ch2 = s[i];
2133 /* Check for low surrogate and combine the two to
2134 form a UCS4 value */
2135 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002136 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002137 i++;
2138 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002139 }
Tim Peters602f7402002-04-27 18:03:26 +00002140 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002141 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002142 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002143 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2144 *p++ = (char)(0x80 | (ch & 0x3f));
2145 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00002146 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002147 encodeUCS4:
Tim Peters602f7402002-04-27 18:03:26 +00002148 /* Encode UCS4 Unicode ordinals */
2149 *p++ = (char)(0xf0 | (ch >> 18));
2150 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2151 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2152 *p++ = (char)(0x80 | (ch & 0x3f));
2153 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002154 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002155
Tim Peters602f7402002-04-27 18:03:26 +00002156 if (v == NULL) {
2157 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002158 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002159 assert(nneeded <= nallocated);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002160 v = PyString_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002161 }
2162 else {
Benjamin Peterson857ce152009-01-31 16:29:18 +00002163 /* Cut back to size actually needed. */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002164 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002165 assert(nneeded <= nallocated);
Benjamin Petersonbea424a2010-04-03 00:57:33 +00002166 if (_PyString_Resize(&v, nneeded))
2167 return NULL;
Tim Peters602f7402002-04-27 18:03:26 +00002168 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002169 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002170
Tim Peters602f7402002-04-27 18:03:26 +00002171#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002172}
2173
Guido van Rossumd57fd912000-03-10 22:53:23 +00002174PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2175{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002176 if (!PyUnicode_Check(unicode)) {
2177 PyErr_BadArgument();
2178 return NULL;
2179 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002180 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002181 PyUnicode_GET_SIZE(unicode),
2182 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002183}
2184
Walter Dörwald6e390802007-08-17 16:41:28 +00002185/* --- UTF-32 Codec ------------------------------------------------------- */
2186
2187PyObject *
2188PyUnicode_DecodeUTF32(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002189 Py_ssize_t size,
2190 const char *errors,
2191 int *byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002192{
2193 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2194}
2195
2196PyObject *
2197PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002198 Py_ssize_t size,
2199 const char *errors,
2200 int *byteorder,
2201 Py_ssize_t *consumed)
Walter Dörwald6e390802007-08-17 16:41:28 +00002202{
2203 const char *starts = s;
2204 Py_ssize_t startinpos;
2205 Py_ssize_t endinpos;
2206 Py_ssize_t outpos;
2207 PyUnicodeObject *unicode;
2208 Py_UNICODE *p;
2209#ifndef Py_UNICODE_WIDE
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002210 int pairs = 0;
Benjamin Peterson8e5effa2010-06-12 17:47:06 +00002211 const unsigned char *qq;
Walter Dörwald6e390802007-08-17 16:41:28 +00002212#else
2213 const int pairs = 0;
2214#endif
Benjamin Peterson8e5effa2010-06-12 17:47:06 +00002215 const unsigned char *q, *e;
Walter Dörwald6e390802007-08-17 16:41:28 +00002216 int bo = 0; /* assume native ordering by default */
2217 const char *errmsg = "";
Walter Dörwald20b40d32007-08-17 16:52:50 +00002218 /* Offsets from q for retrieving bytes in the right order. */
2219#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2220 int iorder[] = {0, 1, 2, 3};
2221#else
2222 int iorder[] = {3, 2, 1, 0};
2223#endif
Walter Dörwald9ab80a92007-08-17 16:58:43 +00002224 PyObject *errorHandler = NULL;
2225 PyObject *exc = NULL;
Chris Jerdonekad4b0002012-10-07 20:37:54 -07002226
Walter Dörwald6e390802007-08-17 16:41:28 +00002227 q = (unsigned char *)s;
2228 e = q + size;
2229
2230 if (byteorder)
2231 bo = *byteorder;
2232
2233 /* Check for BOM marks (U+FEFF) in the input and adjust current
2234 byte order setting accordingly. In native mode, the leading BOM
2235 mark is skipped, in all other modes, it is copied to the output
2236 stream as-is (giving a ZWNBSP character). */
2237 if (bo == 0) {
2238 if (size >= 4) {
2239 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002240 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002241#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002242 if (bom == 0x0000FEFF) {
2243 q += 4;
2244 bo = -1;
2245 }
2246 else if (bom == 0xFFFE0000) {
2247 q += 4;
2248 bo = 1;
2249 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002250#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002251 if (bom == 0x0000FEFF) {
2252 q += 4;
2253 bo = 1;
2254 }
2255 else if (bom == 0xFFFE0000) {
2256 q += 4;
2257 bo = -1;
2258 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002259#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002260 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002261 }
2262
2263 if (bo == -1) {
2264 /* force LE */
2265 iorder[0] = 0;
2266 iorder[1] = 1;
2267 iorder[2] = 2;
2268 iorder[3] = 3;
2269 }
2270 else if (bo == 1) {
2271 /* force BE */
2272 iorder[0] = 3;
2273 iorder[1] = 2;
2274 iorder[2] = 1;
2275 iorder[3] = 0;
2276 }
2277
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002278 /* On narrow builds we split characters outside the BMP into two
2279 codepoints => count how much extra space we need. */
2280#ifndef Py_UNICODE_WIDE
Serhiy Storchakac9631a12013-01-08 22:43:18 +02002281 for (qq = q; e - qq >= 4; qq += 4)
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002282 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2283 pairs++;
2284#endif
2285
2286 /* This might be one to much, because of a BOM */
2287 unicode = _PyUnicode_New((size+3)/4+pairs);
2288 if (!unicode)
2289 return NULL;
2290 if (size == 0)
2291 return (PyObject *)unicode;
2292
2293 /* Unpack UTF-32 encoded data */
2294 p = unicode->str;
2295
Walter Dörwald6e390802007-08-17 16:41:28 +00002296 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002297 Py_UCS4 ch;
2298 /* remaining bytes at the end? (size should be divisible by 4) */
2299 if (e-q<4) {
2300 if (consumed)
2301 break;
2302 errmsg = "truncated data";
2303 startinpos = ((const char *)q)-starts;
2304 endinpos = ((const char *)e)-starts;
2305 goto utf32Error;
2306 /* The remaining input chars are ignored if the callback
2307 chooses to skip the input */
2308 }
2309 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2310 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002311
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002312 if (ch >= 0x110000)
2313 {
2314 errmsg = "codepoint not in range(0x110000)";
2315 startinpos = ((const char *)q)-starts;
2316 endinpos = startinpos+4;
2317 goto utf32Error;
2318 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002319#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002320 if (ch >= 0x10000)
2321 {
2322 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2323 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2324 }
2325 else
Walter Dörwald6e390802007-08-17 16:41:28 +00002326#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002327 *p++ = ch;
2328 q += 4;
2329 continue;
2330 utf32Error:
2331 outpos = p-PyUnicode_AS_UNICODE(unicode);
2332 if (unicode_decode_call_errorhandler(
2333 errors, &errorHandler,
2334 "utf32", errmsg,
Georg Brandle9741f32009-09-17 11:28:09 +00002335 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002336 &unicode, &outpos, &p))
2337 goto onError;
Walter Dörwald6e390802007-08-17 16:41:28 +00002338 }
2339
2340 if (byteorder)
2341 *byteorder = bo;
2342
2343 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002344 *consumed = (const char *)q-starts;
Walter Dörwald6e390802007-08-17 16:41:28 +00002345
2346 /* Adjust length */
2347 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2348 goto onError;
2349
2350 Py_XDECREF(errorHandler);
2351 Py_XDECREF(exc);
2352 return (PyObject *)unicode;
2353
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002354 onError:
Walter Dörwald6e390802007-08-17 16:41:28 +00002355 Py_DECREF(unicode);
2356 Py_XDECREF(errorHandler);
2357 Py_XDECREF(exc);
2358 return NULL;
2359}
2360
2361PyObject *
2362PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002363 Py_ssize_t size,
2364 const char *errors,
2365 int byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002366{
2367 PyObject *v;
2368 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002369 Py_ssize_t nsize, bytesize;
Walter Dörwald6e390802007-08-17 16:41:28 +00002370#ifndef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002371 Py_ssize_t i, pairs;
Walter Dörwald6e390802007-08-17 16:41:28 +00002372#else
2373 const int pairs = 0;
2374#endif
2375 /* Offsets from p for storing byte pairs in the right order. */
2376#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2377 int iorder[] = {0, 1, 2, 3};
2378#else
2379 int iorder[] = {3, 2, 1, 0};
2380#endif
2381
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002382#define STORECHAR(CH) \
2383 do { \
2384 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2385 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2386 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2387 p[iorder[0]] = (CH) & 0xff; \
2388 p += 4; \
Walter Dörwald6e390802007-08-17 16:41:28 +00002389 } while(0)
2390
2391 /* In narrow builds we can output surrogate pairs as one codepoint,
2392 so we need less space. */
2393#ifndef Py_UNICODE_WIDE
2394 for (i = pairs = 0; i < size-1; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002395 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2396 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2397 pairs++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002398#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002399 nsize = (size - pairs + (byteorder == 0));
2400 bytesize = nsize * 4;
2401 if (bytesize / 4 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002402 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002403 v = PyString_FromStringAndSize(NULL, bytesize);
Walter Dörwald6e390802007-08-17 16:41:28 +00002404 if (v == NULL)
2405 return NULL;
2406
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002407 p = (unsigned char *)PyString_AS_STRING(v);
Walter Dörwald6e390802007-08-17 16:41:28 +00002408 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002409 STORECHAR(0xFEFF);
Walter Dörwald6e390802007-08-17 16:41:28 +00002410 if (size == 0)
2411 return v;
2412
2413 if (byteorder == -1) {
2414 /* force LE */
2415 iorder[0] = 0;
2416 iorder[1] = 1;
2417 iorder[2] = 2;
2418 iorder[3] = 3;
2419 }
2420 else if (byteorder == 1) {
2421 /* force BE */
2422 iorder[0] = 3;
2423 iorder[1] = 2;
2424 iorder[2] = 1;
2425 iorder[3] = 0;
2426 }
2427
2428 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002429 Py_UCS4 ch = *s++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002430#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002431 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2432 Py_UCS4 ch2 = *s;
2433 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2434 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2435 s++;
2436 size--;
2437 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00002438 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002439#endif
2440 STORECHAR(ch);
2441 }
2442 return v;
2443#undef STORECHAR
2444}
2445
2446PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2447{
2448 if (!PyUnicode_Check(unicode)) {
2449 PyErr_BadArgument();
2450 return NULL;
2451 }
2452 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002453 PyUnicode_GET_SIZE(unicode),
2454 NULL,
2455 0);
Walter Dörwald6e390802007-08-17 16:41:28 +00002456}
2457
Guido van Rossumd57fd912000-03-10 22:53:23 +00002458/* --- UTF-16 Codec ------------------------------------------------------- */
2459
Tim Peters772747b2001-08-09 22:21:55 +00002460PyObject *
2461PyUnicode_DecodeUTF16(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002462 Py_ssize_t size,
2463 const char *errors,
2464 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002465{
Walter Dörwald69652032004-09-07 20:24:22 +00002466 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2467}
2468
2469PyObject *
2470PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002471 Py_ssize_t size,
2472 const char *errors,
2473 int *byteorder,
2474 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002475{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002476 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002477 Py_ssize_t startinpos;
2478 Py_ssize_t endinpos;
2479 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002480 PyUnicodeObject *unicode;
2481 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002482 const unsigned char *q, *e;
2483 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002484 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002485 /* Offsets from q for retrieving byte pairs in the right order. */
2486#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2487 int ihi = 1, ilo = 0;
2488#else
2489 int ihi = 0, ilo = 1;
2490#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002491 PyObject *errorHandler = NULL;
2492 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002493
2494 /* Note: size will always be longer than the resulting Unicode
2495 character count */
2496 unicode = _PyUnicode_New(size);
2497 if (!unicode)
2498 return NULL;
2499 if (size == 0)
2500 return (PyObject *)unicode;
2501
2502 /* Unpack UTF-16 encoded data */
2503 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002504 q = (unsigned char *)s;
2505 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002506
2507 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002508 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002509
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002510 /* Check for BOM marks (U+FEFF) in the input and adjust current
2511 byte order setting accordingly. In native mode, the leading BOM
2512 mark is skipped, in all other modes, it is copied to the output
2513 stream as-is (giving a ZWNBSP character). */
2514 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002515 if (size >= 2) {
2516 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002517#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002518 if (bom == 0xFEFF) {
2519 q += 2;
2520 bo = -1;
2521 }
2522 else if (bom == 0xFFFE) {
2523 q += 2;
2524 bo = 1;
2525 }
Tim Petersced69f82003-09-16 20:30:58 +00002526#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002527 if (bom == 0xFEFF) {
2528 q += 2;
2529 bo = 1;
2530 }
2531 else if (bom == 0xFFFE) {
2532 q += 2;
2533 bo = -1;
2534 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002535#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002536 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002537 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002538
Tim Peters772747b2001-08-09 22:21:55 +00002539 if (bo == -1) {
2540 /* force LE */
2541 ihi = 1;
2542 ilo = 0;
2543 }
2544 else if (bo == 1) {
2545 /* force BE */
2546 ihi = 0;
2547 ilo = 1;
2548 }
2549
2550 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002551 Py_UNICODE ch;
2552 /* remaining bytes at the end? (size should be even) */
2553 if (e-q<2) {
2554 if (consumed)
2555 break;
2556 errmsg = "truncated data";
2557 startinpos = ((const char *)q)-starts;
2558 endinpos = ((const char *)e)-starts;
2559 goto utf16Error;
2560 /* The remaining input chars are ignored if the callback
2561 chooses to skip the input */
2562 }
2563 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002564
Benjamin Peterson857ce152009-01-31 16:29:18 +00002565 q += 2;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002566
2567 if (ch < 0xD800 || ch > 0xDFFF) {
2568 *p++ = ch;
2569 continue;
2570 }
2571
2572 /* UTF-16 code pair: */
Antoine Pitrou715a63b2012-07-21 00:52:06 +02002573 if (e - q < 2) {
Serhiy Storchakac4b82c02013-01-08 23:12:00 +02002574 q -= 2;
2575 if (consumed)
2576 break;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002577 errmsg = "unexpected end of data";
Serhiy Storchakac4b82c02013-01-08 23:12:00 +02002578 startinpos = ((const char *)q)-starts;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002579 endinpos = ((const char *)e)-starts;
2580 goto utf16Error;
2581 }
2582 if (0xD800 <= ch && ch <= 0xDBFF) {
2583 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2584 q += 2;
2585 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002586#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002587 *p++ = ch;
2588 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002589#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002590 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002591#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002592 continue;
2593 }
2594 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002595 errmsg = "illegal UTF-16 surrogate";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002596 startinpos = (((const char *)q)-4)-starts;
2597 endinpos = startinpos+2;
2598 goto utf16Error;
2599 }
2600
Benjamin Peterson857ce152009-01-31 16:29:18 +00002601 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002602 errmsg = "illegal encoding";
2603 startinpos = (((const char *)q)-2)-starts;
2604 endinpos = startinpos+2;
2605 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002606
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002607 utf16Error:
2608 outpos = p-PyUnicode_AS_UNICODE(unicode);
2609 if (unicode_decode_call_errorhandler(
2610 errors, &errorHandler,
2611 "utf16", errmsg,
2612 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2613 &unicode, &outpos, &p))
2614 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002615 }
2616
2617 if (byteorder)
2618 *byteorder = bo;
2619
Walter Dörwald69652032004-09-07 20:24:22 +00002620 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002621 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00002622
Guido van Rossumd57fd912000-03-10 22:53:23 +00002623 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002624 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002625 goto onError;
2626
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002627 Py_XDECREF(errorHandler);
2628 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002629 return (PyObject *)unicode;
2630
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002631 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002632 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002633 Py_XDECREF(errorHandler);
2634 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002635 return NULL;
2636}
2637
Tim Peters772747b2001-08-09 22:21:55 +00002638PyObject *
2639PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002640 Py_ssize_t size,
2641 const char *errors,
2642 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002643{
2644 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002645 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002646 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002647#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002648 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002649#else
2650 const int pairs = 0;
2651#endif
Tim Peters772747b2001-08-09 22:21:55 +00002652 /* Offsets from p for storing byte pairs in the right order. */
2653#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2654 int ihi = 1, ilo = 0;
2655#else
2656 int ihi = 0, ilo = 1;
2657#endif
2658
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002659#define STORECHAR(CH) \
2660 do { \
2661 p[ihi] = ((CH) >> 8) & 0xff; \
2662 p[ilo] = (CH) & 0xff; \
2663 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00002664 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002665
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002666#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002667 for (i = pairs = 0; i < size; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002668 if (s[i] >= 0x10000)
2669 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002670#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002671 /* 2 * (size + pairs + (byteorder == 0)) */
2672 if (size > PY_SSIZE_T_MAX ||
2673 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002674 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002675 nsize = size + pairs + (byteorder == 0);
2676 bytesize = nsize * 2;
2677 if (bytesize / 2 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002678 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002679 v = PyString_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002680 if (v == NULL)
2681 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002682
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002683 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002684 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002685 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002686 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002687 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002688
2689 if (byteorder == -1) {
2690 /* force LE */
2691 ihi = 1;
2692 ilo = 0;
2693 }
2694 else if (byteorder == 1) {
2695 /* force BE */
2696 ihi = 0;
2697 ilo = 1;
2698 }
2699
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002700 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002701 Py_UNICODE ch = *s++;
2702 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002703#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002704 if (ch >= 0x10000) {
2705 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2706 ch = 0xD800 | ((ch-0x10000) >> 10);
2707 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002708#endif
Tim Peters772747b2001-08-09 22:21:55 +00002709 STORECHAR(ch);
2710 if (ch2)
2711 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002712 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002713 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002714#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002715}
2716
2717PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2718{
2719 if (!PyUnicode_Check(unicode)) {
2720 PyErr_BadArgument();
2721 return NULL;
2722 }
2723 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002724 PyUnicode_GET_SIZE(unicode),
2725 NULL,
2726 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002727}
2728
2729/* --- Unicode Escape Codec ----------------------------------------------- */
2730
Fredrik Lundh06d12682001-01-24 07:59:11 +00002731static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002732
Guido van Rossumd57fd912000-03-10 22:53:23 +00002733PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002734 Py_ssize_t size,
2735 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002736{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002737 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002738 Py_ssize_t startinpos;
2739 Py_ssize_t endinpos;
2740 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002741 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002742 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002743 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002744 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002745 char* message;
2746 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002747 PyObject *errorHandler = NULL;
2748 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002749
Guido van Rossumd57fd912000-03-10 22:53:23 +00002750 /* Escaped strings will always be longer than the resulting
2751 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002752 length after conversion to the true value.
2753 (but if the error callback returns a long replacement string
2754 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002755 v = _PyUnicode_New(size);
2756 if (v == NULL)
2757 goto onError;
2758 if (size == 0)
2759 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002760
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002761 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002762 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002763
Guido van Rossumd57fd912000-03-10 22:53:23 +00002764 while (s < end) {
2765 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002766 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002767 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002768
2769 /* Non-escape characters are interpreted as Unicode ordinals */
2770 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002771 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002772 continue;
2773 }
2774
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002775 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002776 /* \ - Escapes */
2777 s++;
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002778 c = *s++;
2779 if (s > end)
2780 c = '\0'; /* Invalid after \ */
2781 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002782
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002783 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002784 case '\n': break;
2785 case '\\': *p++ = '\\'; break;
2786 case '\'': *p++ = '\''; break;
2787 case '\"': *p++ = '\"'; break;
2788 case 'b': *p++ = '\b'; break;
2789 case 'f': *p++ = '\014'; break; /* FF */
2790 case 't': *p++ = '\t'; break;
2791 case 'n': *p++ = '\n'; break;
2792 case 'r': *p++ = '\r'; break;
2793 case 'v': *p++ = '\013'; break; /* VT */
2794 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2795
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002796 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002797 case '0': case '1': case '2': case '3':
2798 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002799 x = s[-1] - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002800 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002801 x = (x<<3) + *s++ - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002802 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002803 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002804 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002805 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002806 break;
2807
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002808 /* hex escapes */
2809 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002810 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002811 digits = 2;
2812 message = "truncated \\xXX escape";
2813 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002814
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002815 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002816 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002817 digits = 4;
2818 message = "truncated \\uXXXX escape";
2819 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002820
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002821 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002822 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002823 digits = 8;
2824 message = "truncated \\UXXXXXXXX escape";
2825 hexescape:
2826 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002827 outpos = p-PyUnicode_AS_UNICODE(v);
2828 if (s+digits>end) {
2829 endinpos = size;
2830 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002831 errors, &errorHandler,
2832 "unicodeescape", "end of string in escape sequence",
2833 starts, size, &startinpos, &endinpos, &exc, &s,
2834 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002835 goto onError;
2836 goto nextByte;
2837 }
2838 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002839 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002840 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002841 endinpos = (s+i+1)-starts;
2842 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002843 errors, &errorHandler,
2844 "unicodeescape", message,
2845 starts, size, &startinpos, &endinpos, &exc, &s,
2846 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002847 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002848 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002849 }
2850 chr = (chr<<4) & ~0xF;
2851 if (c >= '0' && c <= '9')
2852 chr += c - '0';
2853 else if (c >= 'a' && c <= 'f')
2854 chr += 10 + c - 'a';
2855 else
2856 chr += 10 + c - 'A';
2857 }
2858 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002859 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002860 /* _decoding_error will have already written into the
2861 target buffer. */
2862 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002863 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002864 /* when we get here, chr is a 32-bit unicode character */
2865 if (chr <= 0xffff)
2866 /* UCS-2 character */
2867 *p++ = (Py_UNICODE) chr;
2868 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002869 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002870 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002871#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002872 *p++ = chr;
2873#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002874 chr -= 0x10000L;
2875 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002876 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002877#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002878 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002879 endinpos = s-starts;
2880 outpos = p-PyUnicode_AS_UNICODE(v);
2881 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002882 errors, &errorHandler,
2883 "unicodeescape", "illegal Unicode character",
2884 starts, size, &startinpos, &endinpos, &exc, &s,
2885 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002886 goto onError;
2887 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002888 break;
2889
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002890 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00002891 case 'N':
2892 message = "malformed \\N character escape";
2893 if (ucnhash_CAPI == NULL) {
2894 /* load the unicode data module */
Larry Hastings402b73f2010-03-25 00:54:54 +00002895 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002896 if (ucnhash_CAPI == NULL)
2897 goto ucnhashError;
2898 }
2899 if (*s == '{') {
2900 const char *start = s+1;
2901 /* look for the closing brace */
2902 while (*s != '}' && s < end)
2903 s++;
2904 if (s > start && s < end && *s == '}') {
2905 /* found a name. look it up in the unicode database */
2906 message = "unknown Unicode character name";
2907 s++;
Serhiy Storchaka1d3acd42013-01-21 11:48:24 +02002908 if (s - start - 1 <= INT_MAX &&
2909 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002910 goto store;
2911 }
2912 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002913 endinpos = s-starts;
2914 outpos = p-PyUnicode_AS_UNICODE(v);
2915 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002916 errors, &errorHandler,
2917 "unicodeescape", message,
2918 starts, size, &startinpos, &endinpos, &exc, &s,
2919 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002920 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002921 break;
2922
2923 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002924 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002925 message = "\\ at end of string";
2926 s--;
2927 endinpos = s-starts;
2928 outpos = p-PyUnicode_AS_UNICODE(v);
2929 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002930 errors, &errorHandler,
2931 "unicodeescape", message,
2932 starts, size, &startinpos, &endinpos, &exc, &s,
2933 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002934 goto onError;
2935 }
2936 else {
2937 *p++ = '\\';
2938 *p++ = (unsigned char)s[-1];
2939 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002940 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002941 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002942 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002943 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002944 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002945 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002946 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002947 Py_XDECREF(errorHandler);
2948 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002949 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002950
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002951 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002952 PyErr_SetString(
2953 PyExc_UnicodeError,
2954 "\\N escapes not supported (can't load unicodedata module)"
2955 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002956 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002957 Py_XDECREF(errorHandler);
2958 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002959 return NULL;
2960
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002961 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002962 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002963 Py_XDECREF(errorHandler);
2964 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002965 return NULL;
2966}
2967
2968/* Return a Unicode-Escape string version of the Unicode object.
2969
2970 If quotes is true, the string is enclosed in u"" or u'' quotes as
2971 appropriate.
2972
2973*/
2974
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00002975Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002976 Py_ssize_t size,
2977 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00002978{
2979 /* like wcschr, but doesn't stop at NULL characters */
2980
2981 while (size-- > 0) {
2982 if (*s == ch)
2983 return s;
2984 s++;
2985 }
2986
2987 return NULL;
2988}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002989
Guido van Rossumd57fd912000-03-10 22:53:23 +00002990static
2991PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002992 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002993 int quotes)
2994{
2995 PyObject *repr;
2996 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002997
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002998 static const char *hexdigit = "0123456789abcdef";
Neal Norwitze7d8be82008-07-31 17:17:14 +00002999#ifdef Py_UNICODE_WIDE
3000 const Py_ssize_t expandsize = 10;
3001#else
3002 const Py_ssize_t expandsize = 6;
3003#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003004
Neal Norwitz17753ec2006-08-21 22:21:19 +00003005 /* XXX(nnorwitz): rather than over-allocating, it would be
3006 better to choose a different scheme. Perhaps scan the
3007 first N-chars of the string and allocate based on that size.
3008 */
3009 /* Initial allocation is based on the longest-possible unichr
3010 escape.
3011
3012 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3013 unichr, so in this case it's the longest unichr escape. In
3014 narrow (UTF-16) builds this is five chars per source unichr
3015 since there are two unichrs in the surrogate pair, so in narrow
3016 (UTF-16) builds it's not the longest unichr escape.
3017
3018 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3019 so in the narrow (UTF-16) build case it's the longest unichr
3020 escape.
3021 */
3022
Neal Norwitze7d8be82008-07-31 17:17:14 +00003023 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003024 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00003025
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003026 repr = PyString_FromStringAndSize(NULL,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003027 2
3028 + expandsize*size
3029 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003030 if (repr == NULL)
3031 return NULL;
3032
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003033 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003034
3035 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003036 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00003037 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00003038 !findchar(s, size, '"')) ? '"' : '\'';
3039 }
3040 while (size-- > 0) {
3041 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003042
Hye-Shik Chang835b2432005-12-17 04:38:31 +00003043 /* Escape quotes and backslashes */
3044 if ((quotes &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003045 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003046 *p++ = '\\';
3047 *p++ = (char) ch;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003048 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003049 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003050
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003051#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003052 /* Map 21-bit characters to '\U00xxxxxx' */
3053 else if (ch >= 0x10000) {
3054 *p++ = '\\';
3055 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003056 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
3057 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
3058 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
3059 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
3060 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
3061 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
3062 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003063 *p++ = hexdigit[ch & 0x0000000F];
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003064 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003065 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003066#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003067 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3068 else if (ch >= 0xD800 && ch < 0xDC00) {
3069 Py_UNICODE ch2;
3070 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003071
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003072 ch2 = *s++;
3073 size--;
Georg Brandle27d0442010-08-01 20:54:30 +00003074 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003075 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3076 *p++ = '\\';
3077 *p++ = 'U';
3078 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
3079 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
3080 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
3081 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
3082 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
3083 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
3084 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
3085 *p++ = hexdigit[ucs & 0x0000000F];
3086 continue;
3087 }
3088 /* Fall through: isolated surrogates are copied as-is */
3089 s--;
3090 size++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003091 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003092#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003093
Guido van Rossumd57fd912000-03-10 22:53:23 +00003094 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003095 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003096 *p++ = '\\';
3097 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003098 *p++ = hexdigit[(ch >> 12) & 0x000F];
3099 *p++ = hexdigit[(ch >> 8) & 0x000F];
3100 *p++ = hexdigit[(ch >> 4) & 0x000F];
3101 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003102 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003103
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003104 /* Map special whitespace to '\t', \n', '\r' */
3105 else if (ch == '\t') {
3106 *p++ = '\\';
3107 *p++ = 't';
3108 }
3109 else if (ch == '\n') {
3110 *p++ = '\\';
3111 *p++ = 'n';
3112 }
3113 else if (ch == '\r') {
3114 *p++ = '\\';
3115 *p++ = 'r';
3116 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003117
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003118 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003119 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003120 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003121 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003122 *p++ = hexdigit[(ch >> 4) & 0x000F];
3123 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003124 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003125
Guido van Rossumd57fd912000-03-10 22:53:23 +00003126 /* Copy everything else as-is */
3127 else
3128 *p++ = (char) ch;
3129 }
3130 if (quotes)
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003131 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003132
3133 *p = '\0';
Benjamin Petersonbea424a2010-04-03 00:57:33 +00003134 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
3135 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003136 return repr;
3137}
3138
3139PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003140 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003141{
3142 return unicodeescape_string(s, size, 0);
3143}
3144
3145PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3146{
3147 if (!PyUnicode_Check(unicode)) {
3148 PyErr_BadArgument();
3149 return NULL;
3150 }
3151 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003152 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003153}
3154
3155/* --- Raw Unicode Escape Codec ------------------------------------------- */
3156
3157PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003158 Py_ssize_t size,
3159 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003160{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003161 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003162 Py_ssize_t startinpos;
3163 Py_ssize_t endinpos;
3164 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003165 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003166 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003167 const char *end;
3168 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003169 PyObject *errorHandler = NULL;
3170 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003171
Guido van Rossumd57fd912000-03-10 22:53:23 +00003172 /* Escaped strings will always be longer than the resulting
3173 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003174 length after conversion to the true value. (But decoding error
3175 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003176 v = _PyUnicode_New(size);
3177 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003178 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003179 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003180 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003181 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003182 end = s + size;
3183 while (s < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003184 unsigned char c;
3185 Py_UCS4 x;
3186 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003187 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003188
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003189 /* Non-escape characters are interpreted as Unicode ordinals */
3190 if (*s != '\\') {
3191 *p++ = (unsigned char)*s++;
3192 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003193 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003194 startinpos = s-starts;
3195
3196 /* \u-escapes are only interpreted iff the number of leading
3197 backslashes if odd */
3198 bs = s;
3199 for (;s < end;) {
3200 if (*s != '\\')
3201 break;
3202 *p++ = (unsigned char)*s++;
3203 }
3204 if (((s - bs) & 1) == 0 ||
3205 s >= end ||
3206 (*s != 'u' && *s != 'U')) {
3207 continue;
3208 }
3209 p--;
3210 count = *s=='u' ? 4 : 8;
3211 s++;
3212
3213 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3214 outpos = p-PyUnicode_AS_UNICODE(v);
3215 for (x = 0, i = 0; i < count; ++i, ++s) {
3216 c = (unsigned char)*s;
3217 if (!isxdigit(c)) {
3218 endinpos = s-starts;
3219 if (unicode_decode_call_errorhandler(
3220 errors, &errorHandler,
3221 "rawunicodeescape", "truncated \\uXXXX",
3222 starts, size, &startinpos, &endinpos, &exc, &s,
3223 &v, &outpos, &p))
3224 goto onError;
3225 goto nextByte;
3226 }
3227 x = (x<<4) & ~0xF;
3228 if (c >= '0' && c <= '9')
3229 x += c - '0';
3230 else if (c >= 'a' && c <= 'f')
3231 x += 10 + c - 'a';
3232 else
3233 x += 10 + c - 'A';
3234 }
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003235 if (x <= 0xffff)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003236 /* UCS-2 character */
3237 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003238 else if (x <= 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003239 /* UCS-4 character. Either store directly, or as
3240 surrogate pair. */
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003241#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003242 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003243#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003244 x -= 0x10000L;
3245 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3246 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003247#endif
3248 } else {
3249 endinpos = s-starts;
3250 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003251 if (unicode_decode_call_errorhandler(
3252 errors, &errorHandler,
3253 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003254 starts, size, &startinpos, &endinpos, &exc, &s,
3255 &v, &outpos, &p))
3256 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003257 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003258 nextByte:
3259 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003260 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00003261 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003262 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003263 Py_XDECREF(errorHandler);
3264 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003265 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003266
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003267 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003268 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003269 Py_XDECREF(errorHandler);
3270 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003271 return NULL;
3272}
3273
3274PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003275 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003276{
3277 PyObject *repr;
3278 char *p;
3279 char *q;
3280
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003281 static const char *hexdigit = "0123456789abcdef";
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003282#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00003283 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003284#else
Neal Norwitze7d8be82008-07-31 17:17:14 +00003285 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003286#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00003287
Neal Norwitze7d8be82008-07-31 17:17:14 +00003288 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003289 return PyErr_NoMemory();
Benjamin Peterson857ce152009-01-31 16:29:18 +00003290
Neal Norwitze7d8be82008-07-31 17:17:14 +00003291 repr = PyString_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003292 if (repr == NULL)
3293 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003294 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003295 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003296
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003297 p = q = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003298 while (size-- > 0) {
3299 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003300#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003301 /* Map 32-bit characters to '\Uxxxxxxxx' */
3302 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003303 *p++ = '\\';
3304 *p++ = 'U';
3305 *p++ = hexdigit[(ch >> 28) & 0xf];
3306 *p++ = hexdigit[(ch >> 24) & 0xf];
3307 *p++ = hexdigit[(ch >> 20) & 0xf];
3308 *p++ = hexdigit[(ch >> 16) & 0xf];
3309 *p++ = hexdigit[(ch >> 12) & 0xf];
3310 *p++ = hexdigit[(ch >> 8) & 0xf];
3311 *p++ = hexdigit[(ch >> 4) & 0xf];
3312 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003313 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003314 else
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003315#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003316 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3317 if (ch >= 0xD800 && ch < 0xDC00) {
3318 Py_UNICODE ch2;
3319 Py_UCS4 ucs;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003320
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003321 ch2 = *s++;
3322 size--;
Georg Brandle27d0442010-08-01 20:54:30 +00003323 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003324 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3325 *p++ = '\\';
3326 *p++ = 'U';
3327 *p++ = hexdigit[(ucs >> 28) & 0xf];
3328 *p++ = hexdigit[(ucs >> 24) & 0xf];
3329 *p++ = hexdigit[(ucs >> 20) & 0xf];
3330 *p++ = hexdigit[(ucs >> 16) & 0xf];
3331 *p++ = hexdigit[(ucs >> 12) & 0xf];
3332 *p++ = hexdigit[(ucs >> 8) & 0xf];
3333 *p++ = hexdigit[(ucs >> 4) & 0xf];
3334 *p++ = hexdigit[ucs & 0xf];
3335 continue;
3336 }
3337 /* Fall through: isolated surrogates are copied as-is */
3338 s--;
3339 size++;
3340 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003341#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003342 /* Map 16-bit characters to '\uxxxx' */
3343 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003344 *p++ = '\\';
3345 *p++ = 'u';
3346 *p++ = hexdigit[(ch >> 12) & 0xf];
3347 *p++ = hexdigit[(ch >> 8) & 0xf];
3348 *p++ = hexdigit[(ch >> 4) & 0xf];
3349 *p++ = hexdigit[ch & 15];
3350 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003351 /* Copy everything else as-is */
3352 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003353 *p++ = (char) ch;
3354 }
3355 *p = '\0';
Benjamin Petersonbea424a2010-04-03 00:57:33 +00003356 if (_PyString_Resize(&repr, p - q))
3357 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003358 return repr;
3359}
3360
3361PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3362{
3363 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003364 PyErr_BadArgument();
3365 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003366 }
3367 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003368 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003369}
3370
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003371/* --- Unicode Internal Codec ------------------------------------------- */
3372
3373PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003374 Py_ssize_t size,
3375 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003376{
3377 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003378 Py_ssize_t startinpos;
3379 Py_ssize_t endinpos;
3380 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003381 PyUnicodeObject *v;
3382 Py_UNICODE *p;
3383 const char *end;
3384 const char *reason;
3385 PyObject *errorHandler = NULL;
3386 PyObject *exc = NULL;
3387
Neal Norwitzd43069c2006-01-08 01:12:10 +00003388#ifdef Py_UNICODE_WIDE
3389 Py_UNICODE unimax = PyUnicode_GetMax();
3390#endif
3391
Armin Rigo7ccbca92006-10-04 12:17:45 +00003392 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003393 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3394 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003395 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003396 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003397 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003398 p = PyUnicode_AS_UNICODE(v);
3399 end = s + size;
3400
3401 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00003402 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003403 /* We have to sanity check the raw data, otherwise doom looms for
3404 some malformed UCS-4 data. */
3405 if (
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003406#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003407 *p > unimax || *p < 0 ||
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003408#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003409 end-s < Py_UNICODE_SIZE
3410 )
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003411 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003412 startinpos = s - starts;
3413 if (end-s < Py_UNICODE_SIZE) {
3414 endinpos = end-starts;
3415 reason = "truncated input";
3416 }
3417 else {
3418 endinpos = s - starts + Py_UNICODE_SIZE;
3419 reason = "illegal code point (> 0x10FFFF)";
3420 }
3421 outpos = p - PyUnicode_AS_UNICODE(v);
3422 if (unicode_decode_call_errorhandler(
3423 errors, &errorHandler,
3424 "unicode_internal", reason,
3425 starts, size, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +00003426 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003427 goto onError;
3428 }
3429 }
3430 else {
3431 p++;
3432 s += Py_UNICODE_SIZE;
3433 }
3434 }
3435
Martin v. Löwis412fb672006-04-13 06:34:32 +00003436 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003437 goto onError;
3438 Py_XDECREF(errorHandler);
3439 Py_XDECREF(exc);
3440 return (PyObject *)v;
3441
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003442 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003443 Py_XDECREF(v);
3444 Py_XDECREF(errorHandler);
3445 Py_XDECREF(exc);
3446 return NULL;
3447}
3448
Guido van Rossumd57fd912000-03-10 22:53:23 +00003449/* --- Latin-1 Codec ------------------------------------------------------ */
3450
3451PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003452 Py_ssize_t size,
3453 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003454{
3455 PyUnicodeObject *v;
3456 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003457
Guido van Rossumd57fd912000-03-10 22:53:23 +00003458 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003459 if (size == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003460 Py_UNICODE r = *(unsigned char*)s;
3461 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003462 }
3463
Guido van Rossumd57fd912000-03-10 22:53:23 +00003464 v = _PyUnicode_New(size);
3465 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003466 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003467 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003468 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003469 p = PyUnicode_AS_UNICODE(v);
3470 while (size-- > 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003471 *p++ = (unsigned char)*s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003472 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003473
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003474 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003475 Py_XDECREF(v);
3476 return NULL;
3477}
3478
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003479/* create or adjust a UnicodeEncodeError */
3480static void make_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003481 const char *encoding,
3482 const Py_UNICODE *unicode, Py_ssize_t size,
3483 Py_ssize_t startpos, Py_ssize_t endpos,
3484 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003485{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003486 if (*exceptionObject == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003487 *exceptionObject = PyUnicodeEncodeError_Create(
3488 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003489 }
3490 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003491 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3492 goto onError;
3493 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3494 goto onError;
3495 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3496 goto onError;
3497 return;
3498 onError:
3499 Py_DECREF(*exceptionObject);
3500 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003501 }
3502}
3503
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003504/* raises a UnicodeEncodeError */
3505static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003506 const char *encoding,
3507 const Py_UNICODE *unicode, Py_ssize_t size,
3508 Py_ssize_t startpos, Py_ssize_t endpos,
3509 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003510{
3511 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003512 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003513 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003514 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003515}
3516
3517/* error handling callback helper:
3518 build arguments, call the callback and check the arguments,
3519 put the result into newpos and return the replacement string, which
3520 has to be freed by the caller */
3521static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003522 PyObject **errorHandler,
3523 const char *encoding, const char *reason,
3524 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3525 Py_ssize_t startpos, Py_ssize_t endpos,
3526 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003527{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003528 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003529
3530 PyObject *restuple;
3531 PyObject *resunicode;
3532
3533 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003534 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003535 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003536 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003537 }
3538
3539 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003540 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003541 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003542 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003543
3544 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003545 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003546 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003547 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003548 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00003549 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003550 Py_DECREF(restuple);
3551 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003552 }
3553 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003554 &resunicode, newpos)) {
3555 Py_DECREF(restuple);
3556 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003557 }
3558 if (*newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003559 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003560 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003561 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3562 Py_DECREF(restuple);
3563 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003564 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003565 Py_INCREF(resunicode);
3566 Py_DECREF(restuple);
3567 return resunicode;
3568}
3569
3570static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003571 Py_ssize_t size,
3572 const char *errors,
3573 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003574{
3575 /* output object */
3576 PyObject *res;
3577 /* pointers to the beginning and end+1 of input */
3578 const Py_UNICODE *startp = p;
3579 const Py_UNICODE *endp = p + size;
3580 /* pointer to the beginning of the unencodable characters */
3581 /* const Py_UNICODE *badp = NULL; */
3582 /* pointer into the output */
3583 char *str;
3584 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003585 Py_ssize_t respos = 0;
3586 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00003587 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3588 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003589 PyObject *errorHandler = NULL;
3590 PyObject *exc = NULL;
3591 /* the following variable is used for caching string comparisons
3592 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3593 int known_errorHandler = -1;
3594
3595 /* allocate enough for a simple encoding without
3596 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003597 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003598 if (res == NULL)
3599 goto onError;
3600 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003601 return res;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003602 str = PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003603 ressize = size;
3604
3605 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003606 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003607
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003608 /* can we encode this? */
3609 if (c<limit) {
3610 /* no overflow check, because we know that the space is enough */
3611 *str++ = (char)c;
3612 ++p;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003613 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003614 else {
3615 Py_ssize_t unicodepos = p-startp;
3616 Py_ssize_t requiredsize;
3617 PyObject *repunicode;
3618 Py_ssize_t repsize;
3619 Py_ssize_t newpos;
3620 Py_ssize_t respos;
3621 Py_UNICODE *uni2;
3622 /* startpos for collecting unencodable chars */
3623 const Py_UNICODE *collstart = p;
3624 const Py_UNICODE *collend = p;
3625 /* find all unecodable characters */
3626 while ((collend < endp) && ((*collend)>=limit))
3627 ++collend;
3628 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3629 if (known_errorHandler==-1) {
3630 if ((errors==NULL) || (!strcmp(errors, "strict")))
3631 known_errorHandler = 1;
3632 else if (!strcmp(errors, "replace"))
3633 known_errorHandler = 2;
3634 else if (!strcmp(errors, "ignore"))
3635 known_errorHandler = 3;
3636 else if (!strcmp(errors, "xmlcharrefreplace"))
3637 known_errorHandler = 4;
3638 else
3639 known_errorHandler = 0;
3640 }
3641 switch (known_errorHandler) {
3642 case 1: /* strict */
3643 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3644 goto onError;
3645 case 2: /* replace */
3646 while (collstart++<collend)
3647 *str++ = '?'; /* fall through */
3648 case 3: /* ignore */
3649 p = collend;
3650 break;
3651 case 4: /* xmlcharrefreplace */
3652 respos = str-PyString_AS_STRING(res);
3653 /* determine replacement size (temporarily (mis)uses p) */
3654 for (p = collstart, repsize = 0; p < collend; ++p) {
3655 if (*p<10)
3656 repsize += 2+1+1;
3657 else if (*p<100)
3658 repsize += 2+2+1;
3659 else if (*p<1000)
3660 repsize += 2+3+1;
3661 else if (*p<10000)
3662 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003663#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003664 else
3665 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003666#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003667 else if (*p<100000)
3668 repsize += 2+5+1;
3669 else if (*p<1000000)
3670 repsize += 2+6+1;
3671 else
3672 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003673#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003674 }
3675 requiredsize = respos+repsize+(endp-collend);
3676 if (requiredsize > ressize) {
3677 if (requiredsize<2*ressize)
3678 requiredsize = 2*ressize;
3679 if (_PyString_Resize(&res, requiredsize))
3680 goto onError;
3681 str = PyString_AS_STRING(res) + respos;
3682 ressize = requiredsize;
3683 }
3684 /* generate replacement (temporarily (mis)uses p) */
3685 for (p = collstart; p < collend; ++p) {
3686 str += sprintf(str, "&#%d;", (int)*p);
3687 }
3688 p = collend;
3689 break;
3690 default:
3691 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3692 encoding, reason, startp, size, &exc,
3693 collstart-startp, collend-startp, &newpos);
3694 if (repunicode == NULL)
3695 goto onError;
3696 /* need more space? (at least enough for what we have+the
3697 replacement+the rest of the string, so we won't have to
3698 check space for encodable characters) */
3699 respos = str-PyString_AS_STRING(res);
3700 repsize = PyUnicode_GET_SIZE(repunicode);
3701 requiredsize = respos+repsize+(endp-collend);
3702 if (requiredsize > ressize) {
3703 if (requiredsize<2*ressize)
3704 requiredsize = 2*ressize;
3705 if (_PyString_Resize(&res, requiredsize)) {
3706 Py_DECREF(repunicode);
3707 goto onError;
3708 }
3709 str = PyString_AS_STRING(res) + respos;
3710 ressize = requiredsize;
3711 }
3712 /* check if there is anything unencodable in the replacement
3713 and copy it to the output */
3714 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3715 c = *uni2;
3716 if (c >= limit) {
3717 raise_encode_exception(&exc, encoding, startp, size,
3718 unicodepos, unicodepos+1, reason);
3719 Py_DECREF(repunicode);
3720 goto onError;
3721 }
3722 *str = (char)c;
3723 }
3724 p = startp + newpos;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003725 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00003726 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00003727 }
3728 }
    /* Resize if we allocated too much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003730 respos = str-PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003731 if (respos<ressize)
        /* If this fails, res will be NULL */
3733 _PyString_Resize(&res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003734 Py_XDECREF(errorHandler);
3735 Py_XDECREF(exc);
3736 return res;
3737
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003738 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003739 Py_XDECREF(res);
3740 Py_XDECREF(errorHandler);
3741 Py_XDECREF(exc);
3742 return NULL;
3743}
3744
Guido van Rossumd57fd912000-03-10 22:53:23 +00003745PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003746 Py_ssize_t size,
3747 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003748{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003749 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003750}
3751
3752PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3753{
3754 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003755 PyErr_BadArgument();
3756 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003757 }
3758 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003759 PyUnicode_GET_SIZE(unicode),
3760 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003761}
3762
3763/* --- 7-bit ASCII Codec -------------------------------------------------- */
3764
Guido van Rossumd57fd912000-03-10 22:53:23 +00003765PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003766 Py_ssize_t size,
3767 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003768{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003769 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003770 PyUnicodeObject *v;
3771 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003772 Py_ssize_t startinpos;
3773 Py_ssize_t endinpos;
3774 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003775 const char *e;
3776 PyObject *errorHandler = NULL;
3777 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003778
Guido van Rossumd57fd912000-03-10 22:53:23 +00003779 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003780 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003781 Py_UNICODE r = *(unsigned char*)s;
3782 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003783 }
Tim Petersced69f82003-09-16 20:30:58 +00003784
Guido van Rossumd57fd912000-03-10 22:53:23 +00003785 v = _PyUnicode_New(size);
3786 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003787 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003788 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003789 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003790 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003791 e = s + size;
3792 while (s < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003793 register unsigned char c = (unsigned char)*s;
3794 if (c < 128) {
3795 *p++ = c;
3796 ++s;
3797 }
3798 else {
3799 startinpos = s-starts;
3800 endinpos = startinpos + 1;
3801 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3802 if (unicode_decode_call_errorhandler(
3803 errors, &errorHandler,
3804 "ascii", "ordinal not in range(128)",
3805 starts, size, &startinpos, &endinpos, &exc, &s,
3806 &v, &outpos, &p))
3807 goto onError;
3808 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003809 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003810 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003811 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3812 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003813 Py_XDECREF(errorHandler);
3814 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003815 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003816
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003817 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003818 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003819 Py_XDECREF(errorHandler);
3820 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003821 return NULL;
3822}
3823
Guido van Rossumd57fd912000-03-10 22:53:23 +00003824PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003825 Py_ssize_t size,
3826 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003827{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003828 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003829}
3830
3831PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3832{
3833 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003834 PyErr_BadArgument();
3835 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003836 }
3837 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003838 PyUnicode_GET_SIZE(unicode),
3839 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003840}
3841
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003842#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003843
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003844/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003845
Hirokazu Yamamoto52a34922009-03-21 10:32:52 +00003846#if SIZEOF_INT < SIZEOF_SIZE_T
Martin v. Löwisd8251432006-06-14 05:21:04 +00003847#define NEED_RETRY
3848#endif
3849
3850/* XXX This code is limited to "true" double-byte encodings, as
3851 a) it assumes an incomplete character consists of a single byte, and
3852 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003853 encodings, see IsDBCSLeadByteEx documentation. */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003854
/* Report whether the byte at s[offset] should be treated as a DBCS lead
   byte.

   Returns 0 when IsDBCSLeadByte() rejects the byte.  Otherwise the
   position CharPrev() returns for it is inspected, and 1 is returned when
   that position is the byte itself, when the byte found there is not a
   lead byte, or when it lies exactly two bytes back; 0 in any other case. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *here = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*here))
        return 0;

    before = CharPrev(s, here);
    if (before == here)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return here - before == 2;
}
3865
3866/*
3867 * Decode MBCS string into unicode object. If 'final' is set, converts
3868 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3869 */
3870static int decode_mbcs(PyUnicodeObject **v,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003871 const char *s, /* MBCS string */
3872 int size, /* sizeof MBCS string */
3873 int final)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003874{
3875 Py_UNICODE *p;
3876 Py_ssize_t n = 0;
3877 int usize = 0;
3878
3879 assert(size >= 0);
3880
3881 /* Skip trailing lead-byte unless 'final' is set */
3882 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003883 --size;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003884
3885 /* First get the size of the result */
3886 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003887 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3888 if (usize == 0) {
3889 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3890 return -1;
3891 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003892 }
3893
3894 if (*v == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003895 /* Create unicode object */
3896 *v = _PyUnicode_New(usize);
3897 if (*v == NULL)
3898 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003899 }
3900 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003901 /* Extend unicode object */
3902 n = PyUnicode_GET_SIZE(*v);
3903 if (_PyUnicode_Resize(v, n + usize) < 0)
3904 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003905 }
3906
3907 /* Do the conversion */
3908 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003909 p = PyUnicode_AS_UNICODE(*v) + n;
3910 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3911 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3912 return -1;
3913 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003914 }
3915
3916 return size;
3917}
3918
3919PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003920 Py_ssize_t size,
3921 const char *errors,
3922 Py_ssize_t *consumed)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003923{
3924 PyUnicodeObject *v = NULL;
3925 int done;
3926
3927 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003928 *consumed = 0;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003929
3930#ifdef NEED_RETRY
3931 retry:
3932 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003933 done = decode_mbcs(&v, s, INT_MAX, 0);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003934 else
3935#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003936 done = decode_mbcs(&v, s, (int)size, !consumed);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003937
3938 if (done < 0) {
3939 Py_XDECREF(v);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003940 return NULL;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003941 }
3942
3943 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003944 *consumed += done;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003945
3946#ifdef NEED_RETRY
3947 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003948 s += done;
3949 size -= done;
3950 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003951 }
3952#endif
3953
3954 return (PyObject *)v;
3955}
3956
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003957PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003958 Py_ssize_t size,
3959 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003960{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003961 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3962}
3963
3964/*
3965 * Convert unicode into string object (MBCS).
3966 * Returns 0 if succeed, -1 otherwise.
3967 */
3968static int encode_mbcs(PyObject **repr,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003969 const Py_UNICODE *p, /* unicode */
3970 int size) /* size of unicode */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003971{
3972 int mbcssize = 0;
3973 Py_ssize_t n = 0;
3974
3975 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003976
3977 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003978 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003979 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3980 if (mbcssize == 0) {
3981 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3982 return -1;
3983 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003984 }
3985
Martin v. Löwisd8251432006-06-14 05:21:04 +00003986 if (*repr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003987 /* Create string object */
3988 *repr = PyString_FromStringAndSize(NULL, mbcssize);
3989 if (*repr == NULL)
3990 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003991 }
3992 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003993 /* Extend string object */
3994 n = PyString_Size(*repr);
3995 if (_PyString_Resize(repr, n + mbcssize) < 0)
3996 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003997 }
3998
3999 /* Do the conversion */
4000 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004001 char *s = PyString_AS_STRING(*repr) + n;
4002 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4003 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4004 return -1;
4005 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00004006 }
4007
4008 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004009}
4010
4011PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004012 Py_ssize_t size,
4013 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004014{
Martin v. Löwisd8251432006-06-14 05:21:04 +00004015 PyObject *repr = NULL;
4016 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004017
Martin v. Löwisd8251432006-06-14 05:21:04 +00004018#ifdef NEED_RETRY
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004019 retry:
Martin v. Löwisd8251432006-06-14 05:21:04 +00004020 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004021 ret = encode_mbcs(&repr, p, INT_MAX);
Martin v. Löwisd8251432006-06-14 05:21:04 +00004022 else
4023#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004024 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004025
Martin v. Löwisd8251432006-06-14 05:21:04 +00004026 if (ret < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004027 Py_XDECREF(repr);
4028 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004029 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00004030
4031#ifdef NEED_RETRY
4032 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004033 p += INT_MAX;
4034 size -= INT_MAX;
4035 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004036 }
4037#endif
4038
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004039 return repr;
4040}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004041
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004042PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4043{
4044 if (!PyUnicode_Check(unicode)) {
4045 PyErr_BadArgument();
4046 return NULL;
4047 }
4048 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004049 PyUnicode_GET_SIZE(unicode),
4050 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004051}
4052
Martin v. Löwisd8251432006-06-14 05:21:04 +00004053#undef NEED_RETRY
4054
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004055#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004056
Guido van Rossumd57fd912000-03-10 22:53:23 +00004057/* --- Character Mapping Codec -------------------------------------------- */
4058
Guido van Rossumd57fd912000-03-10 22:53:23 +00004059PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004060 Py_ssize_t size,
4061 PyObject *mapping,
4062 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004063{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004064 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004065 Py_ssize_t startinpos;
4066 Py_ssize_t endinpos;
4067 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004068 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004069 PyUnicodeObject *v;
4070 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004071 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004072 PyObject *errorHandler = NULL;
4073 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004074 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004075 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004076
Guido van Rossumd57fd912000-03-10 22:53:23 +00004077 /* Default to Latin-1 */
4078 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004079 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004080
4081 v = _PyUnicode_New(size);
4082 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004083 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004084 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004085 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004086 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004087 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004088 if (PyUnicode_CheckExact(mapping)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004089 mapstring = PyUnicode_AS_UNICODE(mapping);
4090 maplen = PyUnicode_GET_SIZE(mapping);
4091 while (s < e) {
4092 unsigned char ch = *s;
4093 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004094
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004095 if (ch < maplen)
4096 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004097
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004098 if (x == 0xfffe) {
4099 /* undefined mapping */
4100 outpos = p-PyUnicode_AS_UNICODE(v);
4101 startinpos = s-starts;
4102 endinpos = startinpos+1;
4103 if (unicode_decode_call_errorhandler(
4104 errors, &errorHandler,
4105 "charmap", "character maps to <undefined>",
4106 starts, size, &startinpos, &endinpos, &exc, &s,
4107 &v, &outpos, &p)) {
4108 goto onError;
4109 }
4110 continue;
4111 }
4112 *p++ = x;
4113 ++s;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004114 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004115 }
4116 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004117 while (s < e) {
4118 unsigned char ch = *s;
4119 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004120
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004121 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4122 w = PyInt_FromLong((long)ch);
4123 if (w == NULL)
4124 goto onError;
4125 x = PyObject_GetItem(mapping, w);
4126 Py_DECREF(w);
4127 if (x == NULL) {
4128 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4129 /* No mapping found means: mapping is undefined. */
4130 PyErr_Clear();
Serhiy Storchaka95997452013-01-15 14:42:59 +02004131 goto Undefined;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004132 } else
4133 goto onError;
4134 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004135
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004136 /* Apply mapping */
Serhiy Storchaka95997452013-01-15 14:42:59 +02004137 if (x == Py_None)
4138 goto Undefined;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004139 if (PyInt_Check(x)) {
4140 long value = PyInt_AS_LONG(x);
Serhiy Storchaka95997452013-01-15 14:42:59 +02004141 if (value == 0xFFFE)
4142 goto Undefined;
Antoine Pitroue3ae3212012-11-17 21:14:58 +01004143 if (value < 0 || value > 0x10FFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004144 PyErr_SetString(PyExc_TypeError,
Antoine Pitroue3ae3212012-11-17 21:14:58 +01004145 "character mapping must be in range(0x110000)");
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004146 Py_DECREF(x);
4147 goto onError;
4148 }
Antoine Pitroue3ae3212012-11-17 21:14:58 +01004149
4150#ifndef Py_UNICODE_WIDE
4151 if (value > 0xFFFF) {
4152 /* see the code for 1-n mapping below */
4153 if (extrachars < 2) {
4154 /* resize first */
4155 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4156 Py_ssize_t needed = 10 - extrachars;
4157 extrachars += needed;
4158 /* XXX overflow detection missing */
4159 if (_PyUnicode_Resize(&v,
4160 PyUnicode_GET_SIZE(v) + needed) < 0) {
4161 Py_DECREF(x);
4162 goto onError;
4163 }
4164 p = PyUnicode_AS_UNICODE(v) + oldpos;
4165 }
4166 value -= 0x10000;
4167 *p++ = 0xD800 | (value >> 10);
4168 *p++ = 0xDC00 | (value & 0x3FF);
4169 extrachars -= 2;
4170 }
4171 else
4172#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004173 *p++ = (Py_UNICODE)value;
4174 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004175 else if (PyUnicode_Check(x)) {
4176 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004177
Serhiy Storchaka95997452013-01-15 14:42:59 +02004178 if (targetsize == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004179 /* 1-1 mapping */
Serhiy Storchaka95997452013-01-15 14:42:59 +02004180 Py_UNICODE value = *PyUnicode_AS_UNICODE(x);
4181 if (value == 0xFFFE)
4182 goto Undefined;
4183 *p++ = value;
4184 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004185 else if (targetsize > 1) {
4186 /* 1-n mapping */
4187 if (targetsize > extrachars) {
4188 /* resize first */
4189 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4190 Py_ssize_t needed = (targetsize - extrachars) + \
4191 (targetsize << 2);
4192 extrachars += needed;
4193 /* XXX overflow detection missing */
4194 if (_PyUnicode_Resize(&v,
4195 PyUnicode_GET_SIZE(v) + needed) < 0) {
4196 Py_DECREF(x);
4197 goto onError;
4198 }
4199 p = PyUnicode_AS_UNICODE(v) + oldpos;
4200 }
4201 Py_UNICODE_COPY(p,
4202 PyUnicode_AS_UNICODE(x),
4203 targetsize);
4204 p += targetsize;
4205 extrachars -= targetsize;
4206 }
4207 /* 1-0 mapping: skip the character */
4208 }
4209 else {
4210 /* wrong return value */
4211 PyErr_SetString(PyExc_TypeError,
4212 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004213 Py_DECREF(x);
4214 goto onError;
4215 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004216 Py_DECREF(x);
4217 ++s;
Serhiy Storchaka95997452013-01-15 14:42:59 +02004218 continue;
4219Undefined:
4220 /* undefined mapping */
4221 Py_XDECREF(x);
4222 outpos = p-PyUnicode_AS_UNICODE(v);
4223 startinpos = s-starts;
4224 endinpos = startinpos+1;
4225 if (unicode_decode_call_errorhandler(
4226 errors, &errorHandler,
4227 "charmap", "character maps to <undefined>",
4228 starts, size, &startinpos, &endinpos, &exc, &s,
4229 &v, &outpos, &p)) {
4230 goto onError;
4231 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004232 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004233 }
4234 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004235 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4236 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004237 Py_XDECREF(errorHandler);
4238 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004239 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004240
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004241 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004242 Py_XDECREF(errorHandler);
4243 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004244 Py_XDECREF(v);
4245 return NULL;
4246}
4247
Martin v. Löwis3f767792006-06-04 19:36:28 +00004248/* Charmap encoding: the lookup table */
4249
4250struct encoding_map{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004251 PyObject_HEAD
4252 unsigned char level1[32];
4253 int count2, count3;
4254 unsigned char level23[1];
Martin v. Löwis3f767792006-06-04 19:36:28 +00004255};
4256
4257static PyObject*
4258encoding_map_size(PyObject *obj, PyObject* args)
4259{
4260 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004261 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Martin v. Löwis3f767792006-06-04 19:36:28 +00004262 128*map->count3);
4263}
4264
4265static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004266 {"size", encoding_map_size, METH_NOARGS,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004267 PyDoc_STR("Return the size (in bytes) of this object") },
4268 { 0 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004269};
4270
4271static void
4272encoding_map_dealloc(PyObject* o)
4273{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004274 PyObject_FREE(o);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004275}
4276
4277static PyTypeObject EncodingMapType = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004278 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004279 "EncodingMap", /*tp_name*/
4280 sizeof(struct encoding_map), /*tp_basicsize*/
4281 0, /*tp_itemsize*/
4282 /* methods */
4283 encoding_map_dealloc, /*tp_dealloc*/
4284 0, /*tp_print*/
4285 0, /*tp_getattr*/
4286 0, /*tp_setattr*/
4287 0, /*tp_compare*/
4288 0, /*tp_repr*/
4289 0, /*tp_as_number*/
4290 0, /*tp_as_sequence*/
4291 0, /*tp_as_mapping*/
4292 0, /*tp_hash*/
4293 0, /*tp_call*/
4294 0, /*tp_str*/
4295 0, /*tp_getattro*/
4296 0, /*tp_setattro*/
4297 0, /*tp_as_buffer*/
4298 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4299 0, /*tp_doc*/
4300 0, /*tp_traverse*/
4301 0, /*tp_clear*/
4302 0, /*tp_richcompare*/
4303 0, /*tp_weaklistoffset*/
4304 0, /*tp_iter*/
4305 0, /*tp_iternext*/
4306 encoding_map_methods, /*tp_methods*/
4307 0, /*tp_members*/
4308 0, /*tp_getset*/
4309 0, /*tp_base*/
4310 0, /*tp_dict*/
4311 0, /*tp_descr_get*/
4312 0, /*tp_descr_set*/
4313 0, /*tp_dictoffset*/
4314 0, /*tp_init*/
4315 0, /*tp_alloc*/
4316 0, /*tp_new*/
4317 0, /*tp_free*/
4318 0, /*tp_is_gc*/
Martin v. Löwis3f767792006-06-04 19:36:28 +00004319};
4320
4321PyObject*
4322PyUnicode_BuildEncodingMap(PyObject* string)
4323{
4324 Py_UNICODE *decode;
4325 PyObject *result;
4326 struct encoding_map *mresult;
4327 int i;
4328 int need_dict = 0;
4329 unsigned char level1[32];
4330 unsigned char level2[512];
4331 unsigned char *mlevel1, *mlevel2, *mlevel3;
4332 int count2 = 0, count3 = 0;
4333
4334 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4335 PyErr_BadArgument();
4336 return NULL;
4337 }
4338 decode = PyUnicode_AS_UNICODE(string);
4339 memset(level1, 0xFF, sizeof level1);
4340 memset(level2, 0xFF, sizeof level2);
4341
4342 /* If there isn't a one-to-one mapping of NULL to \0,
4343 or if there are non-BMP characters, we need to use
4344 a mapping dictionary. */
4345 if (decode[0] != 0)
4346 need_dict = 1;
4347 for (i = 1; i < 256; i++) {
4348 int l1, l2;
4349 if (decode[i] == 0
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004350#ifdef Py_UNICODE_WIDE
Martin v. Löwis3f767792006-06-04 19:36:28 +00004351 || decode[i] > 0xFFFF
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004352#endif
4353 ) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004354 need_dict = 1;
4355 break;
4356 }
4357 if (decode[i] == 0xFFFE)
4358 /* unmapped character */
4359 continue;
4360 l1 = decode[i] >> 11;
4361 l2 = decode[i] >> 7;
4362 if (level1[l1] == 0xFF)
4363 level1[l1] = count2++;
4364 if (level2[l2] == 0xFF)
Benjamin Peterson857ce152009-01-31 16:29:18 +00004365 level2[l2] = count3++;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004366 }
4367
4368 if (count2 >= 0xFF || count3 >= 0xFF)
4369 need_dict = 1;
4370
4371 if (need_dict) {
4372 PyObject *result = PyDict_New();
4373 PyObject *key, *value;
4374 if (!result)
4375 return NULL;
4376 for (i = 0; i < 256; i++) {
Brett Cannona7f13ee2010-05-04 01:16:51 +00004377 value = NULL;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004378 key = PyInt_FromLong(decode[i]);
4379 value = PyInt_FromLong(i);
4380 if (!key || !value)
4381 goto failed1;
4382 if (PyDict_SetItem(result, key, value) == -1)
4383 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00004384 Py_DECREF(key);
4385 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004386 }
4387 return result;
4388 failed1:
4389 Py_XDECREF(key);
4390 Py_XDECREF(value);
4391 Py_DECREF(result);
4392 return NULL;
4393 }
4394
4395 /* Create a three-level trie */
4396 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4397 16*count2 + 128*count3 - 1);
4398 if (!result)
4399 return PyErr_NoMemory();
4400 PyObject_Init(result, &EncodingMapType);
4401 mresult = (struct encoding_map*)result;
4402 mresult->count2 = count2;
4403 mresult->count3 = count3;
4404 mlevel1 = mresult->level1;
4405 mlevel2 = mresult->level23;
4406 mlevel3 = mresult->level23 + 16*count2;
4407 memcpy(mlevel1, level1, 32);
4408 memset(mlevel2, 0xFF, 16*count2);
4409 memset(mlevel3, 0, 128*count3);
4410 count3 = 0;
4411 for (i = 1; i < 256; i++) {
4412 int o1, o2, o3, i2, i3;
4413 if (decode[i] == 0xFFFE)
4414 /* unmapped character */
4415 continue;
4416 o1 = decode[i]>>11;
4417 o2 = (decode[i]>>7) & 0xF;
4418 i2 = 16*mlevel1[o1] + o2;
4419 if (mlevel2[i2] == 0xFF)
4420 mlevel2[i2] = count3++;
4421 o3 = decode[i] & 0x7F;
4422 i3 = 128*mlevel2[i2] + o3;
4423 mlevel3[i3] = i;
4424 }
4425 return result;
4426}
4427
4428static int
4429encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4430{
4431 struct encoding_map *map = (struct encoding_map*)mapping;
4432 int l1 = c>>11;
4433 int l2 = (c>>7) & 0xF;
4434 int l3 = c & 0x7F;
4435 int i;
4436
4437#ifdef Py_UNICODE_WIDE
4438 if (c > 0xFFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004439 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004440 }
4441#endif
4442 if (c == 0)
4443 return 0;
4444 /* level 1*/
4445 i = map->level1[l1];
4446 if (i == 0xFF) {
4447 return -1;
4448 }
4449 /* level 2*/
4450 i = map->level23[16*i+l2];
4451 if (i == 0xFF) {
4452 return -1;
4453 }
4454 /* level 3 */
4455 i = map->level23[16*map->count2 + 128*i + l3];
4456 if (i == 0) {
4457 return -1;
4458 }
4459 return i;
4460}
4461
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004462/* Lookup the character ch in the mapping. If the character
4463 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004464 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004465static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004466{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004467 PyObject *w = PyInt_FromLong((long)c);
4468 PyObject *x;
4469
4470 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004471 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004472 x = PyObject_GetItem(mapping, w);
4473 Py_DECREF(w);
4474 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004475 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4476 /* No mapping found means: mapping is undefined. */
4477 PyErr_Clear();
4478 x = Py_None;
4479 Py_INCREF(x);
4480 return x;
4481 } else
4482 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004483 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004484 else if (x == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004485 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004486 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004487 long value = PyInt_AS_LONG(x);
4488 if (value < 0 || value > 255) {
4489 PyErr_SetString(PyExc_TypeError,
4490 "character mapping must be in range(256)");
4491 Py_DECREF(x);
4492 return NULL;
4493 }
4494 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004495 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004496 else if (PyString_Check(x))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004497 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004498 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004499 /* wrong return value */
4500 PyErr_SetString(PyExc_TypeError,
4501 "character mapping must return integer, None or str");
4502 Py_DECREF(x);
4503 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004504 }
4505}
4506
Martin v. Löwis3f767792006-06-04 19:36:28 +00004507static int
4508charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4509{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004510 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4511 /* exponentially overallocate to minimize reallocations */
4512 if (requiredsize < 2*outsize)
4513 requiredsize = 2*outsize;
4514 if (_PyString_Resize(outobj, requiredsize)) {
4515 return 0;
4516 }
4517 return 1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004518}
4519
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred (lookup or resize failed) */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004523/* lookup the character, put the result in the output string and adjust
4524 various state variables. Reallocate the output string if not enough
4525 space is available. Return a new reference to the object that
4526 was put in the output buffer, or Py_None, if the mapping was undefined
4527 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004528 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004529static
Martin v. Löwis3f767792006-06-04 19:36:28 +00004530charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004531 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004532{
Martin v. Löwis3f767792006-06-04 19:36:28 +00004533 PyObject *rep;
4534 char *outstart;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004535 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004536
Christian Heimese93237d2007-12-19 02:37:44 +00004537 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004538 int res = encoding_map_lookup(c, mapping);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004539 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004540 if (res == -1)
4541 return enc_FAILED;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004542 if (outsize<requiredsize)
4543 if (!charmapencode_resize(outobj, outpos, requiredsize))
4544 return enc_EXCEPTION;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004545 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004546 outstart[(*outpos)++] = (char)res;
4547 return enc_SUCCESS;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004548 }
4549
4550 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004551 if (rep==NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004552 return enc_EXCEPTION;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004553 else if (rep==Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004554 Py_DECREF(rep);
4555 return enc_FAILED;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004556 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004557 if (PyInt_Check(rep)) {
4558 Py_ssize_t requiredsize = *outpos+1;
4559 if (outsize<requiredsize)
4560 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4561 Py_DECREF(rep);
4562 return enc_EXCEPTION;
4563 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004564 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004565 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004566 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004567 else {
4568 const char *repchars = PyString_AS_STRING(rep);
4569 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4570 Py_ssize_t requiredsize = *outpos+repsize;
4571 if (outsize<requiredsize)
4572 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4573 Py_DECREF(rep);
4574 return enc_EXCEPTION;
4575 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004576 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004577 memcpy(outstart + *outpos, repchars, repsize);
4578 *outpos += repsize;
4579 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004580 }
Georg Brandl9f167602006-06-04 21:46:16 +00004581 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004582 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004583}
4584
4585/* handle an error in PyUnicode_EncodeCharmap
4586 Return 0 on success, -1 on error */
4587static
4588int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004589 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004590 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004591 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004592 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004593{
4594 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004595 Py_ssize_t repsize;
4596 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004597 Py_UNICODE *uni2;
4598 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004599 Py_ssize_t collstartpos = *inpos;
4600 Py_ssize_t collendpos = *inpos+1;
4601 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004602 char *encoding = "charmap";
4603 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00004604 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004605
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004606 /* find all unencodable characters */
4607 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004608 PyObject *rep;
Christian Heimese93237d2007-12-19 02:37:44 +00004609 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004610 int res = encoding_map_lookup(p[collendpos], mapping);
4611 if (res != -1)
4612 break;
4613 ++collendpos;
4614 continue;
4615 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004616
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004617 rep = charmapencode_lookup(p[collendpos], mapping);
4618 if (rep==NULL)
4619 return -1;
4620 else if (rep!=Py_None) {
4621 Py_DECREF(rep);
4622 break;
4623 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004624 Py_DECREF(rep);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004625 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004626 }
4627 /* cache callback name lookup
4628 * (if not done yet, i.e. it's the first error) */
4629 if (*known_errorHandler==-1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004630 if ((errors==NULL) || (!strcmp(errors, "strict")))
4631 *known_errorHandler = 1;
4632 else if (!strcmp(errors, "replace"))
4633 *known_errorHandler = 2;
4634 else if (!strcmp(errors, "ignore"))
4635 *known_errorHandler = 3;
4636 else if (!strcmp(errors, "xmlcharrefreplace"))
4637 *known_errorHandler = 4;
4638 else
4639 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004640 }
4641 switch (*known_errorHandler) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004642 case 1: /* strict */
4643 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4644 return -1;
4645 case 2: /* replace */
4646 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004647 x = charmapencode_output('?', mapping, res, respos);
4648 if (x==enc_EXCEPTION) {
4649 return -1;
4650 }
4651 else if (x==enc_FAILED) {
4652 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4653 return -1;
4654 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004655 }
4656 /* fall through */
4657 case 3: /* ignore */
4658 *inpos = collendpos;
4659 break;
4660 case 4: /* xmlcharrefreplace */
4661 /* generate replacement (temporarily (mis)uses p) */
4662 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004663 char buffer[2+29+1+1];
4664 char *cp;
4665 sprintf(buffer, "&#%d;", (int)p[collpos]);
4666 for (cp = buffer; *cp; ++cp) {
4667 x = charmapencode_output(*cp, mapping, res, respos);
4668 if (x==enc_EXCEPTION)
4669 return -1;
4670 else if (x==enc_FAILED) {
4671 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4672 return -1;
4673 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004674 }
4675 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004676 *inpos = collendpos;
4677 break;
4678 default:
4679 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004680 encoding, reason, p, size, exceptionObject,
4681 collstartpos, collendpos, &newpos);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004682 if (repunicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004683 return -1;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004684 /* generate replacement */
4685 repsize = PyUnicode_GET_SIZE(repunicode);
4686 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004687 x = charmapencode_output(*uni2, mapping, res, respos);
4688 if (x==enc_EXCEPTION) {
4689 return -1;
4690 }
4691 else if (x==enc_FAILED) {
4692 Py_DECREF(repunicode);
4693 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4694 return -1;
4695 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004696 }
4697 *inpos = newpos;
4698 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004699 }
4700 return 0;
4701}
4702
Guido van Rossumd57fd912000-03-10 22:53:23 +00004703PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004704 Py_ssize_t size,
4705 PyObject *mapping,
4706 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004707{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004708 /* output object */
4709 PyObject *res = NULL;
4710 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004711 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004712 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004713 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004714 PyObject *errorHandler = NULL;
4715 PyObject *exc = NULL;
4716 /* the following variable is used for caching string comparisons
4717 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4718 * 3=ignore, 4=xmlcharrefreplace */
4719 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004720
4721 /* Default to Latin-1 */
4722 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004723 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004724
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004725 /* allocate enough for a simple encoding without
4726 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004727 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004728 if (res == NULL)
4729 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004730 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004731 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004732
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004733 while (inpos<size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004734 /* try to encode it */
4735 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4736 if (x==enc_EXCEPTION) /* error */
4737 goto onError;
4738 if (x==enc_FAILED) { /* unencodable character */
4739 if (charmap_encoding_error(p, size, &inpos, mapping,
4740 &exc,
4741 &known_errorHandler, &errorHandler, errors,
4742 &res, &respos)) {
4743 goto onError;
4744 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004745 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004746 else
4747 /* done with this character => adjust input position */
4748 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004749 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004750
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004751 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004752 if (respos<PyString_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004753 if (_PyString_Resize(&res, respos))
4754 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004755 }
4756 Py_XDECREF(exc);
4757 Py_XDECREF(errorHandler);
4758 return res;
4759
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004760 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004761 Py_XDECREF(res);
4762 Py_XDECREF(exc);
4763 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004764 return NULL;
4765}
4766
4767PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004768 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004769{
4770 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004771 PyErr_BadArgument();
4772 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004773 }
4774 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004775 PyUnicode_GET_SIZE(unicode),
4776 mapping,
4777 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004778}
4779
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004780/* create or adjust a UnicodeTranslateError */
4781static void make_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004782 const Py_UNICODE *unicode, Py_ssize_t size,
4783 Py_ssize_t startpos, Py_ssize_t endpos,
4784 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004785{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004786 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004787 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004788 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004789 }
4790 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004791 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4792 goto onError;
4793 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4794 goto onError;
4795 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4796 goto onError;
4797 return;
4798 onError:
4799 Py_DECREF(*exceptionObject);
4800 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004801 }
4802}
4803
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004804/* raises a UnicodeTranslateError */
4805static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004806 const Py_UNICODE *unicode, Py_ssize_t size,
4807 Py_ssize_t startpos, Py_ssize_t endpos,
4808 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004809{
4810 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004811 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004812 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004813 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004814}
4815
4816/* error handling callback helper:
4817 build arguments, call the callback and check the arguments,
4818 put the result into newpos and return the replacement string, which
4819 has to be freed by the caller */
4820static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004821 PyObject **errorHandler,
4822 const char *reason,
4823 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4824 Py_ssize_t startpos, Py_ssize_t endpos,
4825 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004826{
Martin v. Löwis412fb672006-04-13 06:34:32 +00004827 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004828
Martin v. Löwis412fb672006-04-13 06:34:32 +00004829 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004830 PyObject *restuple;
4831 PyObject *resunicode;
4832
4833 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004834 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004835 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004836 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004837 }
4838
4839 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004840 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004841 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004842 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004843
4844 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004845 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004846 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004847 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004848 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00004849 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004850 Py_DECREF(restuple);
4851 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004852 }
4853 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004854 &resunicode, &i_newpos)) {
4855 Py_DECREF(restuple);
4856 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004857 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004858 if (i_newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004859 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004860 else
4861 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004862 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004863 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4864 Py_DECREF(restuple);
4865 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004866 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004867 Py_INCREF(resunicode);
4868 Py_DECREF(restuple);
4869 return resunicode;
4870}
4871
4872/* Lookup the character ch in the mapping and put the result in result,
4873 which must be decrefed by the caller.
4874 Return 0 on success, -1 on error */
4875static
4876int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4877{
4878 PyObject *w = PyInt_FromLong((long)c);
4879 PyObject *x;
4880
4881 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004882 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004883 x = PyObject_GetItem(mapping, w);
4884 Py_DECREF(w);
4885 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004886 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4887 /* No mapping found means: use 1:1 mapping. */
4888 PyErr_Clear();
4889 *result = NULL;
4890 return 0;
4891 } else
4892 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004893 }
4894 else if (x == Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004895 *result = x;
4896 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004897 }
4898 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004899 long value = PyInt_AS_LONG(x);
4900 long max = PyUnicode_GetMax();
4901 if (value < 0 || value > max) {
4902 PyErr_Format(PyExc_TypeError,
4903 "character mapping must be in range(0x%lx)", max+1);
4904 Py_DECREF(x);
4905 return -1;
4906 }
4907 *result = x;
4908 return 0;
4909 }
4910 else if (PyUnicode_Check(x)) {
4911 *result = x;
4912 return 0;
4913 }
4914 else {
4915 /* wrong return value */
4916 PyErr_SetString(PyExc_TypeError,
4917 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004918 Py_DECREF(x);
4919 return -1;
4920 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004921}
4922/* ensure that *outobj is at least requiredsize characters long,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004923 if not reallocate and adjust various state variables.
4924 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004925static
Walter Dörwald4894c302003-10-24 14:25:28 +00004926int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004927 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004928{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004929 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004930 if (requiredsize > oldsize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004931 /* remember old output position */
4932 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4933 /* exponentially overallocate to minimize reallocations */
4934 if (requiredsize < 2 * oldsize)
4935 requiredsize = 2 * oldsize;
4936 if (PyUnicode_Resize(outobj, requiredsize) < 0)
4937 return -1;
4938 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004939 }
4940 return 0;
4941}
4942/* lookup the character, put the result in the output string and adjust
4943 various state variables. Return a new reference to the object that
4944 was put in the output buffer in *result, or Py_None, if the mapping was
4945 undefined (in which case no character was written).
4946 The called must decref result.
4947 Return 0 on success, -1 on error. */
4948static
Walter Dörwald4894c302003-10-24 14:25:28 +00004949int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004950 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4951 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004952{
Walter Dörwald4894c302003-10-24 14:25:28 +00004953 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004954 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004955 if (*res==NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004956 /* not found => default to 1:1 mapping */
4957 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004958 }
4959 else if (*res==Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004960 ;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004961 else if (PyInt_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004962 /* no overflow check, because we know that the space is enough */
4963 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004964 }
4965 else if (PyUnicode_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004966 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4967 if (repsize==1) {
4968 /* no overflow check, because we know that the space is enough */
4969 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4970 }
4971 else if (repsize!=0) {
4972 /* more than one character */
4973 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
4974 (insize - (curinp-startinp)) +
4975 repsize - 1;
4976 if (charmaptranslate_makespace(outobj, outp, requiredsize))
4977 return -1;
4978 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4979 *outp += repsize;
4980 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004981 }
4982 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004983 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004984 return 0;
4985}
4986
4987PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004988 Py_ssize_t size,
4989 PyObject *mapping,
4990 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004991{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004992 /* output object */
4993 PyObject *res = NULL;
4994 /* pointers to the beginning and end+1 of input */
4995 const Py_UNICODE *startp = p;
4996 const Py_UNICODE *endp = p + size;
4997 /* pointer into the output */
4998 Py_UNICODE *str;
4999 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005000 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005001 char *reason = "character maps to <undefined>";
5002 PyObject *errorHandler = NULL;
5003 PyObject *exc = NULL;
5004 /* the following variable is used for caching string comparisons
5005 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5006 * 3=ignore, 4=xmlcharrefreplace */
5007 int known_errorHandler = -1;
5008
Guido van Rossumd57fd912000-03-10 22:53:23 +00005009 if (mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005010 PyErr_BadArgument();
5011 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005012 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005013
5014 /* allocate enough for a simple 1:1 translation without
5015 replacements, if we need more, we'll resize */
5016 res = PyUnicode_FromUnicode(NULL, size);
5017 if (res == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005018 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005019 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005020 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005021 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005022
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005023 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005024 /* try to encode it */
5025 PyObject *x = NULL;
5026 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5027 Py_XDECREF(x);
5028 goto onError;
5029 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005030 Py_XDECREF(x);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005031 if (x!=Py_None) /* it worked => adjust input pointer */
5032 ++p;
5033 else { /* untranslatable character */
5034 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5035 Py_ssize_t repsize;
5036 Py_ssize_t newpos;
5037 Py_UNICODE *uni2;
5038 /* startpos for collecting untranslatable chars */
5039 const Py_UNICODE *collstart = p;
5040 const Py_UNICODE *collend = p+1;
5041 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005042
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005043 /* find all untranslatable characters */
5044 while (collend < endp) {
5045 if (charmaptranslate_lookup(*collend, mapping, &x))
5046 goto onError;
5047 Py_XDECREF(x);
5048 if (x!=Py_None)
5049 break;
5050 ++collend;
5051 }
5052 /* cache callback name lookup
5053 * (if not done yet, i.e. it's the first error) */
5054 if (known_errorHandler==-1) {
5055 if ((errors==NULL) || (!strcmp(errors, "strict")))
5056 known_errorHandler = 1;
5057 else if (!strcmp(errors, "replace"))
5058 known_errorHandler = 2;
5059 else if (!strcmp(errors, "ignore"))
5060 known_errorHandler = 3;
5061 else if (!strcmp(errors, "xmlcharrefreplace"))
5062 known_errorHandler = 4;
5063 else
5064 known_errorHandler = 0;
5065 }
5066 switch (known_errorHandler) {
5067 case 1: /* strict */
5068 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson857ce152009-01-31 16:29:18 +00005069 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005070 case 2: /* replace */
5071 /* No need to check for space, this is a 1:1 replacement */
5072 for (coll = collstart; coll<collend; ++coll)
5073 *str++ = '?';
5074 /* fall through */
5075 case 3: /* ignore */
5076 p = collend;
5077 break;
5078 case 4: /* xmlcharrefreplace */
5079 /* generate replacement (temporarily (mis)uses p) */
5080 for (p = collstart; p < collend; ++p) {
5081 char buffer[2+29+1+1];
5082 char *cp;
5083 sprintf(buffer, "&#%d;", (int)*p);
5084 if (charmaptranslate_makespace(&res, &str,
5085 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5086 goto onError;
5087 for (cp = buffer; *cp; ++cp)
5088 *str++ = *cp;
5089 }
5090 p = collend;
5091 break;
5092 default:
5093 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5094 reason, startp, size, &exc,
5095 collstart-startp, collend-startp, &newpos);
5096 if (repunicode == NULL)
5097 goto onError;
5098 /* generate replacement */
5099 repsize = PyUnicode_GET_SIZE(repunicode);
5100 if (charmaptranslate_makespace(&res, &str,
5101 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5102 Py_DECREF(repunicode);
5103 goto onError;
5104 }
5105 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5106 *str++ = *uni2;
5107 p = startp + newpos;
5108 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00005109 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005110 }
5111 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005112 /* Resize if we allocated to much */
5113 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005114 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005115 if (PyUnicode_Resize(&res, respos) < 0)
5116 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005117 }
5118 Py_XDECREF(exc);
5119 Py_XDECREF(errorHandler);
5120 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005121
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005122 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005123 Py_XDECREF(res);
5124 Py_XDECREF(exc);
5125 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005126 return NULL;
5127}
5128
5129PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005130 PyObject *mapping,
5131 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005132{
5133 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005134
Guido van Rossumd57fd912000-03-10 22:53:23 +00005135 str = PyUnicode_FromObject(str);
5136 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005137 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005138 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005139 PyUnicode_GET_SIZE(str),
5140 mapping,
5141 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005142 Py_DECREF(str);
5143 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005144
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005145 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005146 Py_XDECREF(str);
5147 return NULL;
5148}
Tim Petersced69f82003-09-16 20:30:58 +00005149
Guido van Rossum9e896b32000-04-05 20:11:21 +00005150/* --- Decimal Encoder ---------------------------------------------------- */
5151
5152int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005153 Py_ssize_t length,
5154 char *output,
5155 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005156{
5157 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005158 PyObject *errorHandler = NULL;
5159 PyObject *exc = NULL;
5160 const char *encoding = "decimal";
5161 const char *reason = "invalid decimal Unicode string";
5162 /* the following variable is used for caching string comparisons
5163 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5164 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005165
5166 if (output == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005167 PyErr_BadArgument();
5168 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005169 }
5170
5171 p = s;
5172 end = s + length;
5173 while (p < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005174 register Py_UNICODE ch = *p;
5175 int decimal;
5176 PyObject *repunicode;
5177 Py_ssize_t repsize;
5178 Py_ssize_t newpos;
5179 Py_UNICODE *uni2;
5180 Py_UNICODE *collstart;
5181 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005182
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005183 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005184 *output++ = ' ';
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005185 ++p;
5186 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005187 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005188 decimal = Py_UNICODE_TODECIMAL(ch);
5189 if (decimal >= 0) {
5190 *output++ = '0' + decimal;
5191 ++p;
5192 continue;
5193 }
5194 if (0 < ch && ch < 256) {
5195 *output++ = (char)ch;
5196 ++p;
5197 continue;
5198 }
5199 /* All other characters are considered unencodable */
5200 collstart = p;
Victor Stinner975134e2011-11-22 01:54:19 +01005201 for (collend = p+1; collend < end; collend++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005202 if ((0 < *collend && *collend < 256) ||
Victor Stinner975134e2011-11-22 01:54:19 +01005203 Py_UNICODE_ISSPACE(*collend) ||
5204 0 <= Py_UNICODE_TODECIMAL(*collend))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005205 break;
5206 }
5207 /* cache callback name lookup
5208 * (if not done yet, i.e. it's the first error) */
5209 if (known_errorHandler==-1) {
5210 if ((errors==NULL) || (!strcmp(errors, "strict")))
5211 known_errorHandler = 1;
5212 else if (!strcmp(errors, "replace"))
5213 known_errorHandler = 2;
5214 else if (!strcmp(errors, "ignore"))
5215 known_errorHandler = 3;
5216 else if (!strcmp(errors, "xmlcharrefreplace"))
5217 known_errorHandler = 4;
5218 else
5219 known_errorHandler = 0;
5220 }
5221 switch (known_errorHandler) {
5222 case 1: /* strict */
5223 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5224 goto onError;
5225 case 2: /* replace */
5226 for (p = collstart; p < collend; ++p)
5227 *output++ = '?';
5228 /* fall through */
5229 case 3: /* ignore */
5230 p = collend;
5231 break;
5232 case 4: /* xmlcharrefreplace */
5233 /* generate replacement (temporarily (mis)uses p) */
5234 for (p = collstart; p < collend; ++p)
5235 output += sprintf(output, "&#%d;", (int)*p);
5236 p = collend;
5237 break;
5238 default:
5239 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5240 encoding, reason, s, length, &exc,
5241 collstart-s, collend-s, &newpos);
5242 if (repunicode == NULL)
5243 goto onError;
5244 /* generate replacement */
5245 repsize = PyUnicode_GET_SIZE(repunicode);
5246 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5247 Py_UNICODE ch = *uni2;
5248 if (Py_UNICODE_ISSPACE(ch))
5249 *output++ = ' ';
5250 else {
5251 decimal = Py_UNICODE_TODECIMAL(ch);
5252 if (decimal >= 0)
5253 *output++ = '0' + decimal;
5254 else if (0 < ch && ch < 256)
5255 *output++ = (char)ch;
5256 else {
5257 Py_DECREF(repunicode);
5258 raise_encode_exception(&exc, encoding,
5259 s, length, collstart-s, collend-s, reason);
5260 goto onError;
5261 }
5262 }
5263 }
5264 p = s + newpos;
5265 Py_DECREF(repunicode);
5266 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005267 }
5268 /* 0-terminate the output string */
5269 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005270 Py_XDECREF(exc);
5271 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005272 return 0;
5273
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005274 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005275 Py_XDECREF(exc);
5276 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005277 return -1;
5278}
5279
Guido van Rossumd57fd912000-03-10 22:53:23 +00005280/* --- Helpers ------------------------------------------------------------ */
5281
Eric Smitha9f7d622008-02-17 19:46:49 +00005282#include "stringlib/unicodedefs.h"
Fredrik Lundha50d2012006-05-26 17:04:58 +00005283#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005284
5285#include "stringlib/count.h"
5286#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005287#include "stringlib/partition.h"
Antoine Pitrou64672132010-01-13 07:55:48 +00005288#include "stringlib/split.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005289
/* Helper macro to fix up start/end slice values in place.

   `start` and `end` must be modifiable lvalues; `len` is the length of
   the sequence being sliced.  After the macro has run:
     - an `end` greater than `len` is clipped to `len`; a negative `end`
       is taken relative to `len` and clipped at 0,
     - a negative `start` is taken relative to `len` and clipped at 0.
   A `start` greater than `len` is left untouched.

   The body is wrapped in do { ... } while (0) so the macro expands to a
   single statement and is safe inside an unbraced if/else; every use
   must be followed by a semicolon.  The arguments are parenthesized but
   evaluated more than once, so they must not have side effects. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Fredrik Lundhc8162812006-05-26 19:33:03 +00005304
Martin v. Löwis18e16552006-02-15 17:27:45 +00005305Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005306 PyObject *substr,
5307 Py_ssize_t start,
5308 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005309{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005310 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005311 PyUnicodeObject* str_obj;
5312 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005313
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005314 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5315 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005316 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005317 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5318 if (!sub_obj) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005319 Py_DECREF(str_obj);
5320 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005321 }
Tim Petersced69f82003-09-16 20:30:58 +00005322
Antoine Pitrou64672132010-01-13 07:55:48 +00005323 ADJUST_INDICES(start, end, str_obj->length);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005324 result = stringlib_count(
Antoine Pitrou64672132010-01-13 07:55:48 +00005325 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
5326 PY_SSIZE_T_MAX
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005327 );
5328
5329 Py_DECREF(sub_obj);
5330 Py_DECREF(str_obj);
5331
Guido van Rossumd57fd912000-03-10 22:53:23 +00005332 return result;
5333}
5334
Martin v. Löwis18e16552006-02-15 17:27:45 +00005335Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005336 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005337 Py_ssize_t start,
5338 Py_ssize_t end,
5339 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005340{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005341 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005342
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005343 str = PyUnicode_FromObject(str);
5344 if (!str)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005345 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005346 sub = PyUnicode_FromObject(sub);
5347 if (!sub) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005348 Py_DECREF(str);
5349 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005350 }
Tim Petersced69f82003-09-16 20:30:58 +00005351
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005352 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005353 result = stringlib_find_slice(
5354 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5355 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5356 start, end
5357 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005358 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005359 result = stringlib_rfind_slice(
5360 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5361 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5362 start, end
5363 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005364
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005365 Py_DECREF(str);
5366 Py_DECREF(sub);
5367
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368 return result;
5369}
5370
Tim Petersced69f82003-09-16 20:30:58 +00005371static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005372int tailmatch(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005373 PyUnicodeObject *substring,
5374 Py_ssize_t start,
5375 Py_ssize_t end,
5376 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005377{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005378 if (substring->length == 0)
5379 return 1;
5380
Antoine Pitrou64672132010-01-13 07:55:48 +00005381 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005382 end -= substring->length;
5383 if (end < start)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005384 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005385
5386 if (direction > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005387 if (Py_UNICODE_MATCH(self, end, substring))
5388 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005389 } else {
5390 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005391 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005392 }
5393
5394 return 0;
5395}
5396
Martin v. Löwis18e16552006-02-15 17:27:45 +00005397Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005398 PyObject *substr,
5399 Py_ssize_t start,
5400 Py_ssize_t end,
5401 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005402{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005403 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005404
Guido van Rossumd57fd912000-03-10 22:53:23 +00005405 str = PyUnicode_FromObject(str);
5406 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005407 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005408 substr = PyUnicode_FromObject(substr);
5409 if (substr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005410 Py_DECREF(str);
5411 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005412 }
Tim Petersced69f82003-09-16 20:30:58 +00005413
Guido van Rossumd57fd912000-03-10 22:53:23 +00005414 result = tailmatch((PyUnicodeObject *)str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005415 (PyUnicodeObject *)substr,
5416 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005417 Py_DECREF(str);
5418 Py_DECREF(substr);
5419 return result;
5420}
5421
Guido van Rossumd57fd912000-03-10 22:53:23 +00005422/* Apply fixfct filter to the Unicode object self and return a
5423 reference to the modified object */
5424
Tim Petersced69f82003-09-16 20:30:58 +00005425static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005426PyObject *fixup(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005427 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005428{
5429
5430 PyUnicodeObject *u;
5431
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005432 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005433 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005434 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005435
5436 Py_UNICODE_COPY(u->str, self->str, self->length);
5437
Tim Peters7a29bd52001-09-12 03:03:31 +00005438 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005439 /* fixfct should return TRUE if it modified the buffer. If
5440 FALSE, return a reference to the original buffer instead
5441 (to save space, not time) */
5442 Py_INCREF(self);
5443 Py_DECREF(u);
5444 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005445 }
5446 return (PyObject*) u;
5447}
5448
Tim Petersced69f82003-09-16 20:30:58 +00005449static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005450int fixupper(PyUnicodeObject *self)
5451{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005452 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005453 Py_UNICODE *s = self->str;
5454 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005455
Guido van Rossumd57fd912000-03-10 22:53:23 +00005456 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005457 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005458
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005459 ch = Py_UNICODE_TOUPPER(*s);
5460 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005461 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005462 *s = ch;
5463 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005464 s++;
5465 }
5466
5467 return status;
5468}
5469
Tim Petersced69f82003-09-16 20:30:58 +00005470static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005471int fixlower(PyUnicodeObject *self)
5472{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005473 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005474 Py_UNICODE *s = self->str;
5475 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005476
Guido van Rossumd57fd912000-03-10 22:53:23 +00005477 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005478 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005479
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005480 ch = Py_UNICODE_TOLOWER(*s);
5481 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005482 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005483 *s = ch;
5484 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005485 s++;
5486 }
5487
5488 return status;
5489}
5490
Tim Petersced69f82003-09-16 20:30:58 +00005491static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005492int fixswapcase(PyUnicodeObject *self)
5493{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005494 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005495 Py_UNICODE *s = self->str;
5496 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005497
Guido van Rossumd57fd912000-03-10 22:53:23 +00005498 while (len-- > 0) {
5499 if (Py_UNICODE_ISUPPER(*s)) {
5500 *s = Py_UNICODE_TOLOWER(*s);
5501 status = 1;
5502 } else if (Py_UNICODE_ISLOWER(*s)) {
5503 *s = Py_UNICODE_TOUPPER(*s);
5504 status = 1;
5505 }
5506 s++;
5507 }
5508
5509 return status;
5510}
5511
Tim Petersced69f82003-09-16 20:30:58 +00005512static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005513int fixcapitalize(PyUnicodeObject *self)
5514{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005515 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005516 Py_UNICODE *s = self->str;
5517 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005518
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005519 if (len == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005520 return 0;
Ezio Melotti15d6b652011-08-15 09:22:24 +03005521 if (!Py_UNICODE_ISUPPER(*s)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005522 *s = Py_UNICODE_TOUPPER(*s);
5523 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005524 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005525 s++;
5526 while (--len > 0) {
Ezio Melotti15d6b652011-08-15 09:22:24 +03005527 if (!Py_UNICODE_ISLOWER(*s)) {
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005528 *s = Py_UNICODE_TOLOWER(*s);
5529 status = 1;
5530 }
5531 s++;
5532 }
5533 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005534}
5535
5536static
5537int fixtitle(PyUnicodeObject *self)
5538{
5539 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5540 register Py_UNICODE *e;
5541 int previous_is_cased;
5542
5543 /* Shortcut for single character strings */
5544 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005545 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5546 if (*p != ch) {
5547 *p = ch;
5548 return 1;
5549 }
5550 else
5551 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005552 }
Tim Petersced69f82003-09-16 20:30:58 +00005553
Guido van Rossumd57fd912000-03-10 22:53:23 +00005554 e = p + PyUnicode_GET_SIZE(self);
5555 previous_is_cased = 0;
5556 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005557 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005558
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005559 if (previous_is_cased)
5560 *p = Py_UNICODE_TOLOWER(ch);
5561 else
5562 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005563
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005564 if (Py_UNICODE_ISLOWER(ch) ||
5565 Py_UNICODE_ISUPPER(ch) ||
5566 Py_UNICODE_ISTITLE(ch))
5567 previous_is_cased = 1;
5568 else
5569 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005570 }
5571 return 1;
5572}
5573
Tim Peters8ce9f162004-08-27 01:49:32 +00005574PyObject *
5575PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005576{
Tim Peters8ce9f162004-08-27 01:49:32 +00005577 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005578 const Py_UNICODE blank = ' ';
5579 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00005580 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005581 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00005582 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5583 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005584 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5585 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005586 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005587 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005588 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005589
Tim Peters05eba1f2004-08-27 21:32:02 +00005590 fseq = PySequence_Fast(seq, "");
5591 if (fseq == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005592 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005593 }
5594
Tim Peters91879ab2004-08-27 22:35:44 +00005595 /* Grrrr. A codec may be invoked to convert str objects to
5596 * Unicode, and so it's possible to call back into Python code
5597 * during PyUnicode_FromObject(), and so it's possible for a sick
5598 * codec to change the size of fseq (if seq is a list). Therefore
5599 * we have to keep refetching the size -- can't assume seqlen
5600 * is invariant.
5601 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005602 seqlen = PySequence_Fast_GET_SIZE(fseq);
5603 /* If empty sequence, return u"". */
5604 if (seqlen == 0) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005605 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5606 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00005607 }
5608 /* If singleton sequence with an exact Unicode, return that. */
5609 if (seqlen == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005610 item = PySequence_Fast_GET_ITEM(fseq, 0);
5611 if (PyUnicode_CheckExact(item)) {
5612 Py_INCREF(item);
5613 res = (PyUnicodeObject *)item;
5614 goto Done;
5615 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005616 }
5617
Tim Peters05eba1f2004-08-27 21:32:02 +00005618 /* At least two items to join, or one that isn't exact Unicode. */
5619 if (seqlen > 1) {
5620 /* Set up sep and seplen -- they're needed. */
Benjamin Peterson857ce152009-01-31 16:29:18 +00005621 if (separator == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005622 sep = &blank;
5623 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005624 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005625 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005626 internal_separator = PyUnicode_FromObject(separator);
5627 if (internal_separator == NULL)
5628 goto onError;
5629 sep = PyUnicode_AS_UNICODE(internal_separator);
5630 seplen = PyUnicode_GET_SIZE(internal_separator);
5631 /* In case PyUnicode_FromObject() mutated seq. */
5632 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005633 }
5634 }
5635
5636 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005637 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005638 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005639 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005640 res_p = PyUnicode_AS_UNICODE(res);
5641 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005642
Tim Peters05eba1f2004-08-27 21:32:02 +00005643 for (i = 0; i < seqlen; ++i) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005644 Py_ssize_t itemlen;
5645 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005646
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005647 item = PySequence_Fast_GET_ITEM(fseq, i);
5648 /* Convert item to Unicode. */
5649 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5650 PyErr_Format(PyExc_TypeError,
5651 "sequence item %zd: expected string or Unicode,"
5652 " %.80s found",
5653 i, Py_TYPE(item)->tp_name);
5654 goto onError;
5655 }
5656 item = PyUnicode_FromObject(item);
5657 if (item == NULL)
5658 goto onError;
5659 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005660
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005661 /* In case PyUnicode_FromObject() mutated seq. */
5662 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters91879ab2004-08-27 22:35:44 +00005663
Tim Peters8ce9f162004-08-27 01:49:32 +00005664 /* Make sure we have enough space for the separator and the item. */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005665 itemlen = PyUnicode_GET_SIZE(item);
5666 new_res_used = res_used + itemlen;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005667 if (new_res_used < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005668 goto Overflow;
5669 if (i < seqlen - 1) {
5670 new_res_used += seplen;
5671 if (new_res_used < 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00005672 goto Overflow;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005673 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005674 if (new_res_used > res_alloc) {
5675 /* double allocated size until it's big enough */
5676 do {
5677 res_alloc += res_alloc;
5678 if (res_alloc <= 0)
5679 goto Overflow;
5680 } while (new_res_used > res_alloc);
5681 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5682 Py_DECREF(item);
5683 goto onError;
5684 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005685 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005686 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005687
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005688 /* Copy item, and maybe the separator. */
5689 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5690 res_p += itemlen;
5691 if (i < seqlen - 1) {
5692 Py_UNICODE_COPY(res_p, sep, seplen);
5693 res_p += seplen;
5694 }
5695 Py_DECREF(item);
5696 res_used = new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005697 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005698
Tim Peters05eba1f2004-08-27 21:32:02 +00005699 /* Shrink res to match the used area; this probably can't fail,
5700 * but it's cheap to check.
5701 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005702 if (_PyUnicode_Resize(&res, res_used) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005703 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005704
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005705 Done:
Tim Peters8ce9f162004-08-27 01:49:32 +00005706 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005707 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005708 return (PyObject *)res;
5709
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005710 Overflow:
Tim Peters8ce9f162004-08-27 01:49:32 +00005711 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00005712 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005713 Py_DECREF(item);
5714 /* fall through */
5715
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005716 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005717 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005718 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005719 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005720 return NULL;
5721}
5722
Tim Petersced69f82003-09-16 20:30:58 +00005723static
5724PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005725 Py_ssize_t left,
5726 Py_ssize_t right,
5727 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005728{
5729 PyUnicodeObject *u;
5730
5731 if (left < 0)
5732 left = 0;
5733 if (right < 0)
5734 right = 0;
5735
Tim Peters7a29bd52001-09-12 03:03:31 +00005736 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005737 Py_INCREF(self);
5738 return self;
5739 }
5740
Neal Norwitze7d8be82008-07-31 17:17:14 +00005741 if (left > PY_SSIZE_T_MAX - self->length ||
5742 right > PY_SSIZE_T_MAX - (left + self->length)) {
5743 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5744 return NULL;
5745 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005746 u = _PyUnicode_New(left + self->length + right);
5747 if (u) {
5748 if (left)
5749 Py_UNICODE_FILL(u->str, fill, left);
5750 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5751 if (right)
5752 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5753 }
5754
5755 return u;
5756}
5757
Antoine Pitrou64672132010-01-13 07:55:48 +00005758PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005759{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005760 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005761
5762 string = PyUnicode_FromObject(string);
5763 if (string == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005764 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005765
Antoine Pitrou64672132010-01-13 07:55:48 +00005766 list = stringlib_splitlines(
5767 (PyObject*) string, PyUnicode_AS_UNICODE(string),
5768 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005769
5770 Py_DECREF(string);
5771 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005772}
5773
Tim Petersced69f82003-09-16 20:30:58 +00005774static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005775PyObject *split(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005776 PyUnicodeObject *substring,
5777 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005778{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005779 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005780 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005781
Guido van Rossumd57fd912000-03-10 22:53:23 +00005782 if (substring == NULL)
Antoine Pitrou64672132010-01-13 07:55:48 +00005783 return stringlib_split_whitespace(
5784 (PyObject*) self, self->str, self->length, maxcount
5785 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005786
Antoine Pitrou64672132010-01-13 07:55:48 +00005787 return stringlib_split(
5788 (PyObject*) self, self->str, self->length,
5789 substring->str, substring->length,
5790 maxcount
5791 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005792}
5793
Tim Petersced69f82003-09-16 20:30:58 +00005794static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005795PyObject *rsplit(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005796 PyUnicodeObject *substring,
5797 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005798{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005799 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005800 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005801
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005802 if (substring == NULL)
Antoine Pitrou64672132010-01-13 07:55:48 +00005803 return stringlib_rsplit_whitespace(
5804 (PyObject*) self, self->str, self->length, maxcount
5805 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005806
Antoine Pitrou64672132010-01-13 07:55:48 +00005807 return stringlib_rsplit(
5808 (PyObject*) self, self->str, self->length,
5809 substring->str, substring->length,
5810 maxcount
5811 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005812}
5813
5814static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005815PyObject *replace(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005816 PyUnicodeObject *str1,
5817 PyUnicodeObject *str2,
5818 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005819{
5820 PyUnicodeObject *u;
5821
5822 if (maxcount < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005823 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrou64672132010-01-13 07:55:48 +00005824 else if (maxcount == 0 || self->length == 0)
5825 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005826
Fredrik Lundh347ee272006-05-24 16:35:18 +00005827 if (str1->length == str2->length) {
Antoine Pitrou5c767c22010-01-13 08:55:20 +00005828 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005829 /* same length */
Antoine Pitrou64672132010-01-13 07:55:48 +00005830 if (str1->length == 0)
5831 goto nothing;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005832 if (str1->length == 1) {
5833 /* replace characters */
5834 Py_UNICODE u1, u2;
5835 if (!findchar(self->str, self->length, str1->str[0]))
5836 goto nothing;
5837 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5838 if (!u)
5839 return NULL;
5840 Py_UNICODE_COPY(u->str, self->str, self->length);
5841 u1 = str1->str[0];
5842 u2 = str2->str[0];
5843 for (i = 0; i < u->length; i++)
5844 if (u->str[i] == u1) {
5845 if (--maxcount < 0)
5846 break;
5847 u->str[i] = u2;
5848 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005849 } else {
Antoine Pitrou64672132010-01-13 07:55:48 +00005850 i = stringlib_find(
5851 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00005852 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005853 if (i < 0)
5854 goto nothing;
5855 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5856 if (!u)
5857 return NULL;
5858 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrou64672132010-01-13 07:55:48 +00005859
5860 /* change everything in-place, starting with this one */
5861 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5862 i += str1->length;
5863
5864 while ( --maxcount > 0) {
5865 i = stringlib_find(self->str+i, self->length-i,
5866 str1->str, str1->length,
5867 i);
5868 if (i == -1)
5869 break;
5870 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5871 i += str1->length;
5872 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005873 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005874 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005875
Brett Cannona7f13ee2010-05-04 01:16:51 +00005876 Py_ssize_t n, i, j;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005877 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005878 Py_UNICODE *p;
5879
5880 /* replace strings */
Antoine Pitrou64672132010-01-13 07:55:48 +00005881 n = stringlib_count(self->str, self->length, str1->str, str1->length,
5882 maxcount);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005883 if (n == 0)
5884 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005885 /* new_size = self->length + n * (str2->length - str1->length)); */
5886 delta = (str2->length - str1->length);
5887 if (delta == 0) {
5888 new_size = self->length;
5889 } else {
5890 product = n * (str2->length - str1->length);
5891 if ((product / (str2->length - str1->length)) != n) {
5892 PyErr_SetString(PyExc_OverflowError,
5893 "replace string is too long");
5894 return NULL;
5895 }
5896 new_size = self->length + product;
5897 if (new_size < 0) {
5898 PyErr_SetString(PyExc_OverflowError,
5899 "replace string is too long");
5900 return NULL;
5901 }
5902 }
5903 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005904 if (!u)
5905 return NULL;
5906 i = 0;
5907 p = u->str;
5908 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005909 while (n-- > 0) {
5910 /* look for next match */
Antoine Pitrou64672132010-01-13 07:55:48 +00005911 j = stringlib_find(self->str+i, self->length-i,
5912 str1->str, str1->length,
5913 i);
5914 if (j == -1)
5915 break;
5916 else if (j > i) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005917 /* copy unchanged part [i:j] */
5918 Py_UNICODE_COPY(p, self->str+i, j-i);
5919 p += j - i;
5920 }
5921 /* copy substitution string */
5922 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005923 Py_UNICODE_COPY(p, str2->str, str2->length);
5924 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005925 }
5926 i = j + str1->length;
5927 }
5928 if (i < self->length)
5929 /* copy tail [i:] */
5930 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005931 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005932 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00005933 while (n > 0) {
5934 Py_UNICODE_COPY(p, str2->str, str2->length);
5935 p += str2->length;
5936 if (--n <= 0)
5937 break;
5938 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005939 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00005940 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005941 }
5942 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005943 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005944
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005945 nothing:
Fredrik Lundh347ee272006-05-24 16:35:18 +00005946 /* nothing to replace; return original string (when possible) */
5947 if (PyUnicode_CheckExact(self)) {
5948 Py_INCREF(self);
5949 return (PyObject *) self;
5950 }
5951 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005952}
5953
5954/* --- Unicode Object Methods --------------------------------------------- */
5955
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005956PyDoc_STRVAR(title__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005957 "S.title() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005958\n\
5959Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005960characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005961
5962static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005963unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005964{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005965 return fixup(self, fixtitle);
5966}
5967
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005968PyDoc_STRVAR(capitalize__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005969 "S.capitalize() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005970\n\
5971Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumaran5261b102010-07-05 12:04:07 +00005972have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005973
5974static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005975unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005976{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005977 return fixup(self, fixcapitalize);
5978}
5979
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> unicode\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled code: split self into a list of words, replace every word by
   the result of fixup(word, fixcapitalize), then join the list again.
   Returns NULL if any of the steps fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list slot */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *old_word = PyList_GET_ITEM(words, pos);
        PyObject *new_word = fixup((PyUnicodeObject *)old_word,
                                   fixcapitalize);
        if (new_word == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(old_word);
        PyList_SET_ITEM(words, pos, new_word);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
6017
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006018/* Argument converter. Coerces to a single unicode character */
6019
6020static int
6021convert_uc(PyObject *obj, void *addr)
6022{
Benjamin Peterson857ce152009-01-31 16:29:18 +00006023 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6024 PyObject *uniobj;
6025 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006026
Benjamin Peterson857ce152009-01-31 16:29:18 +00006027 uniobj = PyUnicode_FromObject(obj);
6028 if (uniobj == NULL) {
6029 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006030 "The fill character cannot be converted to Unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00006031 return 0;
6032 }
6033 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6034 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006035 "The fill character must be exactly one character long");
Benjamin Peterson857ce152009-01-31 16:29:18 +00006036 Py_DECREF(uniobj);
6037 return 0;
6038 }
6039 unistr = PyUnicode_AS_UNICODE(uniobj);
6040 *fillcharloc = unistr[0];
6041 Py_DECREF(uniobj);
6042 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006043}
6044
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006045PyDoc_STRVAR(center__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006046 "S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006047\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006048Return S centered in a Unicode string of length width. Padding is\n\
6049done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006050
6051static PyObject *
6052unicode_center(PyUnicodeObject *self, PyObject *args)
6053{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006054 Py_ssize_t marg, left;
6055 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006056 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006057
Thomas Woutersde017742006-02-16 19:34:37 +00006058 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006059 return NULL;
6060
Tim Peters7a29bd52001-09-12 03:03:31 +00006061 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006062 Py_INCREF(self);
6063 return (PyObject*) self;
6064 }
6065
6066 marg = width - self->length;
6067 left = marg / 2 + (marg & width & 1);
6068
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006069 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006070}
6071
Marc-André Lemburge5034372000-08-08 08:04:29 +00006072#if 0
6073
6074/* This code should go into some future Unicode collation support
6075 module. The basic comparison should compare ordinals on a naive
Georg Brandl18187e22009-06-06 18:21:58 +00006076 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006077
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006078/* speedy UTF-16 code point order comparison */
6079/* gleaned from: */
6080/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6081
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006082static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006083{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006084 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006085 0, 0, 0, 0, 0, 0, 0, 0,
6086 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006087 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006088};
6089
Guido van Rossumd57fd912000-03-10 22:53:23 +00006090static int
6091unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6092{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006093 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006094
Guido van Rossumd57fd912000-03-10 22:53:23 +00006095 Py_UNICODE *s1 = str1->str;
6096 Py_UNICODE *s2 = str2->str;
6097
6098 len1 = str1->length;
6099 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006100
Guido van Rossumd57fd912000-03-10 22:53:23 +00006101 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006102 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006103
6104 c1 = *s1++;
6105 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006106
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006107 if (c1 > (1<<11) * 26)
6108 c1 += utf16Fixup[c1>>11];
6109 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006110 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006111 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006112
6113 if (c1 != c2)
6114 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006115
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006116 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006117 }
6118
6119 return (len1 < len2) ? -1 : (len1 != len2);
6120}
6121
Marc-André Lemburge5034372000-08-08 08:04:29 +00006122#else
6123
6124static int
6125unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6126{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006127 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006128
6129 Py_UNICODE *s1 = str1->str;
6130 Py_UNICODE *s2 = str2->str;
6131
6132 len1 = str1->length;
6133 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006134
Marc-André Lemburge5034372000-08-08 08:04:29 +00006135 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006136 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006137
Fredrik Lundh45714e92001-06-26 16:39:36 +00006138 c1 = *s1++;
6139 c2 = *s2++;
6140
6141 if (c1 != c2)
6142 return (c1 < c2) ? -1 : 1;
6143
Marc-André Lemburge5034372000-08-08 08:04:29 +00006144 len1--; len2--;
6145 }
6146
6147 return (len1 < len2) ? -1 : (len1 != len2);
6148}
6149
6150#endif
6151
Guido van Rossumd57fd912000-03-10 22:53:23 +00006152int PyUnicode_Compare(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006153 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006154{
6155 PyUnicodeObject *u = NULL, *v = NULL;
6156 int result;
6157
6158 /* Coerce the two arguments */
6159 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6160 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006161 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006162 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6163 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006164 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006165
Thomas Wouters7e474022000-07-16 12:04:32 +00006166 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006167 if (v == u) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006168 Py_DECREF(u);
6169 Py_DECREF(v);
6170 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171 }
6172
6173 result = unicode_compare(u, v);
6174
6175 Py_DECREF(u);
6176 Py_DECREF(v);
6177 return result;
6178
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006179 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006180 Py_XDECREF(u);
6181 Py_XDECREF(v);
6182 return -1;
6183}
6184
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006185PyObject *PyUnicode_RichCompare(PyObject *left,
6186 PyObject *right,
6187 int op)
6188{
6189 int result;
6190
6191 result = PyUnicode_Compare(left, right);
6192 if (result == -1 && PyErr_Occurred())
6193 goto onError;
6194
6195 /* Convert the return value to a Boolean */
6196 switch (op) {
6197 case Py_EQ:
6198 result = (result == 0);
6199 break;
6200 case Py_NE:
6201 result = (result != 0);
6202 break;
6203 case Py_LE:
6204 result = (result <= 0);
6205 break;
6206 case Py_GE:
6207 result = (result >= 0);
6208 break;
6209 case Py_LT:
6210 result = (result == -1);
6211 break;
6212 case Py_GT:
6213 result = (result == 1);
6214 break;
6215 }
6216 return PyBool_FromLong(result);
6217
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006218 onError:
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006219
6220 /* Standard case
6221
6222 Type errors mean that PyUnicode_FromObject() could not convert
6223 one of the arguments (usually the right hand side) to Unicode,
6224 ie. we can't handle the comparison request. However, it is
6225 possible that the other object knows a comparison method, which
6226 is why we return Py_NotImplemented to give the other object a
6227 chance.
6228
6229 */
6230 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6231 PyErr_Clear();
6232 Py_INCREF(Py_NotImplemented);
6233 return Py_NotImplemented;
6234 }
6235 if (op != Py_EQ && op != Py_NE)
6236 return NULL;
6237
6238 /* Equality comparison.
6239
6240 This is a special case: we silence any PyExc_UnicodeDecodeError
6241 and instead turn it into a PyErr_UnicodeWarning.
6242
6243 */
6244 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6245 return NULL;
6246 PyErr_Clear();
Benjamin Peterson857ce152009-01-31 16:29:18 +00006247 if (PyErr_Warn(PyExc_UnicodeWarning,
6248 (op == Py_EQ) ?
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006249 "Unicode equal comparison "
6250 "failed to convert both arguments to Unicode - "
6251 "interpreting them as being unequal" :
6252 "Unicode unequal comparison "
6253 "failed to convert both arguments to Unicode - "
6254 "interpreting them as being unequal"
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006255 ) < 0)
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006256 return NULL;
6257 result = (op == Py_NE);
6258 return PyBool_FromLong(result);
6259}
6260
Guido van Rossum403d68b2000-03-13 15:55:09 +00006261int PyUnicode_Contains(PyObject *container,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006262 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006263{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006264 PyObject *str, *sub;
6265 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006266
6267 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006268 sub = PyUnicode_FromObject(element);
6269 if (!sub) {
Fredrik Lundh833bf942006-05-23 10:12:21 +00006270 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006271 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00006272
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006273 str = PyUnicode_FromObject(container);
6274 if (!str) {
6275 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00006276 return -1;
6277 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00006278
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006279 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00006280
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006281 Py_DECREF(str);
6282 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00006283
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006284 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006285}
6286
Guido van Rossumd57fd912000-03-10 22:53:23 +00006287/* Concat to string or Unicode object giving a new Unicode object. */
6288
6289PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006290 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006291{
6292 PyUnicodeObject *u = NULL, *v = NULL, *w;
6293
6294 /* Coerce the two arguments */
6295 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6296 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006297 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006298 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6299 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006300 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006301
6302 /* Shortcuts */
6303 if (v == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006304 Py_DECREF(v);
6305 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006306 }
6307 if (u == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006308 Py_DECREF(u);
6309 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006310 }
6311
6312 /* Concat the two Unicode strings */
6313 w = _PyUnicode_New(u->length + v->length);
6314 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006315 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006316 Py_UNICODE_COPY(w->str, u->str, u->length);
6317 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6318
6319 Py_DECREF(u);
6320 Py_DECREF(v);
6321 return (PyObject *)w;
6322
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006323 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006324 Py_XDECREF(u);
6325 Py_XDECREF(v);
6326 return NULL;
6327}
6328
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006329PyDoc_STRVAR(count__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006330 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006331\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00006332Return the number of non-overlapping occurrences of substring sub in\n\
6333Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006334interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006335
6336static PyObject *
6337unicode_count(PyUnicodeObject *self, PyObject *args)
6338{
6339 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006340 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006341 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006342 PyObject *result;
6343
Jesus Cea44e81682011-04-20 16:39:15 +02006344 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
6345 &start, &end))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006346 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006347
Antoine Pitrou64672132010-01-13 07:55:48 +00006348 ADJUST_INDICES(start, end, self->length);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006349 result = PyInt_FromSsize_t(
6350 stringlib_count(self->str + start, end - start,
Antoine Pitrou64672132010-01-13 07:55:48 +00006351 substring->str, substring->length,
6352 PY_SSIZE_T_MAX)
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006353 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006354
6355 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006356
Guido van Rossumd57fd912000-03-10 22:53:23 +00006357 return result;
6358}
6359
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006360PyDoc_STRVAR(encode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006361 "S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006362\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006363Encodes S using the codec registered for encoding. encoding defaults\n\
6364to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006365handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006366a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6367'xmlcharrefreplace' as well as any other name registered with\n\
6368codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006369
6370static PyObject *
Benjamin Peterson332d7212009-09-18 21:14:55 +00006371unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006372{
Benjamin Peterson332d7212009-09-18 21:14:55 +00006373 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00006374 char *encoding = NULL;
6375 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006376 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006377
Benjamin Peterson332d7212009-09-18 21:14:55 +00006378 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
6379 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006380 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006381 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006382 if (v == NULL)
6383 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006384 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006385 PyErr_Format(PyExc_TypeError,
6386 "encoder did not return a string/unicode object "
6387 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006388 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006389 Py_DECREF(v);
6390 return NULL;
6391 }
6392 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006393
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006394 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006395 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006396}
6397
6398PyDoc_STRVAR(decode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006399 "S.decode([encoding[,errors]]) -> string or unicode\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006400\n\
6401Decodes S using the codec registered for encoding. encoding defaults\n\
6402to the default encoding. errors may be given to set a different error\n\
6403handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6404a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
Éric Araujoa4c81b02012-02-20 02:07:31 +01006405as well as any other name registered with codecs.register_error that is\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006406able to handle UnicodeDecodeErrors.");
6407
6408static PyObject *
Benjamin Peterson332d7212009-09-18 21:14:55 +00006409unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006410{
Benjamin Peterson332d7212009-09-18 21:14:55 +00006411 static char *kwlist[] = {"encoding", "errors", 0};
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006412 char *encoding = NULL;
6413 char *errors = NULL;
6414 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006415
Benjamin Peterson332d7212009-09-18 21:14:55 +00006416 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",
6417 kwlist, &encoding, &errors))
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006418 return NULL;
6419 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006420 if (v == NULL)
6421 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006422 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006423 PyErr_Format(PyExc_TypeError,
6424 "decoder did not return a string/unicode object "
6425 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006426 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006427 Py_DECREF(v);
6428 return NULL;
6429 }
6430 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006431
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006432 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006433 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006434}
6435
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006436PyDoc_STRVAR(expandtabs__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006437 "S.expandtabs([tabsize]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006438\n\
6439Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006440If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006441
6442static PyObject*
6443unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6444{
6445 Py_UNICODE *e;
6446 Py_UNICODE *p;
6447 Py_UNICODE *q;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006448 Py_UNICODE *qe;
6449 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006450 PyUnicodeObject *u;
6451 int tabsize = 8;
6452
6453 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006454 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006455
Thomas Wouters7e474022000-07-16 12:04:32 +00006456 /* First pass: determine size of output string */
Guido van Rossum5bdff602008-03-11 21:18:06 +00006457 i = 0; /* chars up to and including most recent \n or \r */
6458 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6459 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006460 for (p = self->str; p < e; p++)
6461 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006462 if (tabsize > 0) {
6463 incr = tabsize - (j % tabsize); /* cannot overflow */
6464 if (j > PY_SSIZE_T_MAX - incr)
6465 goto overflow1;
6466 j += incr;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006467 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006468 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006469 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006470 if (j > PY_SSIZE_T_MAX - 1)
6471 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006472 j++;
6473 if (*p == '\n' || *p == '\r') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006474 if (i > PY_SSIZE_T_MAX - j)
6475 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006476 i += j;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006477 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006478 }
6479 }
6480
Guido van Rossum5bdff602008-03-11 21:18:06 +00006481 if (i > PY_SSIZE_T_MAX - j)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006482 goto overflow1;
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006483
Guido van Rossumd57fd912000-03-10 22:53:23 +00006484 /* Second pass: create output string and fill it */
6485 u = _PyUnicode_New(i + j);
6486 if (!u)
6487 return NULL;
6488
Guido van Rossum5bdff602008-03-11 21:18:06 +00006489 j = 0; /* same as in first pass */
6490 q = u->str; /* next output char */
6491 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006492
6493 for (p = self->str; p < e; p++)
6494 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006495 if (tabsize > 0) {
6496 i = tabsize - (j % tabsize);
6497 j += i;
6498 while (i--) {
6499 if (q >= qe)
6500 goto overflow2;
6501 *q++ = ' ';
Guido van Rossum5bdff602008-03-11 21:18:06 +00006502 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006503 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00006504 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006505 else {
6506 if (q >= qe)
6507 goto overflow2;
6508 *q++ = *p;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006509 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006510 if (*p == '\n' || *p == '\r')
6511 j = 0;
6512 }
6513
6514 return (PyObject*) u;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006515
6516 overflow2:
6517 Py_DECREF(u);
6518 overflow1:
6519 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6520 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006521}
6522
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006523PyDoc_STRVAR(find__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006524 "S.find(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006525\n\
6526Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran5e3a19d2011-07-27 23:36:51 +08006527such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006528arguments start and end are interpreted as in slice notation.\n\
6529\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006530Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006531
6532static PyObject *
6533unicode_find(PyUnicodeObject *self, PyObject *args)
6534{
Jesus Cea44e81682011-04-20 16:39:15 +02006535 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006536 Py_ssize_t start;
6537 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006538 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006539
Jesus Cea44e81682011-04-20 16:39:15 +02006540 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
6541 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006542 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006543
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006544 result = stringlib_find_slice(
6545 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6546 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6547 start, end
6548 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006549
6550 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006551
6552 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006553}
6554
6555static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006556unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006557{
6558 if (index < 0 || index >= self->length) {
6559 PyErr_SetString(PyExc_IndexError, "string index out of range");
6560 return NULL;
6561 }
6562
6563 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6564}
6565
6566static long
6567unicode_hash(PyUnicodeObject *self)
6568{
Fredrik Lundhdde61642000-07-10 18:27:47 +00006569 /* Since Unicode objects compare equal to their ASCII string
6570 counterparts, they should use the individual character values
6571 as basis for their hash value. This is needed to assure that
6572 strings and Unicode objects behave in the same way as
6573 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006574
Martin v. Löwis18e16552006-02-15 17:27:45 +00006575 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006576 register Py_UNICODE *p;
6577 register long x;
6578
Benjamin Petersonf51c3842012-04-09 14:53:07 -04006579#ifdef Py_DEBUG
Benjamin Peterson26da9202012-02-21 11:08:50 -05006580 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf51c3842012-04-09 14:53:07 -04006581#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006582 if (self->hash != -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006583 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006584 len = PyUnicode_GET_SIZE(self);
Barry Warsaw1e13eb02012-02-20 20:42:21 -05006585 /*
6586 We make the hash of the empty string be 0, rather than using
6587 (prefix ^ suffix), since this slightly obfuscates the hash secret
6588 */
6589 if (len == 0) {
6590 self->hash = 0;
6591 return 0;
6592 }
Fredrik Lundhdde61642000-07-10 18:27:47 +00006593 p = PyUnicode_AS_UNICODE(self);
Barry Warsaw1e13eb02012-02-20 20:42:21 -05006594 x = _Py_HashSecret.prefix;
6595 x ^= *p << 7;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006596 while (--len >= 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006597 x = (1000003*x) ^ *p++;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006598 x ^= PyUnicode_GET_SIZE(self);
Barry Warsaw1e13eb02012-02-20 20:42:21 -05006599 x ^= _Py_HashSecret.suffix;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006600 if (x == -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006601 x = -2;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006602 self->hash = x;
6603 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006604}
6605
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006606PyDoc_STRVAR(index__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006607 "S.index(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006609Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006610
6611static PyObject *
6612unicode_index(PyUnicodeObject *self, PyObject *args)
6613{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006614 Py_ssize_t result;
Jesus Cea44e81682011-04-20 16:39:15 +02006615 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006616 Py_ssize_t start;
6617 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006618
Jesus Cea44e81682011-04-20 16:39:15 +02006619 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
6620 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006621 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006622
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006623 result = stringlib_find_slice(
6624 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6625 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6626 start, end
6627 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006628
6629 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006630
Guido van Rossumd57fd912000-03-10 22:53:23 +00006631 if (result < 0) {
6632 PyErr_SetString(PyExc_ValueError, "substring not found");
6633 return NULL;
6634 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006635
Martin v. Löwis18e16552006-02-15 17:27:45 +00006636 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006637}
6638
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006639PyDoc_STRVAR(islower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006640 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006641\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006642Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006643at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006644
6645static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006646unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006647{
6648 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6649 register const Py_UNICODE *e;
6650 int cased;
6651
Guido van Rossumd57fd912000-03-10 22:53:23 +00006652 /* Shortcut for single character strings */
6653 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006654 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006655
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006656 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006657 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006658 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006659
Guido van Rossumd57fd912000-03-10 22:53:23 +00006660 e = p + PyUnicode_GET_SIZE(self);
6661 cased = 0;
6662 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006663 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006664
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006665 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6666 return PyBool_FromLong(0);
6667 else if (!cased && Py_UNICODE_ISLOWER(ch))
6668 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006669 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006670 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006671}
6672
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006673PyDoc_STRVAR(isupper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006674 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006675\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006676Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006677at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006678
6679static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006680unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006681{
6682 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6683 register const Py_UNICODE *e;
6684 int cased;
6685
Guido van Rossumd57fd912000-03-10 22:53:23 +00006686 /* Shortcut for single character strings */
6687 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006688 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006689
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006690 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006691 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006692 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006693
Guido van Rossumd57fd912000-03-10 22:53:23 +00006694 e = p + PyUnicode_GET_SIZE(self);
6695 cased = 0;
6696 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006697 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006698
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006699 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6700 return PyBool_FromLong(0);
6701 else if (!cased && Py_UNICODE_ISUPPER(ch))
6702 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006703 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006704 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006705}
6706
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006707PyDoc_STRVAR(istitle__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006708 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006709\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006710Return True if S is a titlecased string and there is at least one\n\
6711character in S, i.e. upper- and titlecase characters may only\n\
6712follow uncased characters and lowercase characters only cased ones.\n\
6713Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006714
6715static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006716unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006717{
6718 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6719 register const Py_UNICODE *e;
6720 int cased, previous_is_cased;
6721
Guido van Rossumd57fd912000-03-10 22:53:23 +00006722 /* Shortcut for single character strings */
6723 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006724 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6725 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006726
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006727 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006728 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006729 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006730
Guido van Rossumd57fd912000-03-10 22:53:23 +00006731 e = p + PyUnicode_GET_SIZE(self);
6732 cased = 0;
6733 previous_is_cased = 0;
6734 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006735 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006736
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006737 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6738 if (previous_is_cased)
6739 return PyBool_FromLong(0);
6740 previous_is_cased = 1;
6741 cased = 1;
6742 }
6743 else if (Py_UNICODE_ISLOWER(ch)) {
6744 if (!previous_is_cased)
6745 return PyBool_FromLong(0);
6746 previous_is_cased = 1;
6747 cased = 1;
6748 }
6749 else
6750 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006751 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006752 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006753}
6754
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006755PyDoc_STRVAR(isspace__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006756 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006757\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006758Return True if all characters in S are whitespace\n\
6759and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006760
6761static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006762unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006763{
6764 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6765 register const Py_UNICODE *e;
6766
Guido van Rossumd57fd912000-03-10 22:53:23 +00006767 /* Shortcut for single character strings */
6768 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006769 Py_UNICODE_ISSPACE(*p))
6770 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006771
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006772 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006773 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006774 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006775
Guido van Rossumd57fd912000-03-10 22:53:23 +00006776 e = p + PyUnicode_GET_SIZE(self);
6777 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006778 if (!Py_UNICODE_ISSPACE(*p))
6779 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006780 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006781 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006782}
6783
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006784PyDoc_STRVAR(isalpha__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006785 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006786\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006787Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006788and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006789
6790static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006791unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006792{
6793 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6794 register const Py_UNICODE *e;
6795
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006796 /* Shortcut for single character strings */
6797 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006798 Py_UNICODE_ISALPHA(*p))
6799 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006800
6801 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006802 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006803 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006804
6805 e = p + PyUnicode_GET_SIZE(self);
6806 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006807 if (!Py_UNICODE_ISALPHA(*p))
6808 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006809 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006810 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006811}
6812
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006813PyDoc_STRVAR(isalnum__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006814 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006815\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006816Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006817and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006818
6819static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006820unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006821{
6822 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6823 register const Py_UNICODE *e;
6824
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006825 /* Shortcut for single character strings */
6826 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006827 Py_UNICODE_ISALNUM(*p))
6828 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006829
6830 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006831 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006832 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006833
6834 e = p + PyUnicode_GET_SIZE(self);
6835 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006836 if (!Py_UNICODE_ISALNUM(*p))
6837 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006838 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006839 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006840}
6841
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006842PyDoc_STRVAR(isdecimal__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006843 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006844\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006845Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006846False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006847
6848static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006849unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006850{
6851 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6852 register const Py_UNICODE *e;
6853
Guido van Rossumd57fd912000-03-10 22:53:23 +00006854 /* Shortcut for single character strings */
6855 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006856 Py_UNICODE_ISDECIMAL(*p))
6857 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006858
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006859 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006860 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006861 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006862
Guido van Rossumd57fd912000-03-10 22:53:23 +00006863 e = p + PyUnicode_GET_SIZE(self);
6864 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006865 if (!Py_UNICODE_ISDECIMAL(*p))
6866 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006867 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006868 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006869}
6870
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006871PyDoc_STRVAR(isdigit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006872 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006873\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006874Return True if all characters in S are digits\n\
6875and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006876
6877static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006878unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006879{
6880 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6881 register const Py_UNICODE *e;
6882
Guido van Rossumd57fd912000-03-10 22:53:23 +00006883 /* Shortcut for single character strings */
6884 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006885 Py_UNICODE_ISDIGIT(*p))
6886 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006887
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006888 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006889 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006890 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006891
Guido van Rossumd57fd912000-03-10 22:53:23 +00006892 e = p + PyUnicode_GET_SIZE(self);
6893 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006894 if (!Py_UNICODE_ISDIGIT(*p))
6895 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006896 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006897 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006898}
6899
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006900PyDoc_STRVAR(isnumeric__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006901 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006902\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006903Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006904False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006905
6906static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006907unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006908{
6909 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6910 register const Py_UNICODE *e;
6911
Guido van Rossumd57fd912000-03-10 22:53:23 +00006912 /* Shortcut for single character strings */
6913 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006914 Py_UNICODE_ISNUMERIC(*p))
6915 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006916
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006917 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006918 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006919 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006920
Guido van Rossumd57fd912000-03-10 22:53:23 +00006921 e = p + PyUnicode_GET_SIZE(self);
6922 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006923 if (!Py_UNICODE_ISNUMERIC(*p))
6924 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006925 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006926 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006927}
6928
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006929PyDoc_STRVAR(join__doc__,
Georg Brandl9b4e5822009-10-14 18:48:32 +00006930 "S.join(iterable) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006931\n\
6932Return a string which is the concatenation of the strings in the\n\
Georg Brandl9b4e5822009-10-14 18:48:32 +00006933iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006934
6935static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006936unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006937{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006938 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006939}
6940
Martin v. Löwis18e16552006-02-15 17:27:45 +00006941static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006942unicode_length(PyUnicodeObject *self)
6943{
6944 return self->length;
6945}
6946
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006947PyDoc_STRVAR(ljust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006948 "S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006949\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00006950Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006951done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006952
6953static PyObject *
6954unicode_ljust(PyUnicodeObject *self, PyObject *args)
6955{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006956 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006957 Py_UNICODE fillchar = ' ';
6958
Martin v. Löwis412fb672006-04-13 06:34:32 +00006959 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006960 return NULL;
6961
Tim Peters7a29bd52001-09-12 03:03:31 +00006962 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006963 Py_INCREF(self);
6964 return (PyObject*) self;
6965 }
6966
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006967 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006968}
6969
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006970PyDoc_STRVAR(lower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006971 "S.lower() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006972\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006973Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006974
6975static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006976unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006977{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006978 return fixup(self, fixlower);
6979}
6980
/* Selectors telling the strip helpers which end(s) of the string to trim. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the selectors above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Bare method name for selector i: the format string past its "|O:" prefix. */
#define STRIPNAME(i) (&stripformat[i][3])
6989
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006990/* externally visible for str.strip(unicode) */
6991PyObject *
6992_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6993{
Benjamin Peterson857ce152009-01-31 16:29:18 +00006994 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6995 Py_ssize_t len = PyUnicode_GET_SIZE(self);
6996 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
6997 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6998 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006999
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007000 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007001
Benjamin Peterson857ce152009-01-31 16:29:18 +00007002 i = 0;
7003 if (striptype != RIGHTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007004 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7005 i++;
7006 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00007007 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007008
Benjamin Peterson857ce152009-01-31 16:29:18 +00007009 j = len;
7010 if (striptype != LEFTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007011 do {
7012 j--;
7013 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7014 j++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007015 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007016
Benjamin Peterson857ce152009-01-31 16:29:18 +00007017 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007018 Py_INCREF(self);
7019 return (PyObject*)self;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007020 }
7021 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007022 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007023}
7024
Guido van Rossumd57fd912000-03-10 22:53:23 +00007025
7026static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007027do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007028{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007029 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7030 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007031
Benjamin Peterson857ce152009-01-31 16:29:18 +00007032 i = 0;
7033 if (striptype != RIGHTSTRIP) {
7034 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7035 i++;
7036 }
7037 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007038
Benjamin Peterson857ce152009-01-31 16:29:18 +00007039 j = len;
7040 if (striptype != LEFTSTRIP) {
7041 do {
7042 j--;
7043 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7044 j++;
7045 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007046
Benjamin Peterson857ce152009-01-31 16:29:18 +00007047 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7048 Py_INCREF(self);
7049 return (PyObject*)self;
7050 }
7051 else
7052 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007053}
7054
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007055
7056static PyObject *
7057do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7058{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007059 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007060
Benjamin Peterson857ce152009-01-31 16:29:18 +00007061 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7062 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007063
Benjamin Peterson857ce152009-01-31 16:29:18 +00007064 if (sep != NULL && sep != Py_None) {
7065 if (PyUnicode_Check(sep))
7066 return _PyUnicode_XStrip(self, striptype, sep);
7067 else if (PyString_Check(sep)) {
7068 PyObject *res;
7069 sep = PyUnicode_FromObject(sep);
7070 if (sep==NULL)
7071 return NULL;
7072 res = _PyUnicode_XStrip(self, striptype, sep);
7073 Py_DECREF(sep);
7074 return res;
7075 }
7076 else {
7077 PyErr_Format(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007078 "%s arg must be None, unicode or str",
7079 STRIPNAME(striptype));
Benjamin Peterson857ce152009-01-31 16:29:18 +00007080 return NULL;
7081 }
7082 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007083
Benjamin Peterson857ce152009-01-31 16:29:18 +00007084 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007085}
7086
7087
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007088PyDoc_STRVAR(strip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007089 "S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007090\n\
7091Return a copy of the string S with leading and trailing\n\
7092whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007093If chars is given and not None, remove characters in chars instead.\n\
7094If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007095
7096static PyObject *
7097unicode_strip(PyUnicodeObject *self, PyObject *args)
7098{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007099 if (PyTuple_GET_SIZE(args) == 0)
7100 return do_strip(self, BOTHSTRIP); /* Common case */
7101 else
7102 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007103}
7104
7105
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007106PyDoc_STRVAR(lstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007107 "S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007108\n\
7109Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007110If chars is given and not None, remove characters in chars instead.\n\
7111If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007112
7113static PyObject *
7114unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7115{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007116 if (PyTuple_GET_SIZE(args) == 0)
7117 return do_strip(self, LEFTSTRIP); /* Common case */
7118 else
7119 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007120}
7121
7122
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007123PyDoc_STRVAR(rstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007124 "S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007125\n\
7126Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007127If chars is given and not None, remove characters in chars instead.\n\
7128If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007129
7130static PyObject *
7131unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7132{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007133 if (PyTuple_GET_SIZE(args) == 0)
7134 return do_strip(self, RIGHTSTRIP); /* Common case */
7135 else
7136 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007137}
7138
7139
Guido van Rossumd57fd912000-03-10 22:53:23 +00007140static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007141unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007142{
7143 PyUnicodeObject *u;
7144 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007145 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007146 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007147
7148 if (len < 0)
7149 len = 0;
7150
Tim Peters7a29bd52001-09-12 03:03:31 +00007151 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007152 /* no repeat, return original string */
7153 Py_INCREF(str);
7154 return (PyObject*) str;
7155 }
Tim Peters8f422462000-09-09 06:13:41 +00007156
7157 /* ensure # of chars needed doesn't overflow int and # of bytes
7158 * needed doesn't overflow size_t
7159 */
7160 nchars = len * str->length;
7161 if (len && nchars / len != str->length) {
7162 PyErr_SetString(PyExc_OverflowError,
7163 "repeated string is too long");
7164 return NULL;
7165 }
7166 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7167 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7168 PyErr_SetString(PyExc_OverflowError,
7169 "repeated string is too long");
7170 return NULL;
7171 }
7172 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007173 if (!u)
7174 return NULL;
7175
7176 p = u->str;
7177
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007178 if (str->length == 1 && len > 0) {
7179 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007180 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007181 Py_ssize_t done = 0; /* number of characters copied this far */
7182 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007183 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007184 done = str->length;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007185 }
7186 while (done < nchars) {
Neal Norwitz4677fbf72008-03-25 04:18:18 +00007187 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007188 Py_UNICODE_COPY(p+done, p, n);
7189 done += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007190 }
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007191 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007192
7193 return (PyObject*) u;
7194}
7195
7196PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007197 PyObject *subobj,
7198 PyObject *replobj,
7199 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007200{
7201 PyObject *self;
7202 PyObject *str1;
7203 PyObject *str2;
7204 PyObject *result;
7205
7206 self = PyUnicode_FromObject(obj);
7207 if (self == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007208 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007209 str1 = PyUnicode_FromObject(subobj);
7210 if (str1 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007211 Py_DECREF(self);
7212 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007213 }
7214 str2 = PyUnicode_FromObject(replobj);
7215 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007216 Py_DECREF(self);
7217 Py_DECREF(str1);
7218 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007219 }
Tim Petersced69f82003-09-16 20:30:58 +00007220 result = replace((PyUnicodeObject *)self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007221 (PyUnicodeObject *)str1,
7222 (PyUnicodeObject *)str2,
7223 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007224 Py_DECREF(self);
7225 Py_DECREF(str1);
7226 Py_DECREF(str2);
7227 return result;
7228}
7229
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007230PyDoc_STRVAR(replace__doc__,
Ezio Melotti2f06b782010-06-26 18:44:42 +00007231 "S.replace(old, new[, count]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007232\n\
7233Return a copy of S with all occurrences of substring\n\
Georg Brandl30fadc12008-05-30 07:54:16 +00007234old replaced by new. If the optional argument count is\n\
7235given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007236
7237static PyObject*
7238unicode_replace(PyUnicodeObject *self, PyObject *args)
7239{
7240 PyUnicodeObject *str1;
7241 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007242 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007243 PyObject *result;
7244
Martin v. Löwis18e16552006-02-15 17:27:45 +00007245 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007246 return NULL;
7247 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7248 if (str1 == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007249 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007250 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007251 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007252 Py_DECREF(str1);
7253 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007254 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007255
7256 result = replace(self, str1, str2, maxcount);
7257
7258 Py_DECREF(str1);
7259 Py_DECREF(str2);
7260 return result;
7261}
7262
7263static
7264PyObject *unicode_repr(PyObject *unicode)
7265{
7266 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007267 PyUnicode_GET_SIZE(unicode),
7268 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007269}
7270
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007271PyDoc_STRVAR(rfind__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007272 "S.rfind(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007273\n\
7274Return the highest index in S where substring sub is found,\n\
Senthil Kumaran5e3a19d2011-07-27 23:36:51 +08007275such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007276arguments start and end are interpreted as in slice notation.\n\
7277\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007278Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007279
7280static PyObject *
7281unicode_rfind(PyUnicodeObject *self, PyObject *args)
7282{
Jesus Cea44e81682011-04-20 16:39:15 +02007283 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007284 Py_ssize_t start;
7285 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007286 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007287
Jesus Cea44e81682011-04-20 16:39:15 +02007288 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
7289 &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007290 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007291
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007292 result = stringlib_rfind_slice(
7293 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7294 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7295 start, end
7296 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007297
7298 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007299
7300 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007301}
7302
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007303PyDoc_STRVAR(rindex__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007304 "S.rindex(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007305\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007306Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007307
7308static PyObject *
7309unicode_rindex(PyUnicodeObject *self, PyObject *args)
7310{
Jesus Cea44e81682011-04-20 16:39:15 +02007311 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007312 Py_ssize_t start;
7313 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007314 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007315
Jesus Cea44e81682011-04-20 16:39:15 +02007316 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
7317 &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007318 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007319
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007320 result = stringlib_rfind_slice(
7321 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7322 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7323 start, end
7324 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007325
7326 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007327
Guido van Rossumd57fd912000-03-10 22:53:23 +00007328 if (result < 0) {
7329 PyErr_SetString(PyExc_ValueError, "substring not found");
7330 return NULL;
7331 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007332 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007333}
7334
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007335PyDoc_STRVAR(rjust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007336 "S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007337\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007338Return S right-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007339done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007340
7341static PyObject *
7342unicode_rjust(PyUnicodeObject *self, PyObject *args)
7343{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007344 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007345 Py_UNICODE fillchar = ' ';
7346
Martin v. Löwis412fb672006-04-13 06:34:32 +00007347 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007348 return NULL;
7349
Tim Peters7a29bd52001-09-12 03:03:31 +00007350 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007351 Py_INCREF(self);
7352 return (PyObject*) self;
7353 }
7354
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007355 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007356}
7357
Guido van Rossumd57fd912000-03-10 22:53:23 +00007358static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007359unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007360{
7361 /* standard clamping */
7362 if (start < 0)
7363 start = 0;
7364 if (end < 0)
7365 end = 0;
7366 if (end > self->length)
7367 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007368 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007369 /* full slice, return original string */
7370 Py_INCREF(self);
7371 return (PyObject*) self;
7372 }
7373 if (start > end)
7374 start = end;
7375 /* copy slice */
7376 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007377 end - start);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007378}
7379
7380PyObject *PyUnicode_Split(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007381 PyObject *sep,
7382 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007383{
7384 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007385
Guido van Rossumd57fd912000-03-10 22:53:23 +00007386 s = PyUnicode_FromObject(s);
7387 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007388 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007389 if (sep != NULL) {
7390 sep = PyUnicode_FromObject(sep);
7391 if (sep == NULL) {
7392 Py_DECREF(s);
7393 return NULL;
7394 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007395 }
7396
7397 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7398
7399 Py_DECREF(s);
7400 Py_XDECREF(sep);
7401 return result;
7402}
7403
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007404PyDoc_STRVAR(split__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007405 "S.split([sep [,maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007406\n\
7407Return a list of the words in S, using sep as the\n\
7408delimiter string. If maxsplit is given, at most maxsplit\n\
Georg Brandldfb77db2008-05-11 09:11:40 +00007409splits are done. If sep is not specified or is None, any\n\
Georg Brandlecbbd942008-05-11 20:53:55 +00007410whitespace string is a separator and empty strings are\n\
7411removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007412
7413static PyObject*
7414unicode_split(PyUnicodeObject *self, PyObject *args)
7415{
7416 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007417 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007418
Martin v. Löwis18e16552006-02-15 17:27:45 +00007419 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007420 return NULL;
7421
7422 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007423 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007424 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007425 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007426 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007427 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007428}
7429
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007430PyObject *
7431PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7432{
7433 PyObject* str_obj;
7434 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007435 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00007436
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007437 str_obj = PyUnicode_FromObject(str_in);
7438 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007439 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007440 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00007441 if (!sep_obj) {
7442 Py_DECREF(str_obj);
7443 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007444 }
7445
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007446 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00007447 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7448 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7449 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007450
Fredrik Lundhb9479482006-05-26 17:22:38 +00007451 Py_DECREF(sep_obj);
7452 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007453
7454 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007455}
7456
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007457
7458PyObject *
7459PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7460{
7461 PyObject* str_obj;
7462 PyObject* sep_obj;
7463 PyObject* out;
7464
7465 str_obj = PyUnicode_FromObject(str_in);
7466 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007467 return NULL;
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007468 sep_obj = PyUnicode_FromObject(sep_in);
7469 if (!sep_obj) {
7470 Py_DECREF(str_obj);
7471 return NULL;
7472 }
7473
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007474 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007475 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7476 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7477 );
7478
7479 Py_DECREF(sep_obj);
7480 Py_DECREF(str_obj);
7481
7482 return out;
7483}
7484
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007485PyDoc_STRVAR(partition__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007486 "S.partition(sep) -> (head, sep, tail)\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007487\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007488Search for the separator sep in S, and return the part before it,\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007489the separator itself, and the part after it. If the separator is not\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007490found, return S and two empty strings.");
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007491
7492static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00007493unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007494{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007495 return PyUnicode_Partition((PyObject *)self, separator);
7496}
7497
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007498PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti1fafaab2010-01-25 11:24:37 +00007499 "S.rpartition(sep) -> (head, sep, tail)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007500\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007501Search for the separator sep in S, starting at the end of S, and return\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007502the part before it, the separator itself, and the part after it. If the\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007503separator is not found, return two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007504
7505static PyObject*
7506unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7507{
7508 return PyUnicode_RPartition((PyObject *)self, separator);
7509}
7510
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007511PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007512 PyObject *sep,
7513 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007514{
7515 PyObject *result;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007516
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007517 s = PyUnicode_FromObject(s);
7518 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007519 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007520 if (sep != NULL) {
7521 sep = PyUnicode_FromObject(sep);
7522 if (sep == NULL) {
7523 Py_DECREF(s);
7524 return NULL;
7525 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007526 }
7527
7528 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7529
7530 Py_DECREF(s);
7531 Py_XDECREF(sep);
7532 return result;
7533}
7534
7535PyDoc_STRVAR(rsplit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007536 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007537\n\
7538Return a list of the words in S, using sep as the\n\
7539delimiter string, starting at the end of the string and\n\
7540working to the front. If maxsplit is given, at most maxsplit\n\
7541splits are done. If sep is not specified, any whitespace string\n\
7542is a separator.");
7543
7544static PyObject*
7545unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7546{
7547 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007548 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007549
Martin v. Löwis18e16552006-02-15 17:27:45 +00007550 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007551 return NULL;
7552
7553 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007554 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007555 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007556 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007557 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007558 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007559}
7560
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007561PyDoc_STRVAR(splitlines__doc__,
Raymond Hettingeraad5b022012-06-02 01:42:58 -04007562 "S.splitlines(keepends=False) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007563\n\
7564Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007565Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007566is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007567
7568static PyObject*
7569unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7570{
Guido van Rossum86662912000-04-11 15:38:46 +00007571 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007572
Guido van Rossum86662912000-04-11 15:38:46 +00007573 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007574 return NULL;
7575
Guido van Rossum86662912000-04-11 15:38:46 +00007576 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007577}
7578
7579static
7580PyObject *unicode_str(PyUnicodeObject *self)
7581{
Fred Drakee4315f52000-05-09 19:53:39 +00007582 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007583}
7584
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007585PyDoc_STRVAR(swapcase__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007586 "S.swapcase() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007587\n\
7588Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007589and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007590
7591static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007592unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007593{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007594 return fixup(self, fixswapcase);
7595}
7596
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007597PyDoc_STRVAR(translate__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007598 "S.translate(table) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007599\n\
7600Return a copy of the string S, where all characters have been mapped\n\
7601through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007602Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7603Unmapped characters are left untouched. Characters mapped to None\n\
7604are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007605
7606static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007607unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007608{
Tim Petersced69f82003-09-16 20:30:58 +00007609 return PyUnicode_TranslateCharmap(self->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007610 self->length,
7611 table,
7612 "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007613}
7614
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007615PyDoc_STRVAR(upper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007616 "S.upper() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007617\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007618Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007619
7620static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007621unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007622{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007623 return fixup(self, fixupper);
7624}
7625
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007626PyDoc_STRVAR(zfill__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007627 "S.zfill(width) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007628\n\
Georg Brandl98064072008-09-09 19:26:00 +00007629Pad a numeric string S with zeros on the left, to fill a field\n\
7630of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007631
7632static PyObject *
7633unicode_zfill(PyUnicodeObject *self, PyObject *args)
7634{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007635 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007636 PyUnicodeObject *u;
7637
Martin v. Löwis18e16552006-02-15 17:27:45 +00007638 Py_ssize_t width;
7639 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007640 return NULL;
7641
7642 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007643 if (PyUnicode_CheckExact(self)) {
7644 Py_INCREF(self);
7645 return (PyObject*) self;
7646 }
7647 else
7648 return PyUnicode_FromUnicode(
7649 PyUnicode_AS_UNICODE(self),
7650 PyUnicode_GET_SIZE(self)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007651 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007652 }
7653
7654 fill = width - self->length;
7655
7656 u = pad(self, fill, 0, '0');
7657
Walter Dörwald068325e2002-04-15 13:36:47 +00007658 if (u == NULL)
7659 return NULL;
7660
Guido van Rossumd57fd912000-03-10 22:53:23 +00007661 if (u->str[fill] == '+' || u->str[fill] == '-') {
7662 /* move sign to beginning of string */
7663 u->str[0] = u->str[fill];
7664 u->str[fill] = '0';
7665 }
7666
7667 return (PyObject*) u;
7668}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007669
#if 0
/* Compiled out.  Would return the value of `numfree` as a Python int. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyInt_FromLong(count);
}
#endif
7677
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007678PyDoc_STRVAR(startswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007679 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007680\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007681Return True if S starts with the specified prefix, False otherwise.\n\
7682With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007683With optional end, stop comparing S at that position.\n\
7684prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007685
7686static PyObject *
7687unicode_startswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007688 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007689{
Georg Brandl24250812006-06-09 18:45:48 +00007690 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007691 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007692 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007693 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007694 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007695
Jesus Cea44e81682011-04-20 16:39:15 +02007696 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007697 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007698 if (PyTuple_Check(subobj)) {
7699 Py_ssize_t i;
7700 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7701 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007702 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007703 if (substring == NULL)
7704 return NULL;
7705 result = tailmatch(self, substring, start, end, -1);
7706 Py_DECREF(substring);
7707 if (result) {
7708 Py_RETURN_TRUE;
7709 }
7710 }
7711 /* nothing matched */
7712 Py_RETURN_FALSE;
7713 }
7714 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottie3685f62011-04-26 05:12:51 +03007715 if (substring == NULL) {
7716 if (PyErr_ExceptionMatches(PyExc_TypeError))
7717 PyErr_Format(PyExc_TypeError, "startswith first arg must be str, "
7718 "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007719 return NULL;
Ezio Melottie3685f62011-04-26 05:12:51 +03007720 }
Georg Brandl24250812006-06-09 18:45:48 +00007721 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007722 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007723 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007724}
7725
7726
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007727PyDoc_STRVAR(endswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007728 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007729\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007730Return True if S ends with the specified suffix, False otherwise.\n\
7731With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007732With optional end, stop comparing S at that position.\n\
7733suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007734
7735static PyObject *
7736unicode_endswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007737 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007738{
Georg Brandl24250812006-06-09 18:45:48 +00007739 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007740 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007741 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007742 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007743 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007744
Jesus Cea44e81682011-04-20 16:39:15 +02007745 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007746 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007747 if (PyTuple_Check(subobj)) {
7748 Py_ssize_t i;
7749 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7750 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007751 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007752 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007753 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007754 result = tailmatch(self, substring, start, end, +1);
7755 Py_DECREF(substring);
7756 if (result) {
7757 Py_RETURN_TRUE;
7758 }
7759 }
7760 Py_RETURN_FALSE;
7761 }
7762 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottie3685f62011-04-26 05:12:51 +03007763 if (substring == NULL) {
7764 if (PyErr_ExceptionMatches(PyExc_TypeError))
7765 PyErr_Format(PyExc_TypeError, "endswith first arg must be str, "
7766 "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007767 return NULL;
Ezio Melottie3685f62011-04-26 05:12:51 +03007768 }
Georg Brandl24250812006-06-09 18:45:48 +00007769 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007770 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007771 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007772}
7773
7774
Eric Smitha9f7d622008-02-17 19:46:49 +00007775/* Implements do_string_format, which is unicode because of stringlib */
7776#include "stringlib/string_format.h"
7777
7778PyDoc_STRVAR(format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007779 "S.format(*args, **kwargs) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007780\n\
Eric Smith6c840852010-11-06 19:43:44 +00007781Return a formatted version of S, using substitutions from args and kwargs.\n\
7782The substitutions are identified by braces ('{' and '}').");
Eric Smitha9f7d622008-02-17 19:46:49 +00007783
Eric Smithdc13b792008-05-30 18:10:04 +00007784static PyObject *
7785unicode__format__(PyObject *self, PyObject *args)
7786{
7787 PyObject *format_spec;
7788 PyObject *result = NULL;
7789 PyObject *tmp = NULL;
7790
7791 /* If 2.x, convert format_spec to the same type as value */
7792 /* This is to allow things like u''.format('') */
7793 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7794 goto done;
7795 if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7796 PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007797 "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
Eric Smithdc13b792008-05-30 18:10:04 +00007798 goto done;
7799 }
7800 tmp = PyObject_Unicode(format_spec);
7801 if (tmp == NULL)
7802 goto done;
7803 format_spec = tmp;
7804
7805 result = _PyUnicode_FormatAdvanced(self,
7806 PyUnicode_AS_UNICODE(format_spec),
7807 PyUnicode_GET_SIZE(format_spec));
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007808 done:
Eric Smithdc13b792008-05-30 18:10:04 +00007809 Py_XDECREF(tmp);
7810 return result;
7811}
7812
Eric Smitha9f7d622008-02-17 19:46:49 +00007813PyDoc_STRVAR(p_format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007814 "S.__format__(format_spec) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007815\n\
Eric Smith6c840852010-11-06 19:43:44 +00007816Return a formatted version of S as described by format_spec.");
Eric Smitha9f7d622008-02-17 19:46:49 +00007817
Robert Schuppenies901c9972008-06-10 10:10:31 +00007818static PyObject *
7819unicode__sizeof__(PyUnicodeObject *v)
7820{
Robert Schuppenies9be2ec12008-07-10 15:24:04 +00007821 return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7822 sizeof(Py_UNICODE) * (v->length + 1));
Robert Schuppenies901c9972008-06-10 10:10:31 +00007823}
7824
7825PyDoc_STRVAR(sizeof__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007826 "S.__sizeof__() -> size of S in memory, in bytes\n\
Robert Schuppenies901c9972008-06-10 10:10:31 +00007827\n\
7828");
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007829
7830static PyObject *
7831unicode_getnewargs(PyUnicodeObject *v)
7832{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007833 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007834}
7835
7836
Guido van Rossumd57fd912000-03-10 22:53:23 +00007837static PyMethodDef unicode_methods[] = {
Benjamin Peterson332d7212009-09-18 21:14:55 +00007838 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007839 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7840 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007841 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007842 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7843 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7844 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7845 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7846 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7847 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7848 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00007849 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007850 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7851 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7852 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007853 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Benjamin Peterson332d7212009-09-18 21:14:55 +00007854 {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007855/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7856 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7857 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7858 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007859 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007860 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007861 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007862 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007863 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7864 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7865 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7866 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7867 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7868 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7869 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7870 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7871 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7872 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7873 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7874 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7875 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7876 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007877 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smitha9f7d622008-02-17 19:46:49 +00007878 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
7879 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
7880 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
7881 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Robert Schuppenies901c9972008-06-10 10:10:31 +00007882 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007883#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007884 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007885#endif
7886
7887#if 0
7888 /* This one is just used for debugging the implementation. */
Christian Heimes5b970ad2008-02-06 13:33:44 +00007889 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007890#endif
7891
Benjamin Peterson857ce152009-01-31 16:29:18 +00007892 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007893 {NULL, NULL}
7894};
7895
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007896static PyObject *
7897unicode_mod(PyObject *v, PyObject *w)
7898{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007899 if (!PyUnicode_Check(v)) {
7900 Py_INCREF(Py_NotImplemented);
7901 return Py_NotImplemented;
7902 }
7903 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007904}
7905
7906static PyNumberMethods unicode_as_number = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007907 0, /*nb_add*/
7908 0, /*nb_subtract*/
7909 0, /*nb_multiply*/
7910 0, /*nb_divide*/
7911 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007912};
7913
Guido van Rossumd57fd912000-03-10 22:53:23 +00007914static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007915 (lenfunc) unicode_length, /* sq_length */
7916 PyUnicode_Concat, /* sq_concat */
7917 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7918 (ssizeargfunc) unicode_getitem, /* sq_item */
7919 (ssizessizeargfunc) unicode_slice, /* sq_slice */
7920 0, /* sq_ass_item */
7921 0, /* sq_ass_slice */
7922 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007923};
7924
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007925static PyObject*
7926unicode_subscript(PyUnicodeObject* self, PyObject* item)
7927{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00007928 if (PyIndex_Check(item)) {
7929 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007930 if (i == -1 && PyErr_Occurred())
7931 return NULL;
7932 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007933 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007934 return unicode_getitem(self, i);
7935 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007936 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007937 Py_UNICODE* source_buf;
7938 Py_UNICODE* result_buf;
7939 PyObject* result;
7940
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007941 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007942 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007943 return NULL;
7944 }
7945
7946 if (slicelength <= 0) {
7947 return PyUnicode_FromUnicode(NULL, 0);
Thomas Wouters3ccec682007-08-28 15:28:19 +00007948 } else if (start == 0 && step == 1 && slicelength == self->length &&
7949 PyUnicode_CheckExact(self)) {
7950 Py_INCREF(self);
7951 return (PyObject *)self;
7952 } else if (step == 1) {
7953 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007954 } else {
7955 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Neal Norwitz419fd492008-03-17 20:22:43 +00007956 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
7957 sizeof(Py_UNICODE));
Benjamin Peterson857ce152009-01-31 16:29:18 +00007958
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007959 if (result_buf == NULL)
7960 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007961
7962 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7963 result_buf[i] = source_buf[cur];
7964 }
Tim Petersced69f82003-09-16 20:30:58 +00007965
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007966 result = PyUnicode_FromUnicode(result_buf, slicelength);
Neal Norwitz419fd492008-03-17 20:22:43 +00007967 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007968 return result;
7969 }
7970 } else {
7971 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7972 return NULL;
7973 }
7974}
7975
7976static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007977 (lenfunc)unicode_length, /* mp_length */
7978 (binaryfunc)unicode_subscript, /* mp_subscript */
7979 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007980};
7981
Martin v. Löwis18e16552006-02-15 17:27:45 +00007982static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007983unicode_buffer_getreadbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007984 Py_ssize_t index,
7985 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007986{
7987 if (index != 0) {
7988 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007989 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007990 return -1;
7991 }
7992 *ptr = (void *) self->str;
7993 return PyUnicode_GET_DATA_SIZE(self);
7994}
7995
Martin v. Löwis18e16552006-02-15 17:27:45 +00007996static Py_ssize_t
7997unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007998 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007999{
8000 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008001 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008002 return -1;
8003}
8004
8005static int
8006unicode_buffer_getsegcount(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008007 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008008{
8009 if (lenp)
8010 *lenp = PyUnicode_GET_DATA_SIZE(self);
8011 return 1;
8012}
8013
Martin v. Löwiseb079f12006-02-16 14:32:27 +00008014static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008015unicode_buffer_getcharbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008016 Py_ssize_t index,
8017 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008018{
8019 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00008020
Guido van Rossumd57fd912000-03-10 22:53:23 +00008021 if (index != 0) {
8022 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008023 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008024 return -1;
8025 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008026 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008027 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008028 return -1;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008029 *ptr = (void *) PyString_AS_STRING(str);
8030 return PyString_GET_SIZE(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008031}
8032
8033/* Helpers for PyUnicode_Format() */
8034
8035static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008036getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008037{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008038 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008039 if (argidx < arglen) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008040 (*p_argidx)++;
8041 if (arglen < 0)
8042 return args;
8043 else
8044 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008045 }
8046 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008047 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008048 return NULL;
8049}
8050
/* Flag bits set while parsing the flag characters of a '%' specifier. */
#define F_LJUST 0x01    /* '-' flag */
#define F_SIGN  0x02    /* '+' flag */
#define F_BLANK 0x04    /* ' ' flag */
#define F_ALT   0x08    /* '#' flag */
#define F_ZERO  0x10    /* '0' flag */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008056
Martin v. Löwis18e16552006-02-15 17:27:45 +00008057static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008058strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008059{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008060 register Py_ssize_t i;
8061 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008062 for (i = len - 1; i >= 0; i--)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008063 buffer[i] = (Py_UNICODE) charbuffer[i];
Guido van Rossumd57fd912000-03-10 22:53:23 +00008064
Guido van Rossumd57fd912000-03-10 22:53:23 +00008065 return len;
8066}
8067
Neal Norwitzfc76d632006-01-10 06:03:13 +00008068static int
Neal Norwitzfc76d632006-01-10 06:03:13 +00008069longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8070{
Tim Peters15231542006-02-16 01:08:01 +00008071 Py_ssize_t result;
8072
Neal Norwitzfc76d632006-01-10 06:03:13 +00008073 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008074 result = strtounicode(buffer, (char *)buffer);
8075 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008076}
8077
Guido van Rossum078151d2002-08-11 04:24:12 +00008078/* XXX To save some code duplication, formatfloat/long/int could have been
8079 shared with stringobject.c, converting from 8-bit to Unicode after the
8080 formatting is done. */
8081
Mark Dickinson18cfada2009-11-23 18:46:41 +00008082/* Returns a new reference to a PyUnicode object, or NULL on failure. */
8083
8084static PyObject *
8085formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008086{
Mark Dickinson18cfada2009-11-23 18:46:41 +00008087 char *p;
8088 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008089 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008090
Guido van Rossumd57fd912000-03-10 22:53:23 +00008091 x = PyFloat_AsDouble(v);
8092 if (x == -1.0 && PyErr_Occurred())
Mark Dickinson18cfada2009-11-23 18:46:41 +00008093 return NULL;
8094
Guido van Rossumd57fd912000-03-10 22:53:23 +00008095 if (prec < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008096 prec = 6;
Mark Dickinsond4814bf2009-03-29 16:24:29 +00008097
Mark Dickinson18cfada2009-11-23 18:46:41 +00008098 p = PyOS_double_to_string(x, type, prec,
8099 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
8100 if (p == NULL)
8101 return NULL;
8102 result = PyUnicode_FromStringAndSize(p, strlen(p));
8103 PyMem_Free(p);
8104 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008105}
8106
Tim Peters38fd5b62000-09-21 05:43:11 +00008107static PyObject*
8108formatlong(PyObject *val, int flags, int prec, int type)
8109{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008110 char *buf;
8111 int i, len;
8112 PyObject *str; /* temporary string object. */
8113 PyUnicodeObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008114
Benjamin Peterson857ce152009-01-31 16:29:18 +00008115 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8116 if (!str)
8117 return NULL;
8118 result = _PyUnicode_New(len);
8119 if (!result) {
8120 Py_DECREF(str);
8121 return NULL;
8122 }
8123 for (i = 0; i < len; i++)
8124 result->str[i] = buf[i];
8125 result->str[len] = 0;
8126 Py_DECREF(str);
8127 return (PyObject*)result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008128}
8129
Guido van Rossumd57fd912000-03-10 22:53:23 +00008130static int
8131formatint(Py_UNICODE *buf,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008132 size_t buflen,
8133 int flags,
8134 int prec,
8135 int type,
8136 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008137{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008138 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008139 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8140 * + 1 + 1
8141 * = 24
8142 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008143 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008144 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008145 long x;
8146
8147 x = PyInt_AsLong(v);
8148 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008149 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008150 if (x < 0 && type == 'u') {
8151 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008152 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008153 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8154 sign = "-";
8155 else
8156 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008157 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008158 prec = 1;
8159
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008160 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8161 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008162 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008163 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008164 PyErr_SetString(PyExc_OverflowError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008165 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008166 return -1;
8167 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008168
8169 if ((flags & F_ALT) &&
8170 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00008171 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008172 * of issues that cause pain:
8173 * - when 0 is being converted, the C standard leaves off
8174 * the '0x' or '0X', which is inconsistent with other
8175 * %#x/%#X conversions and inconsistent with Python's
8176 * hex() function
8177 * - there are platforms that violate the standard and
8178 * convert 0 with the '0x' or '0X'
8179 * (Metrowerks, Compaq Tru64)
8180 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008181 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008182 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008183 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008184 * We can achieve the desired consistency by inserting our
8185 * own '0x' or '0X' prefix, and substituting %x/%X in place
8186 * of %#x/%#X.
8187 *
8188 * Note that this is the same approach as used in
8189 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008190 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008191 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8192 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008193 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008194 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008195 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8196 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008197 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008198 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008199 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008200 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008201 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008202 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008203}
8204
8205static int
8206formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008207 size_t buflen,
8208 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008209{
Ezio Melotti32125152010-02-25 17:36:04 +00008210 PyObject *unistr;
8211 char *str;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008212 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008213 if (PyUnicode_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008214 if (PyUnicode_GET_SIZE(v) != 1)
8215 goto onError;
8216 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008217 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008218
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008219 else if (PyString_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008220 if (PyString_GET_SIZE(v) != 1)
8221 goto onError;
Ezio Melotti32125152010-02-25 17:36:04 +00008222 /* #7649: "u'%c' % char" should behave like "u'%s' % char" and fail
8223 with a UnicodeDecodeError if 'char' is not decodable with the
8224 default encoding (usually ASCII, but it might be something else) */
8225 str = PyString_AS_STRING(v);
8226 if ((unsigned char)str[0] > 0x7F) {
8227 /* the char is not ASCII; try to decode the string using the
8228 default encoding and return -1 to let the UnicodeDecodeError
8229 be raised if the string can't be decoded */
8230 unistr = PyUnicode_Decode(str, 1, NULL, "strict");
8231 if (unistr == NULL)
8232 return -1;
8233 buf[0] = PyUnicode_AS_UNICODE(unistr)[0];
8234 Py_DECREF(unistr);
8235 }
8236 else
8237 buf[0] = (Py_UNICODE)str[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008238 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008239
8240 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008241 /* Integer input truncated to a character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008242 long x;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008243 x = PyInt_AsLong(v);
8244 if (x == -1 && PyErr_Occurred())
8245 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008246#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008247 if (x < 0 || x > 0x10ffff) {
8248 PyErr_SetString(PyExc_OverflowError,
8249 "%c arg not in range(0x110000) "
8250 "(wide Python build)");
8251 return -1;
8252 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008253#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008254 if (x < 0 || x > 0xffff) {
8255 PyErr_SetString(PyExc_OverflowError,
8256 "%c arg not in range(0x10000) "
8257 "(narrow Python build)");
8258 return -1;
8259 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008260#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008261 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008262 }
8263 buf[1] = '\0';
8264 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008265
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008266 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008267 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008268 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008269 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008270}
8271
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008272/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8273
Mark Dickinson18cfada2009-11-23 18:46:41 +00008274 FORMATBUFLEN is the length of the buffer in which the ints &
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008275 chars are formatted. XXX This is a magic number. Each formatting
8276 routine does bounds checking to ensure no overflow, but a better
8277 solution may be to malloc a buffer of appropriate size for each
8278 format. For now, the current solution is sufficient.
8279*/
/* Capacity of the scratch buffer used for formatting ints and chars.  The
   expansion is fully parenthesized so the macro behaves as a single
   operand in any expression (e.g. "sizeof FORMATBUFLEN"). */
#define FORMATBUFLEN ((size_t)120)
8281
Guido van Rossumd57fd912000-03-10 22:53:23 +00008282PyObject *PyUnicode_Format(PyObject *format,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008283 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008284{
8285 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008286 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008287 int args_owned = 0;
8288 PyUnicodeObject *result = NULL;
8289 PyObject *dict = NULL;
8290 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008291
Guido van Rossumd57fd912000-03-10 22:53:23 +00008292 if (format == NULL || args == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008293 PyErr_BadInternalCall();
8294 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008295 }
8296 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008297 if (uformat == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008298 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008299 fmt = PyUnicode_AS_UNICODE(uformat);
8300 fmtcnt = PyUnicode_GET_SIZE(uformat);
8301
8302 reslen = rescnt = fmtcnt + 100;
8303 result = _PyUnicode_New(reslen);
8304 if (result == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008305 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008306 res = PyUnicode_AS_UNICODE(result);
8307
8308 if (PyTuple_Check(args)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008309 arglen = PyTuple_Size(args);
8310 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008311 }
8312 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008313 arglen = -1;
8314 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008315 }
Benjamin Peterson23d49d32012-08-28 17:55:35 -04008316 if (PyMapping_Check(args) && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008317 !PyObject_TypeCheck(args, &PyBaseString_Type))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008318 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008319
8320 while (--fmtcnt >= 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008321 if (*fmt != '%') {
8322 if (--rescnt < 0) {
8323 rescnt = fmtcnt + 100;
8324 reslen += rescnt;
8325 if (_PyUnicode_Resize(&result, reslen) < 0)
8326 goto onError;
8327 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8328 --rescnt;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008329 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008330 *res++ = *fmt++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008331 }
8332 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008333 /* Got a format specifier */
8334 int flags = 0;
8335 Py_ssize_t width = -1;
8336 int prec = -1;
8337 Py_UNICODE c = '\0';
8338 Py_UNICODE fill;
8339 int isnumok;
8340 PyObject *v = NULL;
8341 PyObject *temp = NULL;
8342 Py_UNICODE *pbuf;
8343 Py_UNICODE sign;
8344 Py_ssize_t len;
Mark Dickinson18cfada2009-11-23 18:46:41 +00008345 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008346
8347 fmt++;
8348 if (*fmt == '(') {
8349 Py_UNICODE *keystart;
8350 Py_ssize_t keylen;
8351 PyObject *key;
8352 int pcount = 1;
8353
8354 if (dict == NULL) {
8355 PyErr_SetString(PyExc_TypeError,
8356 "format requires a mapping");
8357 goto onError;
8358 }
8359 ++fmt;
8360 --fmtcnt;
8361 keystart = fmt;
8362 /* Skip over balanced parentheses */
8363 while (pcount > 0 && --fmtcnt >= 0) {
8364 if (*fmt == ')')
8365 --pcount;
8366 else if (*fmt == '(')
8367 ++pcount;
8368 fmt++;
8369 }
8370 keylen = fmt - keystart - 1;
8371 if (fmtcnt < 0 || pcount > 0) {
8372 PyErr_SetString(PyExc_ValueError,
8373 "incomplete format key");
8374 goto onError;
8375 }
8376#if 0
8377 /* keys are converted to strings using UTF-8 and
8378 then looked up since Python uses strings to hold
8379 variables names etc. in its namespaces and we
8380 wouldn't want to break common idioms. */
8381 key = PyUnicode_EncodeUTF8(keystart,
8382 keylen,
8383 NULL);
8384#else
8385 key = PyUnicode_FromUnicode(keystart, keylen);
8386#endif
8387 if (key == NULL)
8388 goto onError;
8389 if (args_owned) {
8390 Py_DECREF(args);
8391 args_owned = 0;
8392 }
8393 args = PyObject_GetItem(dict, key);
8394 Py_DECREF(key);
8395 if (args == NULL) {
8396 goto onError;
8397 }
8398 args_owned = 1;
8399 arglen = -1;
8400 argidx = -2;
8401 }
8402 while (--fmtcnt >= 0) {
8403 switch (c = *fmt++) {
8404 case '-': flags |= F_LJUST; continue;
8405 case '+': flags |= F_SIGN; continue;
8406 case ' ': flags |= F_BLANK; continue;
8407 case '#': flags |= F_ALT; continue;
8408 case '0': flags |= F_ZERO; continue;
8409 }
8410 break;
8411 }
8412 if (c == '*') {
8413 v = getnextarg(args, arglen, &argidx);
8414 if (v == NULL)
8415 goto onError;
8416 if (!PyInt_Check(v)) {
8417 PyErr_SetString(PyExc_TypeError,
8418 "* wants int");
8419 goto onError;
8420 }
Serhiy Storchaka74f49ab2013-01-19 12:55:39 +02008421 width = PyInt_AsSsize_t(v);
8422 if (width == -1 && PyErr_Occurred())
8423 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008424 if (width < 0) {
8425 flags |= F_LJUST;
8426 width = -width;
8427 }
8428 if (--fmtcnt >= 0)
8429 c = *fmt++;
8430 }
8431 else if (c >= '0' && c <= '9') {
8432 width = c - '0';
8433 while (--fmtcnt >= 0) {
8434 c = *fmt++;
8435 if (c < '0' || c > '9')
8436 break;
Mark Dickinson75d36002012-10-28 10:00:46 +00008437 if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008438 PyErr_SetString(PyExc_ValueError,
8439 "width too big");
8440 goto onError;
8441 }
8442 width = width*10 + (c - '0');
8443 }
8444 }
8445 if (c == '.') {
8446 prec = 0;
8447 if (--fmtcnt >= 0)
8448 c = *fmt++;
8449 if (c == '*') {
8450 v = getnextarg(args, arglen, &argidx);
8451 if (v == NULL)
8452 goto onError;
8453 if (!PyInt_Check(v)) {
8454 PyErr_SetString(PyExc_TypeError,
8455 "* wants int");
8456 goto onError;
8457 }
Serhiy Storchaka74f49ab2013-01-19 12:55:39 +02008458 prec = _PyInt_AsInt(v);
8459 if (prec == -1 && PyErr_Occurred())
8460 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008461 if (prec < 0)
8462 prec = 0;
8463 if (--fmtcnt >= 0)
8464 c = *fmt++;
8465 }
8466 else if (c >= '0' && c <= '9') {
8467 prec = c - '0';
8468 while (--fmtcnt >= 0) {
Stefan Krah0b9201f2010-07-19 18:06:46 +00008469 c = *fmt++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008470 if (c < '0' || c > '9')
8471 break;
Mark Dickinson75d36002012-10-28 10:00:46 +00008472 if (prec > (INT_MAX - ((int)c - '0')) / 10) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008473 PyErr_SetString(PyExc_ValueError,
8474 "prec too big");
8475 goto onError;
8476 }
8477 prec = prec*10 + (c - '0');
8478 }
8479 }
8480 } /* prec */
8481 if (fmtcnt >= 0) {
8482 if (c == 'h' || c == 'l' || c == 'L') {
8483 if (--fmtcnt >= 0)
8484 c = *fmt++;
8485 }
8486 }
8487 if (fmtcnt < 0) {
8488 PyErr_SetString(PyExc_ValueError,
8489 "incomplete format");
8490 goto onError;
8491 }
8492 if (c != '%') {
8493 v = getnextarg(args, arglen, &argidx);
8494 if (v == NULL)
8495 goto onError;
8496 }
8497 sign = 0;
8498 fill = ' ';
8499 switch (c) {
8500
8501 case '%':
8502 pbuf = formatbuf;
8503 /* presume that buffer length is at least 1 */
8504 pbuf[0] = '%';
8505 len = 1;
8506 break;
8507
8508 case 's':
8509 case 'r':
Victor Stinner95affc42010-03-22 12:24:37 +00008510 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008511 temp = v;
8512 Py_INCREF(temp);
8513 }
8514 else {
8515 PyObject *unicode;
8516 if (c == 's')
8517 temp = PyObject_Unicode(v);
8518 else
8519 temp = PyObject_Repr(v);
8520 if (temp == NULL)
8521 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008522 if (PyUnicode_Check(temp))
8523 /* nothing to do */;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008524 else if (PyString_Check(temp)) {
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008525 /* convert to string to Unicode */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008526 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8527 PyString_GET_SIZE(temp),
8528 NULL,
8529 "strict");
8530 Py_DECREF(temp);
8531 temp = unicode;
8532 if (temp == NULL)
8533 goto onError;
8534 }
8535 else {
8536 Py_DECREF(temp);
8537 PyErr_SetString(PyExc_TypeError,
8538 "%s argument has non-string str()");
8539 goto onError;
8540 }
8541 }
8542 pbuf = PyUnicode_AS_UNICODE(temp);
8543 len = PyUnicode_GET_SIZE(temp);
8544 if (prec >= 0 && len > prec)
8545 len = prec;
8546 break;
8547
8548 case 'i':
8549 case 'd':
8550 case 'u':
8551 case 'o':
8552 case 'x':
8553 case 'X':
8554 if (c == 'i')
8555 c = 'd';
8556 isnumok = 0;
8557 if (PyNumber_Check(v)) {
8558 PyObject *iobj=NULL;
8559
8560 if (PyInt_Check(v) || (PyLong_Check(v))) {
8561 iobj = v;
8562 Py_INCREF(iobj);
8563 }
8564 else {
8565 iobj = PyNumber_Int(v);
8566 if (iobj==NULL) iobj = PyNumber_Long(v);
8567 }
8568 if (iobj!=NULL) {
8569 if (PyInt_Check(iobj)) {
8570 isnumok = 1;
8571 pbuf = formatbuf;
8572 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8573 flags, prec, c, iobj);
8574 Py_DECREF(iobj);
8575 if (len < 0)
8576 goto onError;
8577 sign = 1;
8578 }
8579 else if (PyLong_Check(iobj)) {
8580 isnumok = 1;
8581 temp = formatlong(iobj, flags, prec, c);
8582 Py_DECREF(iobj);
8583 if (!temp)
8584 goto onError;
8585 pbuf = PyUnicode_AS_UNICODE(temp);
8586 len = PyUnicode_GET_SIZE(temp);
8587 sign = 1;
8588 }
8589 else {
8590 Py_DECREF(iobj);
8591 }
8592 }
8593 }
8594 if (!isnumok) {
8595 PyErr_Format(PyExc_TypeError,
8596 "%%%c format: a number is required, "
8597 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8598 goto onError;
8599 }
8600 if (flags & F_ZERO)
8601 fill = '0';
8602 break;
8603
8604 case 'e':
8605 case 'E':
8606 case 'f':
8607 case 'F':
8608 case 'g':
8609 case 'G':
Mark Dickinson18cfada2009-11-23 18:46:41 +00008610 temp = formatfloat(v, flags, prec, c);
8611 if (temp == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008612 goto onError;
Mark Dickinson18cfada2009-11-23 18:46:41 +00008613 pbuf = PyUnicode_AS_UNICODE(temp);
8614 len = PyUnicode_GET_SIZE(temp);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008615 sign = 1;
8616 if (flags & F_ZERO)
8617 fill = '0';
8618 break;
8619
8620 case 'c':
8621 pbuf = formatbuf;
8622 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8623 if (len < 0)
8624 goto onError;
8625 break;
8626
8627 default:
8628 PyErr_Format(PyExc_ValueError,
8629 "unsupported format character '%c' (0x%x) "
8630 "at index %zd",
8631 (31<=c && c<=126) ? (char)c : '?',
8632 (int)c,
8633 (Py_ssize_t)(fmt - 1 -
8634 PyUnicode_AS_UNICODE(uformat)));
Benjamin Peterson857ce152009-01-31 16:29:18 +00008635 goto onError;
8636 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008637 if (sign) {
8638 if (*pbuf == '-' || *pbuf == '+') {
8639 sign = *pbuf++;
8640 len--;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008641 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008642 else if (flags & F_SIGN)
8643 sign = '+';
8644 else if (flags & F_BLANK)
8645 sign = ' ';
8646 else
8647 sign = 0;
8648 }
8649 if (width < len)
8650 width = len;
8651 if (rescnt - (sign != 0) < width) {
8652 reslen -= rescnt;
8653 rescnt = width + fmtcnt + 100;
8654 reslen += rescnt;
8655 if (reslen < 0) {
8656 Py_XDECREF(temp);
8657 PyErr_NoMemory();
8658 goto onError;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008659 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008660 if (_PyUnicode_Resize(&result, reslen) < 0) {
8661 Py_XDECREF(temp);
8662 goto onError;
8663 }
8664 res = PyUnicode_AS_UNICODE(result)
8665 + reslen - rescnt;
8666 }
8667 if (sign) {
8668 if (fill != ' ')
8669 *res++ = sign;
8670 rescnt--;
8671 if (width > len)
8672 width--;
8673 }
8674 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8675 assert(pbuf[0] == '0');
8676 assert(pbuf[1] == c);
8677 if (fill != ' ') {
8678 *res++ = *pbuf++;
8679 *res++ = *pbuf++;
8680 }
8681 rescnt -= 2;
8682 width -= 2;
8683 if (width < 0)
8684 width = 0;
8685 len -= 2;
8686 }
8687 if (width > len && !(flags & F_LJUST)) {
8688 do {
8689 --rescnt;
8690 *res++ = fill;
8691 } while (--width > len);
8692 }
8693 if (fill == ' ') {
8694 if (sign)
8695 *res++ = sign;
8696 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8697 assert(pbuf[0] == '0');
8698 assert(pbuf[1] == c);
8699 *res++ = *pbuf++;
8700 *res++ = *pbuf++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008701 }
8702 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008703 Py_UNICODE_COPY(res, pbuf, len);
8704 res += len;
8705 rescnt -= len;
8706 while (--width >= len) {
8707 --rescnt;
8708 *res++ = ' ';
8709 }
8710 if (dict && (argidx < arglen) && c != '%') {
8711 PyErr_SetString(PyExc_TypeError,
8712 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008713 Py_XDECREF(temp);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008714 goto onError;
8715 }
8716 Py_XDECREF(temp);
8717 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008718 } /* until end */
8719 if (argidx < arglen && !dict) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008720 PyErr_SetString(PyExc_TypeError,
8721 "not all arguments converted during string formatting");
8722 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008723 }
8724
Thomas Woutersa96affe2006-03-12 00:29:36 +00008725 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008726 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008727 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008728 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008729 }
8730 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008731 return (PyObject *)result;
8732
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008733 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008734 Py_XDECREF(result);
8735 Py_DECREF(uformat);
8736 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008737 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008738 }
8739 return NULL;
8740}
8741
8742static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008743 (readbufferproc) unicode_buffer_getreadbuf,
8744 (writebufferproc) unicode_buffer_getwritebuf,
8745 (segcountproc) unicode_buffer_getsegcount,
8746 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008747};
8748
Jeremy Hylton938ace62002-07-17 16:30:39 +00008749static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008750unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8751
Tim Peters6d6c1a32001-08-02 04:15:00 +00008752static PyObject *
8753unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8754{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008755 PyObject *x = NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008756 static char *kwlist[] = {"string", "encoding", "errors", 0};
8757 char *encoding = NULL;
8758 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00008759
Benjamin Peterson857ce152009-01-31 16:29:18 +00008760 if (type != &PyUnicode_Type)
8761 return unicode_subtype_new(type, args, kwds);
8762 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008763 kwlist, &x, &encoding, &errors))
Benjamin Peterson857ce152009-01-31 16:29:18 +00008764 return NULL;
8765 if (x == NULL)
8766 return (PyObject *)_PyUnicode_New(0);
8767 if (encoding == NULL && errors == NULL)
8768 return PyObject_Unicode(x);
8769 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008770 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008771}
8772
Guido van Rossume023fe02001-08-30 03:12:59 +00008773static PyObject *
8774unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8775{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008776 PyUnicodeObject *tmp, *pnew;
8777 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008778
Benjamin Peterson857ce152009-01-31 16:29:18 +00008779 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8780 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8781 if (tmp == NULL)
8782 return NULL;
8783 assert(PyUnicode_Check(tmp));
8784 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8785 if (pnew == NULL) {
8786 Py_DECREF(tmp);
8787 return NULL;
8788 }
8789 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8790 if (pnew->str == NULL) {
8791 _Py_ForgetReference((PyObject *)pnew);
8792 PyObject_Del(pnew);
8793 Py_DECREF(tmp);
8794 return PyErr_NoMemory();
8795 }
8796 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8797 pnew->length = n;
8798 pnew->hash = tmp->hash;
8799 Py_DECREF(tmp);
8800 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008801}
8802
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008803PyDoc_STRVAR(unicode_doc,
Chris Jerdonekad4b0002012-10-07 20:37:54 -07008804 "unicode(object='') -> unicode object\n\
8805unicode(string[, encoding[, errors]]) -> unicode object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00008806\n\
8807Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008808encoding defaults to the current default string encoding.\n\
8809errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008810
Guido van Rossumd57fd912000-03-10 22:53:23 +00008811PyTypeObject PyUnicode_Type = {
Martin v. Löwis68192102007-07-21 06:55:02 +00008812 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00008813 "unicode", /* tp_name */
8814 sizeof(PyUnicodeObject), /* tp_size */
8815 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008816 /* Slots */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008817 (destructor)unicode_dealloc, /* tp_dealloc */
8818 0, /* tp_print */
8819 0, /* tp_getattr */
8820 0, /* tp_setattr */
8821 0, /* tp_compare */
8822 unicode_repr, /* tp_repr */
8823 &unicode_as_number, /* tp_as_number */
8824 &unicode_as_sequence, /* tp_as_sequence */
8825 &unicode_as_mapping, /* tp_as_mapping */
8826 (hashfunc) unicode_hash, /* tp_hash*/
8827 0, /* tp_call*/
8828 (reprfunc) unicode_str, /* tp_str */
8829 PyObject_GenericGetAttr, /* tp_getattro */
8830 0, /* tp_setattro */
8831 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008832 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008833 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008834 unicode_doc, /* tp_doc */
8835 0, /* tp_traverse */
8836 0, /* tp_clear */
8837 PyUnicode_RichCompare, /* tp_richcompare */
8838 0, /* tp_weaklistoffset */
8839 0, /* tp_iter */
8840 0, /* tp_iternext */
8841 unicode_methods, /* tp_methods */
8842 0, /* tp_members */
8843 0, /* tp_getset */
8844 &PyBaseString_Type, /* tp_base */
8845 0, /* tp_dict */
8846 0, /* tp_descr_get */
8847 0, /* tp_descr_set */
8848 0, /* tp_dictoffset */
8849 0, /* tp_init */
8850 0, /* tp_alloc */
8851 unicode_new, /* tp_new */
8852 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008853};
8854
8855/* Initialize the Unicode implementation */
8856
Thomas Wouters78890102000-07-22 19:25:51 +00008857void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008858{
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008859 /* XXX - move this array to unicodectype.c ? */
8860 Py_UNICODE linebreak[] = {
8861 0x000A, /* LINE FEED */
8862 0x000D, /* CARRIAGE RETURN */
8863 0x001C, /* FILE SEPARATOR */
8864 0x001D, /* GROUP SEPARATOR */
8865 0x001E, /* RECORD SEPARATOR */
8866 0x0085, /* NEXT LINE */
8867 0x2028, /* LINE SEPARATOR */
8868 0x2029, /* PARAGRAPH SEPARATOR */
8869 };
8870
Fred Drakee4315f52000-05-09 19:53:39 +00008871 /* Init the implementation */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02008872 if (!unicode_empty) {
8873 unicode_empty = _PyUnicode_New(0);
8874 if (!unicode_empty)
8875 return;
8876 }
Neal Norwitze1fdb322006-07-21 05:32:28 +00008877
Guido van Rossumcacfc072002-05-24 19:01:59 +00008878 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008879 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008880
8881 /* initialize the linebreak bloom filter */
8882 bloom_linebreak = make_bloom_mask(
8883 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8884 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00008885
8886 PyType_Ready(&EncodingMapType);
Benjamin Peterson6da3ed62012-10-30 23:21:10 -04008887
8888 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
8889 Py_FatalError("Can't initialize field name iterator type");
8890
8891 if (PyType_Ready(&PyFormatterIter_Type) < 0)
8892 Py_FatalError("Can't initialize formatter iter type");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008893}
8894
8895/* Finalize the Unicode implementation */
8896
Christian Heimes3b718a72008-02-14 12:47:33 +00008897int
8898PyUnicode_ClearFreeList(void)
8899{
8900 int freelist_size = numfree;
8901 PyUnicodeObject *u;
8902
8903 for (u = free_list; u != NULL;) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008904 PyUnicodeObject *v = u;
8905 u = *(PyUnicodeObject **)u;
8906 if (v->str)
8907 PyObject_DEL(v->str);
8908 Py_XDECREF(v->defenc);
8909 PyObject_Del(v);
8910 numfree--;
Christian Heimes3b718a72008-02-14 12:47:33 +00008911 }
8912 free_list = NULL;
8913 assert(numfree == 0);
8914 return freelist_size;
8915}
8916
Guido van Rossumd57fd912000-03-10 22:53:23 +00008917void
Thomas Wouters78890102000-07-22 19:25:51 +00008918_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008919{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008920 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008921
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02008922 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008923
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02008924 for (i = 0; i < 256; i++)
8925 Py_CLEAR(unicode_latin1[i]);
8926
Christian Heimes3b718a72008-02-14 12:47:33 +00008927 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00008928}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008929
Anthony Baxterac6bd462006-04-13 02:06:09 +00008930#ifdef __cplusplus
8931}
8932#endif