blob: 2925651f0e8dfd62334f8114c16a209817156e4b [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson1c5d21d2009-01-31 22:33:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
/* Upper bound on the number of Unicode objects parked on the free list. */

#define PyUnicode_MAXFREELIST 1024

/* Keep-alive threshold for the buffers of free-listed Unicode objects.

   An object that is put on the free list keeps its character buffer
   allocated as long as its length is below this limit.  This saves a
   malloc() round trip when small Unicode objects are recycled.

   In the worst case this leaves PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage lying around.

   A limit of 0 effectively switches the optimization off.

   Note: this is an experimental feature!  If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte-order switches; little endian is assumed unless WORDS_BIGENDIAN
   is defined. */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
Serhiy Storchakac59c85c2013-01-26 12:13:40 +020085NOTE: In the interpreter's initialization phase, some globals are currently
86 initialized dynamically as needed. In the process Unicode objects may
87 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000088
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Anthony Baxterac6bd462006-04-13 02:06:09 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Guido van Rossumd57fd912000-03-10 22:53:23 +000096/* Free list for Unicode objects */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +020097static PyUnicodeObject *free_list = NULL;
98static int numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000099
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000100/* The empty Unicode object is shared to improve performance. */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200101static PyUnicodeObject *unicode_empty = NULL;
102
103#define _Py_RETURN_UNICODE_EMPTY() \
104 do { \
105 if (unicode_empty != NULL) \
106 Py_INCREF(unicode_empty); \
107 else { \
108 unicode_empty = _PyUnicode_New(0); \
109 if (unicode_empty != NULL) \
110 Py_INCREF(unicode_empty); \
111 } \
112 return (PyObject *)unicode_empty; \
113 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000114
115/* Single character Unicode strings in the Latin-1 range are being
116 shared as well. */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200117static PyUnicodeObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000118
Fred Drakee4315f52000-05-09 19:53:39 +0000119/* Default encoding to use and assume when NULL is passed as encoding
120 parameter; it is initialized by _PyUnicode_Init().
121
122 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000123 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000124
125*/
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200126static char unicode_default_encoding[100 + 1] = "ascii";
Fred Drakee4315f52000-05-09 19:53:39 +0000127
/* Lookup table for fast detection of the most frequent whitespace
   characters: entry N is 1 when ASCII code point N counts as whitespace,
   0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
     *   0x0009 CHARACTER TABULATION
     *   0x000A LINE FEED
     *   0x000B LINE TABULATION
     *   0x000C FORM FEED
     *   0x000D CARRIAGE RETURN
     */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
     *   0x001C FILE SEPARATOR
     *   0x001D GROUP SEPARATOR
     *   0x001E RECORD SEPARATOR
     *   0x001F UNIT SEPARATOR
     */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27
     *   0x0020 SPACE
     */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
158
/* Lookup table for fast detection of line breaks: entry N is 1 when ASCII
   code point N is treated as a line break, 0 otherwise. */
static unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
     *   0x000A LINE FEED
     *   0x000B LINE TABULATION
     *   0x000C FORM FEED
     *   0x000D CARRIAGE RETURN
     */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
     *   0x001C FILE SEPARATOR
     *   0x001D GROUP SEPARATOR
     *   0x001E RECORD SEPARATOR
     */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
186
187
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000188Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000189PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000190{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000191#ifdef Py_UNICODE_WIDE
Benjamin Peterson857ce152009-01-31 16:29:18 +0000192 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000193#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000194 /* This is actually an illegal character, so it should
195 not be passed to unichr. */
196 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000197#endif
198}
199
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000200/* --- Bloom Filters ----------------------------------------------------- */
201
202/* stuff to implement simple "bloom filters" for Unicode characters.
203 to keep things simple, we use a single bitmask, using the least 5
204 bits from each unicode characters as the bit index. */
205
206/* the linebreak mask is set up by Unicode_Init below */
207
Antoine Pitrou10042922010-01-13 14:01:26 +0000208#if LONG_BIT >= 128
209#define BLOOM_WIDTH 128
210#elif LONG_BIT >= 64
211#define BLOOM_WIDTH 64
212#elif LONG_BIT >= 32
213#define BLOOM_WIDTH 32
214#else
215#error "LONG_BIT is smaller than 32"
216#endif
217
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000218#define BLOOM_MASK unsigned long
219
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200220static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000221
Antoine Pitrou10042922010-01-13 14:01:26 +0000222#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
223#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000224
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000225#define BLOOM_LINEBREAK(ch) \
226 ((ch) < 128U ? ascii_linebreak[(ch)] : \
227 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000228
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000229Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000230{
231 /* calculate simple bloom-style bitmask for a given unicode string */
232
Antoine Pitrou10042922010-01-13 14:01:26 +0000233 BLOOM_MASK mask;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000234 Py_ssize_t i;
235
236 mask = 0;
237 for (i = 0; i < len; i++)
Antoine Pitrou64672132010-01-13 07:55:48 +0000238 BLOOM_ADD(mask, ptr[i]);
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000239
240 return mask;
241}
242
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000243Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000244{
245 Py_ssize_t i;
246
247 for (i = 0; i < setlen; i++)
248 if (set[i] == chr)
249 return 1;
250
Fredrik Lundh77633512006-05-23 19:47:35 +0000251 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000252}
253
/* Membership test: the bloom mask is a cheap pre-check, and only when it
   matches is the exact (linear) search in `set` performed.  The expansion
   is fully parenthesized so the macro can be negated or combined with
   other operators safely.  Note that `chr` is evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
256
Guido van Rossumd57fd912000-03-10 22:53:23 +0000257/* --- Unicode Object ----------------------------------------------------- */
258
259static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000260int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000261 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262{
263 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000264
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000265 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 if (unicode->length == length)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000267 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000268
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000269 /* Resizing shared object (unicode_empty or single character
270 objects) in-place is not allowed. Use PyUnicode_Resize()
271 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000272
Benjamin Peterson857ce152009-01-31 16:29:18 +0000273 if (unicode == unicode_empty ||
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000274 (unicode->length == 1 &&
275 unicode->str[0] < 256U &&
276 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000278 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279 return -1;
280 }
281
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000282 /* We allocate one more byte to make sure the string is Ux0000 terminated.
283 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000284 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000285 it contains). */
286
Guido van Rossumd57fd912000-03-10 22:53:23 +0000287 oldstr = unicode->str;
Neal Norwitz419fd492008-03-17 20:22:43 +0000288 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000289 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000291 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000292 PyErr_NoMemory();
293 return -1;
294 }
295 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000296 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000298 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000300 if (unicode->defenc) {
Georg Brandld070cc52010-08-01 21:06:46 +0000301 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000302 }
303 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000304
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305 return 0;
306}
307
308/* We allocate one more byte to make sure the string is
Georg Brandle27d0442010-08-01 20:54:30 +0000309 Ux0000 terminated; some code relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000310
311 XXX This allocator could further be enhanced by assuring that the
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000312 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000313
314*/
315
316static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000317PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000318{
319 register PyUnicodeObject *unicode;
320
Andrew Dalkee0df7622006-05-27 11:04:36 +0000321 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000322 if (length == 0 && unicode_empty != NULL) {
323 Py_INCREF(unicode_empty);
324 return unicode_empty;
325 }
326
Neal Norwitze7d8be82008-07-31 17:17:14 +0000327 /* Ensure we won't overflow the size. */
328 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
329 return (PyUnicodeObject *)PyErr_NoMemory();
330 }
331
Guido van Rossumd57fd912000-03-10 22:53:23 +0000332 /* Unicode freelist & memory allocation */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000333 if (free_list) {
334 unicode = free_list;
335 free_list = *(PyUnicodeObject **)unicode;
336 numfree--;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000337 if (unicode->str) {
338 /* Keep-Alive optimization: we only upsize the buffer,
339 never downsize it. */
340 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000341 unicode_resize(unicode, length) < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000342 PyObject_DEL(unicode->str);
343 unicode->str = NULL;
344 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000345 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000346 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000347 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
348 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000349 }
350 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000351 }
352 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000353 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000354 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000355 if (unicode == NULL)
356 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000357 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
358 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000359 }
360
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000361 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000362 PyErr_NoMemory();
363 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000364 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000365 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000366 * the caller fails before initializing str -- unicode_resize()
367 * reads str[0], and the Keep-Alive optimization can keep memory
368 * allocated for str alive across a call to unicode_dealloc(unicode).
369 * We don't want unicode_resize to read uninitialized memory in
370 * that case.
371 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000372 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000373 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000374 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000375 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000376 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000377 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000378
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000379 onError:
Amaury Forgeot d'Arc06847b12008-07-31 23:39:05 +0000380 /* XXX UNREF/NEWREF interface should be more symmetrical */
381 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000382 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000383 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000384 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000385}
386
387static
Guido van Rossum9475a232001-10-05 20:51:39 +0000388void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000390 if (PyUnicode_CheckExact(unicode) &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000391 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000392 /* Keep-Alive optimization */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000393 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
394 PyObject_DEL(unicode->str);
395 unicode->str = NULL;
396 unicode->length = 0;
397 }
398 if (unicode->defenc) {
Georg Brandld070cc52010-08-01 21:06:46 +0000399 Py_CLEAR(unicode->defenc);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000400 }
401 /* Add to free list */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000402 *(PyUnicodeObject **)unicode = free_list;
403 free_list = unicode;
404 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000405 }
406 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000407 PyObject_DEL(unicode->str);
408 Py_XDECREF(unicode->defenc);
409 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000410 }
411}
412
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000413static
414int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000415{
416 register PyUnicodeObject *v;
417
418 /* Argument checks */
419 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000420 PyErr_BadInternalCall();
421 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000422 }
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000423 v = *unicode;
Christian Heimese93237d2007-12-19 02:37:44 +0000424 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000425 PyErr_BadInternalCall();
426 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000427 }
428
429 /* Resizing unicode_empty and single character objects is not
430 possible since these are being shared. We simply return a fresh
431 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000432 if (v->length != length &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000433 (v == unicode_empty || v->length == 1)) {
434 PyUnicodeObject *w = _PyUnicode_New(length);
435 if (w == NULL)
436 return -1;
437 Py_UNICODE_COPY(w->str, v->str,
438 length < v->length ? length : v->length);
439 Py_DECREF(*unicode);
440 *unicode = w;
441 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000442 }
443
444 /* Note that we don't have to modify *unicode for unshared Unicode
445 objects, since we can modify them in-place. */
446 return unicode_resize(v, length);
447}
448
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000449int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
450{
451 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
452}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000453
Guido van Rossumd57fd912000-03-10 22:53:23 +0000454PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000455 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000456{
457 PyUnicodeObject *unicode;
458
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000459 /* If the Unicode data is known at construction time, we can apply
460 some optimizations which share commonly used objects. */
461 if (u != NULL) {
462
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000463 /* Optimization for empty strings */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200464 if (size == 0)
465 _Py_RETURN_UNICODE_EMPTY();
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000466
467 /* Single character Unicode objects in the Latin-1 range are
468 shared when using this constructor */
469 if (size == 1 && *u < 256) {
470 unicode = unicode_latin1[*u];
471 if (!unicode) {
472 unicode = _PyUnicode_New(1);
473 if (!unicode)
474 return NULL;
475 unicode->str[0] = *u;
476 unicode_latin1[*u] = unicode;
477 }
478 Py_INCREF(unicode);
479 return (PyObject *)unicode;
480 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000481 }
Tim Petersced69f82003-09-16 20:30:58 +0000482
Guido van Rossumd57fd912000-03-10 22:53:23 +0000483 unicode = _PyUnicode_New(size);
484 if (!unicode)
485 return NULL;
486
487 /* Copy the Unicode data into the new object */
488 if (u != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000489 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000490
491 return (PyObject *)unicode;
492}
493
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000494PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
495{
496 PyUnicodeObject *unicode;
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000497
Benjamin Peterson857ce152009-01-31 16:29:18 +0000498 if (size < 0) {
499 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000500 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson857ce152009-01-31 16:29:18 +0000501 return NULL;
502 }
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000503
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000504 /* If the Unicode data is known at construction time, we can apply
505 some optimizations which share commonly used objects.
506 Also, this means the input must be UTF-8, so fall back to the
507 UTF-8 decoder at the end. */
508 if (u != NULL) {
509
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000510 /* Optimization for empty strings */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200511 if (size == 0)
512 _Py_RETURN_UNICODE_EMPTY();
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000513
514 /* Single characters are shared when using this constructor.
515 Restrict to ASCII, since the input must be UTF-8. */
516 if (size == 1 && Py_CHARMASK(*u) < 128) {
517 unicode = unicode_latin1[Py_CHARMASK(*u)];
518 if (!unicode) {
519 unicode = _PyUnicode_New(1);
520 if (!unicode)
521 return NULL;
522 unicode->str[0] = Py_CHARMASK(*u);
523 unicode_latin1[Py_CHARMASK(*u)] = unicode;
524 }
525 Py_INCREF(unicode);
526 return (PyObject *)unicode;
527 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000528
529 return PyUnicode_DecodeUTF8(u, size, NULL);
530 }
531
532 unicode = _PyUnicode_New(size);
533 if (!unicode)
534 return NULL;
535
536 return (PyObject *)unicode;
537}
538
539PyObject *PyUnicode_FromString(const char *u)
540{
541 size_t size = strlen(u);
542 if (size > PY_SSIZE_T_MAX) {
543 PyErr_SetString(PyExc_OverflowError, "input too long");
544 return NULL;
545 }
546
547 return PyUnicode_FromStringAndSize(u, size);
548}
549
/* _Py_UNICODE_NEXT is a private macro used to retrieve the character pointed
 * by 'ptr', possibly combining surrogate pairs on narrow builds.
 * 'ptr' and 'end' must be Py_UNICODE*, with 'ptr' pointing at the character
 * that should be returned and 'end' pointing to the end of the buffer
 * (one past the last character).
 * ('end' is used on narrow builds to detect a lone surrogate at the
 * end of the buffer that should be returned unchanged: a high surrogate
 * is only joined with the following unit when that unit lies before 'end'.)
 * The ptr and end arguments should be side-effect free and ptr must be an
 * lvalue.
 * The type of the returned char is always Py_UCS4.
 *
 * Note: the macro advances ptr to next char, so it might have side-effects
 * (especially if used with other macros).
 */

/* helper macros used by _Py_UNICODE_NEXT */
#define _Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define _Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value. */
#define _Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)

#ifdef Py_UNICODE_WIDE
#define _Py_UNICODE_NEXT(ptr, end) ((Py_UCS4)*(ptr)++)
#else
/* The bound check comes first and covers (ptr)[1], so nothing at or past
   'end' is ever read or consumed as the second half of a pair. */
#define _Py_UNICODE_NEXT(ptr, end)                                      \
     ((((ptr) + 1 < (end)) &&                                           \
       _Py_UNICODE_IS_HIGH_SURROGATE(*(ptr)) &&                         \
       _Py_UNICODE_IS_LOW_SURROGATE((ptr)[1])) ?                        \
      ((ptr) += 2, _Py_UNICODE_JOIN_SURROGATES((ptr)[-2], (ptr)[-1])) : \
      (Py_UCS4)*(ptr)++)
#endif
580
Guido van Rossumd57fd912000-03-10 22:53:23 +0000581#ifdef HAVE_WCHAR_H
582
Mark Dickinson6b265f12009-03-18 16:07:26 +0000583#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
584# define CONVERT_WCHAR_TO_SURROGATES
585#endif
586
587#ifdef CONVERT_WCHAR_TO_SURROGATES
588
589/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
590 to convert from UTF32 to UTF16. */
591
592PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
593 Py_ssize_t size)
594{
595 PyUnicodeObject *unicode;
596 register Py_ssize_t i;
597 Py_ssize_t alloc;
598 const wchar_t *orig_w;
599
600 if (w == NULL) {
601 PyErr_BadInternalCall();
602 return NULL;
603 }
604
605 alloc = size;
606 orig_w = w;
607 for (i = size; i > 0; i--) {
608 if (*w > 0xFFFF)
609 alloc++;
610 w++;
611 }
612 w = orig_w;
613 unicode = _PyUnicode_New(alloc);
614 if (!unicode)
615 return NULL;
616
617 /* Copy the wchar_t data into the new object */
618 {
619 register Py_UNICODE *u;
620 u = PyUnicode_AS_UNICODE(unicode);
621 for (i = size; i > 0; i--) {
622 if (*w > 0xFFFF) {
623 wchar_t ordinal = *w++;
624 ordinal -= 0x10000;
625 *u++ = 0xD800 | (ordinal >> 10);
626 *u++ = 0xDC00 | (ordinal & 0x3FF);
627 }
628 else
629 *u++ = *w++;
630 }
631 }
632 return (PyObject *)unicode;
633}
634
635#else
636
Guido van Rossumd57fd912000-03-10 22:53:23 +0000637PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000638 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000639{
640 PyUnicodeObject *unicode;
641
642 if (w == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000643 PyErr_BadInternalCall();
644 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000645 }
646
647 unicode = _PyUnicode_New(size);
648 if (!unicode)
649 return NULL;
650
651 /* Copy the wchar_t data into the new object */
652#ifdef HAVE_USABLE_WCHAR_T
653 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000654#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000655 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000656 register Py_UNICODE *u;
657 register Py_ssize_t i;
658 u = PyUnicode_AS_UNICODE(unicode);
659 for (i = size; i > 0; i--)
660 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000661 }
662#endif
663
664 return (PyObject *)unicode;
665}
666
Mark Dickinson6b265f12009-03-18 16:07:26 +0000667#endif /* CONVERT_WCHAR_TO_SURROGATES */
668
669#undef CONVERT_WCHAR_TO_SURROGATES
670
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000671static void
672makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
673{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000674 *fmt++ = '%';
675 if (width) {
676 if (zeropad)
677 *fmt++ = '0';
678 fmt += sprintf(fmt, "%d", width);
679 }
680 if (precision)
681 fmt += sprintf(fmt, ".%d", precision);
682 if (longflag)
683 *fmt++ = 'l';
684 else if (size_tflag) {
685 char *f = PY_FORMAT_SIZE_T;
686 while (*f)
687 *fmt++ = *f++;
688 }
689 *fmt++ = c;
690 *fmt = '\0';
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000691}
692
/* Append the NUL-terminated byte string `string` to the output, one
   character per byte (each byte is taken as an unsigned char).  Relies on
   two variables of the enclosing scope: `copy` (scratch pointer, left at
   the terminating NUL) and `s` (output cursor, advanced past the written
   characters). */
#define appendstring(string)                    \
    do {                                        \
        copy = string;                          \
        while (*copy) {                         \
            *s++ = (unsigned char)*copy;        \
            copy++;                             \
        }                                       \
    } while (0)
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000699
700PyObject *
701PyUnicode_FromFormatV(const char *format, va_list vargs)
702{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000703 va_list count;
704 Py_ssize_t callcount = 0;
705 PyObject **callresults = NULL;
706 PyObject **callresult = NULL;
707 Py_ssize_t n = 0;
708 int width = 0;
709 int precision = 0;
710 int zeropad;
711 const char* f;
712 Py_UNICODE *s;
713 PyObject *string;
714 /* used by sprintf */
715 char buffer[21];
716 /* use abuffer instead of buffer, if we need more space
717 * (which can happen if there's a format specifier with width). */
718 char *abuffer = NULL;
719 char *realbuffer;
720 Py_ssize_t abuffersize = 0;
721 char fmt[60]; /* should be enough for %0width.precisionld */
722 const char *copy;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000723
724#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson857ce152009-01-31 16:29:18 +0000725 Py_MEMCPY(count, vargs, sizeof(va_list));
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000726#else
727#ifdef __va_copy
Benjamin Peterson857ce152009-01-31 16:29:18 +0000728 __va_copy(count, vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000729#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000730 count = vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000731#endif
732#endif
Walter Dörwalded960ac2009-05-03 22:36:33 +0000733 /* step 1: count the number of %S/%R/%s format specifications
734 * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
735 * objects once during step 3 and put the result in an array) */
Benjamin Peterson857ce152009-01-31 16:29:18 +0000736 for (f = format; *f; f++) {
Walter Dörwalded960ac2009-05-03 22:36:33 +0000737 if (*f == '%') {
Serhiy Storchaka0e0282e2015-01-27 22:17:56 +0200738 f++;
739 while (*f && *f != '%' && !isalpha((unsigned)*f))
740 f++;
Serhiy Storchaka227526d2015-01-31 01:15:29 +0200741 if (!*f)
742 break;
Serhiy Storchaka0e0282e2015-01-27 22:17:56 +0200743 if (*f == 's' || *f=='S' || *f=='R')
Walter Dörwalded960ac2009-05-03 22:36:33 +0000744 ++callcount;
745 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000746 }
747 /* step 2: allocate memory for the results of
Walter Dörwalded960ac2009-05-03 22:36:33 +0000748 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson857ce152009-01-31 16:29:18 +0000749 if (callcount) {
750 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
751 if (!callresults) {
752 PyErr_NoMemory();
753 return NULL;
754 }
755 callresult = callresults;
756 }
757 /* step 3: figure out how large a buffer we need */
758 for (f = format; *f; f++) {
759 if (*f == '%') {
Serhiy Storchaka0e0282e2015-01-27 22:17:56 +0200760 const char* p = f++;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000761 width = 0;
762 while (isdigit((unsigned)*f))
763 width = (width*10) + *f++ - '0';
Serhiy Storchaka0e0282e2015-01-27 22:17:56 +0200764 precision = 0;
765 if (*f == '.') {
766 f++;
767 while (isdigit((unsigned)*f))
768 precision = (precision*10) + *f++ - '0';
769 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000770
Benjamin Peterson857ce152009-01-31 16:29:18 +0000771 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
772 * they don't affect the amount of space we reserve.
773 */
774 if ((*f == 'l' || *f == 'z') &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000775 (f[1] == 'd' || f[1] == 'u'))
776 ++f;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000777
Benjamin Peterson857ce152009-01-31 16:29:18 +0000778 switch (*f) {
779 case 'c':
Serhiy Storchakaba908c72013-06-23 20:22:09 +0300780 {
781 int ordinal = va_arg(count, int);
782#ifdef Py_UNICODE_WIDE
783 if (ordinal < 0 || ordinal > 0x10ffff) {
784 PyErr_SetString(PyExc_OverflowError,
785 "%c arg not in range(0x110000) "
786 "(wide Python build)");
787 goto fail;
788 }
789#else
790 if (ordinal < 0 || ordinal > 0xffff) {
791 PyErr_SetString(PyExc_OverflowError,
792 "%c arg not in range(0x10000) "
793 "(narrow Python build)");
794 goto fail;
795 }
796#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +0000797 /* fall through... */
Serhiy Storchakaba908c72013-06-23 20:22:09 +0300798 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000799 case '%':
800 n++;
801 break;
802 case 'd': case 'u': case 'i': case 'x':
803 (void) va_arg(count, int);
Serhiy Storchaka0e0282e2015-01-27 22:17:56 +0200804 if (width < precision)
805 width = precision;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000806 /* 20 bytes is enough to hold a 64-bit
807 integer. Decimal takes the most space.
808 This isn't enough for octal.
809 If a width is specified we need more
810 (which we allocate later). */
811 if (width < 20)
812 width = 20;
813 n += width;
814 if (abuffersize < width)
815 abuffersize = width;
816 break;
817 case 's':
818 {
819 /* UTF-8 */
Georg Brandlba68a992009-05-05 09:19:43 +0000820 const char *s = va_arg(count, const char*);
Walter Dörwalded960ac2009-05-03 22:36:33 +0000821 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
822 if (!str)
823 goto fail;
824 n += PyUnicode_GET_SIZE(str);
825 /* Remember the str and switch to the next slot */
826 *callresult++ = str;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000827 break;
828 }
829 case 'U':
830 {
831 PyObject *obj = va_arg(count, PyObject *);
832 assert(obj && PyUnicode_Check(obj));
833 n += PyUnicode_GET_SIZE(obj);
834 break;
835 }
836 case 'V':
837 {
838 PyObject *obj = va_arg(count, PyObject *);
839 const char *str = va_arg(count, const char *);
840 assert(obj || str);
841 assert(!obj || PyUnicode_Check(obj));
842 if (obj)
843 n += PyUnicode_GET_SIZE(obj);
844 else
845 n += strlen(str);
846 break;
847 }
848 case 'S':
849 {
850 PyObject *obj = va_arg(count, PyObject *);
851 PyObject *str;
852 assert(obj);
853 str = PyObject_Str(obj);
854 if (!str)
855 goto fail;
Victor Stinner2af8d2f2014-07-30 00:39:05 +0200856 n += PyString_GET_SIZE(str);
Benjamin Peterson857ce152009-01-31 16:29:18 +0000857 /* Remember the str and switch to the next slot */
858 *callresult++ = str;
859 break;
860 }
861 case 'R':
862 {
863 PyObject *obj = va_arg(count, PyObject *);
864 PyObject *repr;
865 assert(obj);
866 repr = PyObject_Repr(obj);
867 if (!repr)
868 goto fail;
869 n += PyUnicode_GET_SIZE(repr);
870 /* Remember the repr and switch to the next slot */
871 *callresult++ = repr;
872 break;
873 }
874 case 'p':
875 (void) va_arg(count, int);
876 /* maximum 64-bit pointer representation:
877 * 0xffffffffffffffff
878 * so 19 characters is enough.
879 * XXX I count 18 -- what's the extra for?
880 */
881 n += 19;
882 break;
883 default:
884 /* if we stumble upon an unknown
885 formatting code, copy the rest of
886 the format string to the output
887 string. (we cannot just skip the
888 code, since there's no way to know
889 what's in the argument list) */
890 n += strlen(p);
891 goto expand;
892 }
893 } else
894 n++;
895 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000896 expand:
Benjamin Peterson857ce152009-01-31 16:29:18 +0000897 if (abuffersize > 20) {
Serhiy Storchaka5ec0bbf2015-01-30 23:35:03 +0200898 /* add 1 for sprintf's trailing null byte */
899 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson857ce152009-01-31 16:29:18 +0000900 if (!abuffer) {
901 PyErr_NoMemory();
902 goto fail;
903 }
904 realbuffer = abuffer;
905 }
906 else
907 realbuffer = buffer;
908 /* step 4: fill the buffer */
909 /* Since we've analyzed how much space we need for the worst case,
910 we don't have to resize the string.
911 There can be no errors beyond this point. */
912 string = PyUnicode_FromUnicode(NULL, n);
913 if (!string)
914 goto fail;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000915
Benjamin Peterson857ce152009-01-31 16:29:18 +0000916 s = PyUnicode_AS_UNICODE(string);
917 callresult = callresults;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000918
Benjamin Peterson857ce152009-01-31 16:29:18 +0000919 for (f = format; *f; f++) {
920 if (*f == '%') {
921 const char* p = f++;
922 int longflag = 0;
923 int size_tflag = 0;
924 zeropad = (*f == '0');
925 /* parse the width.precision part */
926 width = 0;
927 while (isdigit((unsigned)*f))
928 width = (width*10) + *f++ - '0';
929 precision = 0;
930 if (*f == '.') {
931 f++;
932 while (isdigit((unsigned)*f))
933 precision = (precision*10) + *f++ - '0';
934 }
935 /* handle the long flag, but only for %ld and %lu.
936 others can be added when necessary. */
937 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
938 longflag = 1;
939 ++f;
940 }
941 /* handle the size_t flag. */
942 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
943 size_tflag = 1;
944 ++f;
945 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000946
Benjamin Peterson857ce152009-01-31 16:29:18 +0000947 switch (*f) {
948 case 'c':
949 *s++ = va_arg(vargs, int);
950 break;
951 case 'd':
952 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
953 if (longflag)
954 sprintf(realbuffer, fmt, va_arg(vargs, long));
955 else if (size_tflag)
956 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
957 else
958 sprintf(realbuffer, fmt, va_arg(vargs, int));
959 appendstring(realbuffer);
960 break;
961 case 'u':
962 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
963 if (longflag)
964 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
965 else if (size_tflag)
966 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
967 else
968 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
969 appendstring(realbuffer);
970 break;
971 case 'i':
972 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
973 sprintf(realbuffer, fmt, va_arg(vargs, int));
974 appendstring(realbuffer);
975 break;
976 case 'x':
977 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
978 sprintf(realbuffer, fmt, va_arg(vargs, int));
979 appendstring(realbuffer);
980 break;
981 case 's':
982 {
Walter Dörwalded960ac2009-05-03 22:36:33 +0000983 /* unused, since we already have the result */
984 (void) va_arg(vargs, char *);
985 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
986 PyUnicode_GET_SIZE(*callresult));
987 s += PyUnicode_GET_SIZE(*callresult);
988 /* We're done with the unicode()/repr() => forget it */
989 Py_DECREF(*callresult);
990 /* switch to next unicode()/repr() result */
991 ++callresult;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000992 break;
993 }
994 case 'U':
995 {
996 PyObject *obj = va_arg(vargs, PyObject *);
997 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
998 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
999 s += size;
1000 break;
1001 }
1002 case 'V':
1003 {
1004 PyObject *obj = va_arg(vargs, PyObject *);
1005 const char *str = va_arg(vargs, const char *);
1006 if (obj) {
1007 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1008 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1009 s += size;
1010 } else {
1011 appendstring(str);
1012 }
1013 break;
1014 }
1015 case 'S':
1016 case 'R':
1017 {
Victor Stinner2af8d2f2014-07-30 00:39:05 +02001018 const char *str = PyString_AS_STRING(*callresult);
Benjamin Peterson857ce152009-01-31 16:29:18 +00001019 /* unused, since we already have the result */
1020 (void) va_arg(vargs, PyObject *);
Victor Stinner2af8d2f2014-07-30 00:39:05 +02001021 appendstring(str);
Benjamin Peterson857ce152009-01-31 16:29:18 +00001022 /* We're done with the unicode()/repr() => forget it */
1023 Py_DECREF(*callresult);
1024 /* switch to next unicode()/repr() result */
1025 ++callresult;
1026 break;
1027 }
1028 case 'p':
1029 sprintf(buffer, "%p", va_arg(vargs, void*));
1030 /* %p is ill-defined: ensure leading 0x. */
1031 if (buffer[1] == 'X')
1032 buffer[1] = 'x';
1033 else if (buffer[1] != 'x') {
1034 memmove(buffer+2, buffer, strlen(buffer)+1);
1035 buffer[0] = '0';
1036 buffer[1] = 'x';
1037 }
1038 appendstring(buffer);
1039 break;
1040 case '%':
1041 *s++ = '%';
1042 break;
1043 default:
1044 appendstring(p);
1045 goto end;
1046 }
1047 } else
1048 *s++ = *f;
1049 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001050
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001051 end:
Benjamin Peterson857ce152009-01-31 16:29:18 +00001052 if (callresults)
1053 PyObject_Free(callresults);
1054 if (abuffer)
1055 PyObject_Free(abuffer);
1056 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1057 return string;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001058 fail:
Benjamin Peterson857ce152009-01-31 16:29:18 +00001059 if (callresults) {
1060 PyObject **callresult2 = callresults;
1061 while (callresult2 < callresult) {
1062 Py_DECREF(*callresult2);
1063 ++callresult2;
1064 }
1065 PyObject_Free(callresults);
1066 }
1067 if (abuffer)
1068 PyObject_Free(abuffer);
1069 return NULL;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001070}
1071
1072#undef appendstring
1073
1074PyObject *
1075PyUnicode_FromFormat(const char *format, ...)
1076{
Benjamin Peterson857ce152009-01-31 16:29:18 +00001077 PyObject* ret;
1078 va_list vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001079
1080#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson857ce152009-01-31 16:29:18 +00001081 va_start(vargs, format);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001082#else
Benjamin Peterson857ce152009-01-31 16:29:18 +00001083 va_start(vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001084#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00001085 ret = PyUnicode_FromFormatV(format, vargs);
1086 va_end(vargs);
1087 return ret;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001088}
1089
Martin v. Löwis18e16552006-02-15 17:27:45 +00001090Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001091 wchar_t *w,
1092 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001093{
1094 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001095 PyErr_BadInternalCall();
1096 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001097 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001098
1099 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001100 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001101 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001102
Guido van Rossumd57fd912000-03-10 22:53:23 +00001103#ifdef HAVE_USABLE_WCHAR_T
1104 memcpy(w, unicode->str, size * sizeof(wchar_t));
1105#else
1106 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001107 register Py_UNICODE *u;
1108 register Py_ssize_t i;
1109 u = PyUnicode_AS_UNICODE(unicode);
1110 for (i = size; i > 0; i--)
1111 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001112 }
1113#endif
1114
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001115 if (size > PyUnicode_GET_SIZE(unicode))
1116 return PyUnicode_GET_SIZE(unicode);
1117 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001118 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001119}
1120
1121#endif
1122
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001123PyObject *PyUnicode_FromOrdinal(int ordinal)
1124{
Hye-Shik Chang40574832004-04-06 07:24:51 +00001125 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001126
1127#ifdef Py_UNICODE_WIDE
1128 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001129 PyErr_SetString(PyExc_ValueError,
1130 "unichr() arg not in range(0x110000) "
1131 "(wide Python build)");
1132 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001133 }
1134#else
1135 if (ordinal < 0 || ordinal > 0xffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001136 PyErr_SetString(PyExc_ValueError,
1137 "unichr() arg not in range(0x10000) "
1138 "(narrow Python build)");
1139 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001140 }
1141#endif
1142
Hye-Shik Chang40574832004-04-06 07:24:51 +00001143 s[0] = (Py_UNICODE)ordinal;
1144 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001145}
1146
Guido van Rossumd57fd912000-03-10 22:53:23 +00001147PyObject *PyUnicode_FromObject(register PyObject *obj)
1148{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001149 /* XXX Perhaps we should make this API an alias of
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001150 PyObject_Unicode() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001151 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001152 Py_INCREF(obj);
1153 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001154 }
1155 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001156 /* For a Unicode subtype that's not a Unicode object,
1157 return a true Unicode object with the same data. */
1158 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1159 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001160 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001161 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1162}
1163
1164PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001165 const char *encoding,
1166 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001167{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001168 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001169 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001170 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001171
Guido van Rossumd57fd912000-03-10 22:53:23 +00001172 if (obj == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001173 PyErr_BadInternalCall();
1174 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001175 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001176
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001177#if 0
1178 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001179 that no encodings is given and then redirect to
1180 PyObject_Unicode() which then applies the additional logic for
1181 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001182
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001183 NOTE: This API should really only be used for object which
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001184 represent *encoded* Unicode !
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001185
1186 */
Benjamin Peterson857ce152009-01-31 16:29:18 +00001187 if (PyUnicode_Check(obj)) {
1188 if (encoding) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001189 PyErr_SetString(PyExc_TypeError,
1190 "decoding Unicode is not supported");
1191 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001192 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001193 return PyObject_Unicode(obj);
1194 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001195#else
1196 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001197 PyErr_SetString(PyExc_TypeError,
1198 "decoding Unicode is not supported");
1199 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001200 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001201#endif
1202
1203 /* Coerce object */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001204 if (PyString_Check(obj)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001205 s = PyString_AS_STRING(obj);
1206 len = PyString_GET_SIZE(obj);
Christian Heimes1a6387e2008-03-26 12:49:49 +00001207 }
Christian Heimes3497f942008-05-26 12:29:14 +00001208 else if (PyByteArray_Check(obj)) {
Christian Heimes1a6387e2008-03-26 12:49:49 +00001209 /* Python 2.x specific */
1210 PyErr_Format(PyExc_TypeError,
1211 "decoding bytearray is not supported");
1212 return NULL;
1213 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001214 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001215 /* Overwrite the error message with something more useful in
1216 case of a TypeError. */
1217 if (PyErr_ExceptionMatches(PyExc_TypeError))
1218 PyErr_Format(PyExc_TypeError,
1219 "coercing to Unicode: need string or buffer, "
1220 "%.80s found",
1221 Py_TYPE(obj)->tp_name);
1222 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001223 }
Tim Petersced69f82003-09-16 20:30:58 +00001224
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001225 /* Convert to Unicode */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02001226 if (len == 0)
1227 _Py_RETURN_UNICODE_EMPTY();
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001228
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02001229 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001230 return v;
1231
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001232 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001233 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001234}
1235
1236PyObject *PyUnicode_Decode(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001237 Py_ssize_t size,
1238 const char *encoding,
1239 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001240{
1241 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001242
1243 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001244 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001245
1246 /* Shortcuts for common default encodings */
1247 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001248 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001249 else if (strcmp(encoding, "latin-1") == 0)
1250 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001251#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1252 else if (strcmp(encoding, "mbcs") == 0)
1253 return PyUnicode_DecodeMBCS(s, size, errors);
1254#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001255 else if (strcmp(encoding, "ascii") == 0)
1256 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001257
1258 /* Decode via the codec registry */
1259 buffer = PyBuffer_FromMemory((void *)s, size);
1260 if (buffer == NULL)
1261 goto onError;
Serhiy Storchakac7797dc2015-05-31 20:21:00 +03001262 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001263 if (unicode == NULL)
1264 goto onError;
1265 if (!PyUnicode_Check(unicode)) {
1266 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001267 "decoder did not return an unicode object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001268 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001269 Py_DECREF(unicode);
1270 goto onError;
1271 }
1272 Py_DECREF(buffer);
1273 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001274
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001275 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001276 Py_XDECREF(buffer);
1277 return NULL;
1278}
1279
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001280PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1281 const char *encoding,
1282 const char *errors)
1283{
1284 PyObject *v;
1285
1286 if (!PyUnicode_Check(unicode)) {
1287 PyErr_BadArgument();
1288 goto onError;
1289 }
1290
Serhiy Storchakae37003e2015-12-03 20:47:48 +02001291 if (PyErr_WarnPy3k("decoding Unicode is not supported in 3.x", 1) < 0)
1292 goto onError;
1293
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001294 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001295 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001296
1297 /* Decode via the codec registry */
Serhiy Storchakac7797dc2015-05-31 20:21:00 +03001298 v = _PyCodec_DecodeText(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001299 if (v == NULL)
1300 goto onError;
1301 return v;
1302
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001303 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001304 return NULL;
1305}
1306
Guido van Rossumd57fd912000-03-10 22:53:23 +00001307PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001308 Py_ssize_t size,
1309 const char *encoding,
1310 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001311{
1312 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001313
Guido van Rossumd57fd912000-03-10 22:53:23 +00001314 unicode = PyUnicode_FromUnicode(s, size);
1315 if (unicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001316 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001317 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1318 Py_DECREF(unicode);
1319 return v;
1320}
1321
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001322PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1323 const char *encoding,
1324 const char *errors)
1325{
1326 PyObject *v;
1327
1328 if (!PyUnicode_Check(unicode)) {
1329 PyErr_BadArgument();
1330 goto onError;
1331 }
1332
1333 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001334 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001335
1336 /* Encode via the codec registry */
Serhiy Storchakac7797dc2015-05-31 20:21:00 +03001337 v = _PyCodec_EncodeText(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001338 if (v == NULL)
1339 goto onError;
1340 return v;
1341
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001342 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001343 return NULL;
1344}
1345
Guido van Rossumd57fd912000-03-10 22:53:23 +00001346PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1347 const char *encoding,
1348 const char *errors)
1349{
1350 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001351
Guido van Rossumd57fd912000-03-10 22:53:23 +00001352 if (!PyUnicode_Check(unicode)) {
1353 PyErr_BadArgument();
1354 goto onError;
1355 }
Fred Drakee4315f52000-05-09 19:53:39 +00001356
Tim Petersced69f82003-09-16 20:30:58 +00001357 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001358 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001359
1360 /* Shortcuts for common default encodings */
1361 if (errors == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001362 if (strcmp(encoding, "utf-8") == 0)
1363 return PyUnicode_AsUTF8String(unicode);
1364 else if (strcmp(encoding, "latin-1") == 0)
1365 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001366#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001367 else if (strcmp(encoding, "mbcs") == 0)
1368 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001369#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001370 else if (strcmp(encoding, "ascii") == 0)
1371 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001372 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001373
1374 /* Encode via the codec registry */
Serhiy Storchakac7797dc2015-05-31 20:21:00 +03001375 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001376 if (v == NULL)
1377 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001378 if (!PyString_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001379 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001380 "encoder did not return a string object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001381 Py_TYPE(v)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001382 Py_DECREF(v);
1383 goto onError;
1384 }
1385 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001386
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001387 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001388 return NULL;
1389}
1390
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001391PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001392 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001393{
1394 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1395
1396 if (v)
1397 return v;
1398 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1399 if (v && errors == NULL)
1400 ((PyUnicodeObject *)unicode)->defenc = v;
1401 return v;
1402}
1403
Guido van Rossumd57fd912000-03-10 22:53:23 +00001404Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1405{
1406 if (!PyUnicode_Check(unicode)) {
1407 PyErr_BadArgument();
1408 goto onError;
1409 }
1410 return PyUnicode_AS_UNICODE(unicode);
1411
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001412 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001413 return NULL;
1414}
1415
Martin v. Löwis18e16552006-02-15 17:27:45 +00001416Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001417{
1418 if (!PyUnicode_Check(unicode)) {
1419 PyErr_BadArgument();
1420 goto onError;
1421 }
1422 return PyUnicode_GET_SIZE(unicode);
1423
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001424 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001425 return -1;
1426}
1427
Thomas Wouters78890102000-07-22 19:25:51 +00001428const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001429{
1430 return unicode_default_encoding;
1431}
1432
1433int PyUnicode_SetDefaultEncoding(const char *encoding)
1434{
1435 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001436
Fred Drakee4315f52000-05-09 19:53:39 +00001437 /* Make sure the encoding is valid. As side effect, this also
1438 loads the encoding into the codec registry cache. */
1439 v = _PyCodec_Lookup(encoding);
1440 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001441 goto onError;
Fred Drakee4315f52000-05-09 19:53:39 +00001442 Py_DECREF(v);
1443 strncpy(unicode_default_encoding,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001444 encoding,
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02001445 sizeof(unicode_default_encoding) - 1);
Fred Drakee4315f52000-05-09 19:53:39 +00001446 return 0;
1447
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001448 onError:
Fred Drakee4315f52000-05-09 19:53:39 +00001449 return -1;
1450}
1451
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001452/* error handling callback helper:
1453 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001454 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001455 and adjust various state variables.
1456 return 0 on success, -1 on error
1457*/
1458
1459static
1460int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001461 const char *encoding, const char *reason,
1462 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1463 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1464 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001465{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001466 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001467
1468 PyObject *restuple = NULL;
1469 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001470 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1471 Py_ssize_t requiredsize;
1472 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001473 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001474 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001475 int res = -1;
1476
1477 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001478 *errorHandler = PyCodec_LookupError(errors);
1479 if (*errorHandler == NULL)
1480 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001481 }
1482
1483 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001484 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001485 encoding, input, insize, *startinpos, *endinpos, reason);
1486 if (*exceptionObject == NULL)
1487 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001488 }
1489 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001490 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1491 goto onError;
1492 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1493 goto onError;
1494 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1495 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001496 }
1497
1498 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1499 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001500 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001501 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00001502 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001503 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001504 }
1505 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001506 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001507 if (newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001508 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001509 if (newpos<0 || newpos>insize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001510 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1511 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001512 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001513
1514 /* need more space? (at least enough for what we
1515 have+the replacement+the rest of the string (starting
1516 at the new input position), so we won't have to check space
1517 when there are no errors in the rest of the string) */
1518 repptr = PyUnicode_AS_UNICODE(repunicode);
1519 repsize = PyUnicode_GET_SIZE(repunicode);
Benjamin Petersoned4c1302014-09-29 18:18:57 -04001520 requiredsize = *outpos;
1521 if (requiredsize > PY_SSIZE_T_MAX - repsize)
1522 goto overflow;
1523 requiredsize += repsize;
1524 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
1525 goto overflow;
1526 requiredsize += insize - newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001527 if (requiredsize > outsize) {
Benjamin Petersoned4c1302014-09-29 18:18:57 -04001528 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001529 requiredsize = 2*outsize;
1530 if (_PyUnicode_Resize(output, requiredsize) < 0)
1531 goto onError;
1532 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001533 }
1534 *endinpos = newpos;
1535 *inptr = input + newpos;
1536 Py_UNICODE_COPY(*outptr, repptr, repsize);
1537 *outptr += repsize;
1538 *outpos += repsize;
1539 /* we made it! */
1540 res = 0;
1541
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001542 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001543 Py_XDECREF(restuple);
1544 return res;
Benjamin Petersoned4c1302014-09-29 18:18:57 -04001545
1546 overflow:
1547 PyErr_SetString(PyExc_OverflowError,
1548 "decoded result is too long for a Python string");
1549 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001550}
1551
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001552/* --- UTF-7 Codec -------------------------------------------------------- */
1553
Antoine Pitrou653dece2009-05-04 18:32:32 +00001554/* See RFC2152 for details. We encode conservatively and decode liberally. */
1555
1556/* Three simple macros defining base-64. */
1557
/* Non-zero if c belongs to the base-64 alphabet (A-Z, a-z, 0-9, '+', '/').
   The argument is evaluated more than once. */

#define IS_BASE64(c)                    \
    (('A' <= (c) && (c) <= 'Z') ||      \
     ('a' <= (c) && (c) <= 'z') ||      \
     ('0' <= (c) && (c) <= '9') ||      \
     '+' == (c) || '/' == (c))
Antoine Pitrou653dece2009-05-04 18:32:32 +00001565
/* Value (0..63) of the base-64 character c.  The caller must already know
   that c is a base-64 character: anything that is not a letter, a digit or
   '+' is mapped to 63, the value of '/'. */

#define FROM_BASE64(c)                                      \
    (('A' <= (c) && (c) <= 'Z') ? (c) - 'A'      :          \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :          \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :          \
     ('+' == (c))               ? 62 : 63)
1573
/* Base-64 character that encodes the low 6 bits of n. */

#define TO_BASE64(n)                    \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"       \
     "abcdefghijklmnopqrstuvwxyz"       \
     "0123456789+/"[(n) & 0x3f])
1578
/* DECODE_DIRECT: non-zero if a byte found in a UTF-7 string outside of a
 * base-64 section decodes as itself.  Decoding is permissive: every ASCII
 * byte does, except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c) \
    ((c) < 0x80 && (c) != '+')
1586
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array, indexed by ASCII code (0-127),
 * identifies these different sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
1620
/* ENCODE_DIRECT: non-zero if character c should be encoded as itself.
 * Set D characters always are.  Set O characters are when directO is
 * true and whitespace characters are when directWS is true; RFC2152
 * makes it clear that these two choices vary between applications, so
 * they are left to the caller.  NUL and anything outside ASCII is never
 * encoded directly (and is never used to index utf7_category).
 *
 * directO and directWS are parenthesized in the expansion so that
 * arbitrary expressions can be passed without precedence surprises.
 * c is evaluated several times. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001632
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001633PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001634 Py_ssize_t size,
1635 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001636{
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001637 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1638}
1639
Antoine Pitrou653dece2009-05-04 18:32:32 +00001640/* The decoder. The only state we preserve is our read position,
1641 * i.e. how many characters we have consumed. So if we end in the
1642 * middle of a shift sequence we have to back off the read position
1643 * and the output to the beginning of the sequence, otherwise we lose
1644 * all the shift state (seen bits, number of bits seen, high
1645 * surrogate). */
1646
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001647PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001648 Py_ssize_t size,
1649 const char *errors,
1650 Py_ssize_t *consumed)
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001651{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001652 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001653 Py_ssize_t startinpos;
1654 Py_ssize_t endinpos;
1655 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001656 const char *e;
1657 PyUnicodeObject *unicode;
1658 Py_UNICODE *p;
1659 const char *errmsg = "";
1660 int inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001661 Py_UNICODE *shiftOutStart;
1662 unsigned int base64bits = 0;
1663 unsigned long base64buffer = 0;
1664 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001665 PyObject *errorHandler = NULL;
1666 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001667
1668 unicode = _PyUnicode_New(size);
1669 if (!unicode)
1670 return NULL;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001671 if (size == 0) {
1672 if (consumed)
1673 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001674 return (PyObject *)unicode;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001675 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001676
1677 p = unicode->str;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001678 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001679 e = s + size;
1680
1681 while (s < e) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001682 Py_UNICODE ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001683
Antoine Pitrou653dece2009-05-04 18:32:32 +00001684 if (inShift) { /* in a base-64 section */
1685 if (IS_BASE64(ch)) { /* consume a base-64 character */
1686 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1687 base64bits += 6;
1688 s++;
1689 if (base64bits >= 16) {
1690 /* we have enough bits for a UTF-16 value */
1691 Py_UNICODE outCh = (Py_UNICODE)
1692 (base64buffer >> (base64bits-16));
1693 base64bits -= 16;
1694 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchakaf1056722013-10-19 20:37:49 +03001695 assert(outCh <= 0xffff);
Antoine Pitrou653dece2009-05-04 18:32:32 +00001696 if (surrogate) {
1697 /* expecting a second surrogate */
1698 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1699#ifdef Py_UNICODE_WIDE
1700 *p++ = (((surrogate & 0x3FF)<<10)
1701 | (outCh & 0x3FF)) + 0x10000;
1702#else
1703 *p++ = surrogate;
1704 *p++ = outCh;
1705#endif
1706 surrogate = 0;
Antoine Pitrou30402542011-11-15 01:49:40 +01001707 continue;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001708 }
1709 else {
Antoine Pitrou30402542011-11-15 01:49:40 +01001710 *p++ = surrogate;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001711 surrogate = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001712 }
1713 }
Antoine Pitrou30402542011-11-15 01:49:40 +01001714 if (outCh >= 0xD800 && outCh <= 0xDBFF) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001715 /* first surrogate */
1716 surrogate = outCh;
1717 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001718 else {
1719 *p++ = outCh;
1720 }
1721 }
1722 }
1723 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001724 inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001725 if (base64bits > 0) { /* left-over bits */
1726 if (base64bits >= 6) {
1727 /* We've seen at least one base-64 character */
Serhiy Storchakae12f6322015-10-02 13:14:53 +03001728 s++;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001729 errmsg = "partial character in shift sequence";
1730 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001731 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001732 else {
1733 /* Some bits remain; they should be zero */
1734 if (base64buffer != 0) {
Serhiy Storchakae12f6322015-10-02 13:14:53 +03001735 s++;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001736 errmsg = "non-zero padding bits in shift sequence";
1737 goto utf7Error;
1738 }
1739 }
1740 }
Serhiy Storchakae12f6322015-10-02 13:14:53 +03001741 if (surrogate && DECODE_DIRECT(ch))
1742 *p++ = surrogate;
1743 surrogate = 0;
1744 if (ch == '-') {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001745 /* '-' is absorbed; other terminating
1746 characters are preserved */
Serhiy Storchakae12f6322015-10-02 13:14:53 +03001747 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001748 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001749 }
1750 }
1751 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001752 startinpos = s-starts;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001753 s++; /* consume '+' */
1754 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001755 s++;
1756 *p++ = '+';
Antoine Pitrou653dece2009-05-04 18:32:32 +00001757 }
1758 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001759 inShift = 1;
Serhiy Storchakae12f6322015-10-02 13:14:53 +03001760 surrogate = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001761 shiftOutStart = p;
1762 base64bits = 0;
Serhiy Storchakaf1056722013-10-19 20:37:49 +03001763 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001764 }
1765 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001766 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001767 *p++ = ch;
1768 s++;
1769 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001770 else {
1771 startinpos = s-starts;
1772 s++;
1773 errmsg = "unexpected special character";
1774 goto utf7Error;
1775 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001776 continue;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001777utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001778 outpos = p-PyUnicode_AS_UNICODE(unicode);
1779 endinpos = s-starts;
1780 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001781 errors, &errorHandler,
1782 "utf7", errmsg,
1783 starts, size, &startinpos, &endinpos, &exc, &s,
1784 &unicode, &outpos, &p))
1785 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001786 }
1787
Antoine Pitrou653dece2009-05-04 18:32:32 +00001788 /* end of string */
1789
1790 if (inShift && !consumed) { /* in shift sequence, no more to follow */
1791 /* if we're in an inconsistent state, that's an error */
Serhiy Storchakae12f6322015-10-02 13:14:53 +03001792 inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001793 if (surrogate ||
1794 (base64bits >= 6) ||
1795 (base64bits > 0 && base64buffer != 0)) {
1796 outpos = p-PyUnicode_AS_UNICODE(unicode);
1797 endinpos = size;
1798 if (unicode_decode_call_errorhandler(
1799 errors, &errorHandler,
1800 "utf7", "unterminated shift sequence",
1801 starts, size, &startinpos, &endinpos, &exc, &s,
1802 &unicode, &outpos, &p))
1803 goto onError;
1804 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001805 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001806
1807 /* return state */
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001808 if (consumed) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001809 if (inShift) {
1810 p = shiftOutStart; /* back off output */
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001811 *consumed = startinpos;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001812 }
1813 else {
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001814 *consumed = s-starts;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001815 }
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001816 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001817
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001818 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001819 goto onError;
1820
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001821 Py_XDECREF(errorHandler);
1822 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001823 return (PyObject *)unicode;
1824
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001825 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001826 Py_XDECREF(errorHandler);
1827 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001828 Py_DECREF(unicode);
1829 return NULL;
1830}
1831
1832
1833PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001834 Py_ssize_t size,
Antoine Pitrou653dece2009-05-04 18:32:32 +00001835 int base64SetO,
1836 int base64WhiteSpace,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001837 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001838{
1839 PyObject *v;
1840 /* It might be possible to tighten this worst case */
Alexandre Vassalottifd009162009-07-07 02:17:30 +00001841 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001842 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001843 Py_ssize_t i = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001844 unsigned int base64bits = 0;
1845 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001846 char * out;
1847 char * start;
1848
Alexandre Vassalottifd009162009-07-07 02:17:30 +00001849 if (allocated / 8 != size)
Neal Norwitze7d8be82008-07-31 17:17:14 +00001850 return PyErr_NoMemory();
1851
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001852 if (size == 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00001853 return PyString_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001854
Antoine Pitrou653dece2009-05-04 18:32:32 +00001855 v = PyString_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001856 if (v == NULL)
1857 return NULL;
1858
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001859 start = out = PyString_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001860 for (;i < size; ++i) {
1861 Py_UNICODE ch = s[i];
1862
Antoine Pitrou653dece2009-05-04 18:32:32 +00001863 if (inShift) {
1864 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1865 /* shifting out */
1866 if (base64bits) { /* output remaining bits */
1867 *out++ = TO_BASE64(base64buffer << (6-base64bits));
1868 base64buffer = 0;
1869 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001870 }
1871 inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001872 /* Characters not in the BASE64 set implicitly unshift the sequence
1873 so no '-' is required, except if the character is itself a '-' */
1874 if (IS_BASE64(ch) || ch == '-') {
1875 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001876 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001877 *out++ = (char) ch;
1878 }
1879 else {
1880 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00001881 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001882 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001883 else { /* not in a shift sequence */
1884 if (ch == '+') {
1885 *out++ = '+';
1886 *out++ = '-';
1887 }
1888 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1889 *out++ = (char) ch;
1890 }
1891 else {
1892 *out++ = '+';
1893 inShift = 1;
1894 goto encode_char;
1895 }
1896 }
1897 continue;
1898encode_char:
1899#ifdef Py_UNICODE_WIDE
1900 if (ch >= 0x10000) {
1901 /* code first surrogate */
1902 base64bits += 16;
1903 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
1904 while (base64bits >= 6) {
1905 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1906 base64bits -= 6;
1907 }
1908 /* prepare second surrogate */
1909 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
1910 }
1911#endif
1912 base64bits += 16;
1913 base64buffer = (base64buffer << 16) | ch;
1914 while (base64bits >= 6) {
1915 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1916 base64bits -= 6;
1917 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001918 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001919 if (base64bits)
1920 *out++= TO_BASE64(base64buffer << (6-base64bits) );
1921 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001922 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001923
Benjamin Petersonbea424a2010-04-03 00:57:33 +00001924 if (_PyString_Resize(&v, out - start))
1925 return NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001926 return v;
1927}
1928
/* The UTF-7 helper macros are not needed beyond this point. */
#undef ENCODE_DIRECT
#undef DECODE_DIRECT
#undef TO_BASE64
#undef FROM_BASE64
#undef IS_BASE64
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001934
Guido van Rossumd57fd912000-03-10 22:53:23 +00001935/* --- UTF-8 Codec -------------------------------------------------------- */
1936
/* Map a UTF-8 encoded prefix (first) byte to the length of the sequence
   it starts.  Zero means illegal prefix: continuation bytes (80-BF), the
   overlong prefixes C0-C1, and F5-FF.  See RFC 3629 for details.

   The table is only ever read, so it is declared const. */
static
const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
1958
Guido van Rossumd57fd912000-03-10 22:53:23 +00001959PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001960 Py_ssize_t size,
1961 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001962{
Walter Dörwald69652032004-09-07 20:24:22 +00001963 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1964}
1965
1966PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001967 Py_ssize_t size,
1968 const char *errors,
1969 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001970{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001971 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001972 int n;
Ezio Melottie57e50c2010-06-05 17:51:07 +00001973 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001974 Py_ssize_t startinpos;
1975 Py_ssize_t endinpos;
1976 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001977 const char *e;
1978 PyUnicodeObject *unicode;
1979 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001980 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001981 PyObject *errorHandler = NULL;
1982 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001983
1984 /* Note: size will always be longer than the resulting Unicode
1985 character count */
1986 unicode = _PyUnicode_New(size);
1987 if (!unicode)
1988 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001989 if (size == 0) {
1990 if (consumed)
1991 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001992 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001993 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001994
1995 /* Unpack UTF-8 encoded data */
1996 p = unicode->str;
1997 e = s + size;
1998
1999 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002000 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002001
2002 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002003 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002004 s++;
2005 continue;
2006 }
2007
2008 n = utf8_code_length[ch];
2009
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002010 if (s + n > e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002011 if (consumed)
2012 break;
2013 else {
2014 errmsg = "unexpected end of data";
2015 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00002016 endinpos = startinpos+1;
2017 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2018 endinpos++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002019 goto utf8Error;
2020 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00002021 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002022
2023 switch (n) {
2024
2025 case 0:
Ezio Melottie57e50c2010-06-05 17:51:07 +00002026 errmsg = "invalid start byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002027 startinpos = s-starts;
2028 endinpos = startinpos+1;
2029 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002030
2031 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002032 errmsg = "internal error";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002033 startinpos = s-starts;
2034 endinpos = startinpos+1;
2035 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002036
2037 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002038 if ((s[1] & 0xc0) != 0x80) {
Ezio Melottie57e50c2010-06-05 17:51:07 +00002039 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002040 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00002041 endinpos = startinpos + 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002042 goto utf8Error;
2043 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002044 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melottie57e50c2010-06-05 17:51:07 +00002045 assert ((ch > 0x007F) && (ch <= 0x07FF));
2046 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002047 break;
2048
2049 case 3:
Ezio Melottie57e50c2010-06-05 17:51:07 +00002050 /* XXX: surrogates shouldn't be valid UTF-8!
2051 see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2052 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
2053 Uncomment the 2 lines below to make them invalid,
Serhiy Storchakae8c9e142015-01-18 11:42:50 +02002054 code points: d800-dfff; UTF-8: \xed\xa0\x80-\xed\xbf\xbf. */
Tim Petersced69f82003-09-16 20:30:58 +00002055 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melottie57e50c2010-06-05 17:51:07 +00002056 (s[2] & 0xc0) != 0x80 ||
2057 ((unsigned char)s[0] == 0xE0 &&
2058 (unsigned char)s[1] < 0xA0)/* ||
2059 ((unsigned char)s[0] == 0xED &&
2060 (unsigned char)s[1] > 0x9F)*/) {
2061 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002062 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00002063 endinpos = startinpos + 1;
2064
2065 /* if s[1] first two bits are 1 and 0, then the invalid
2066 continuation byte is s[2], so increment endinpos by 1,
2067 if not, s[1] is invalid and endinpos doesn't need to
2068 be incremented. */
2069 if ((s[1] & 0xC0) == 0x80)
2070 endinpos++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002071 goto utf8Error;
2072 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002073 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melottie57e50c2010-06-05 17:51:07 +00002074 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2075 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002076 break;
2077
2078 case 4:
2079 if ((s[1] & 0xc0) != 0x80 ||
2080 (s[2] & 0xc0) != 0x80 ||
Ezio Melottie57e50c2010-06-05 17:51:07 +00002081 (s[3] & 0xc0) != 0x80 ||
2082 ((unsigned char)s[0] == 0xF0 &&
2083 (unsigned char)s[1] < 0x90) ||
2084 ((unsigned char)s[0] == 0xF4 &&
2085 (unsigned char)s[1] > 0x8F)) {
2086 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002087 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00002088 endinpos = startinpos + 1;
2089 if ((s[1] & 0xC0) == 0x80) {
2090 endinpos++;
2091 if ((s[2] & 0xC0) == 0x80)
2092 endinpos++;
2093 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002094 goto utf8Error;
2095 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002096 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melottie57e50c2010-06-05 17:51:07 +00002097 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2098 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2099
Fredrik Lundh8f455852001-06-27 18:59:43 +00002100#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002101 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002102#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002103 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002104
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002105 /* translate from 10000..10FFFF to 0..FFFF */
2106 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002107
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002108 /* high surrogate = top 10 bits added to D800 */
2109 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002110
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002111 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002112 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002113#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002114 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002115 }
2116 s += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002117 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002118
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002119 utf8Error:
2120 outpos = p-PyUnicode_AS_UNICODE(unicode);
2121 if (unicode_decode_call_errorhandler(
2122 errors, &errorHandler,
2123 "utf8", errmsg,
2124 starts, size, &startinpos, &endinpos, &exc, &s,
2125 &unicode, &outpos, &p))
2126 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002127 }
Walter Dörwald69652032004-09-07 20:24:22 +00002128 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002129 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002130
2131 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002132 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002133 goto onError;
2134
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002135 Py_XDECREF(errorHandler);
2136 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002137 return (PyObject *)unicode;
2138
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002139 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002140 Py_XDECREF(errorHandler);
2141 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002142 Py_DECREF(unicode);
2143 return NULL;
2144}
2145
Tim Peters602f7402002-04-27 18:03:26 +00002146/* Allocation strategy: if the string is short, convert into a stack buffer
2147 and allocate exactly as much space needed at the end. Else allocate the
2148 maximum possible needed (4 result bytes per Unicode character), and return
2149 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002150*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002151PyObject *
2152PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002153 Py_ssize_t size,
2154 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002155{
Tim Peters602f7402002-04-27 18:03:26 +00002156#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002157
Martin v. Löwis18e16552006-02-15 17:27:45 +00002158 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00002159 PyObject *v; /* result string object */
2160 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002161 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002162 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002163 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002164
Tim Peters602f7402002-04-27 18:03:26 +00002165 assert(s != NULL);
2166 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002167
Tim Peters602f7402002-04-27 18:03:26 +00002168 if (size <= MAX_SHORT_UNICHARS) {
2169 /* Write into the stack buffer; nallocated can't overflow.
2170 * At the end, we'll allocate exactly as much heap space as it
2171 * turns out we need.
2172 */
2173 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2174 v = NULL; /* will allocate after we're done */
2175 p = stackbuf;
2176 }
2177 else {
2178 /* Overallocate on the heap, and give the excess back at the end. */
2179 nallocated = size * 4;
2180 if (nallocated / 4 != size) /* overflow! */
2181 return PyErr_NoMemory();
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002182 v = PyString_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00002183 if (v == NULL)
2184 return NULL;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002185 p = PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002186 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002187
Tim Peters602f7402002-04-27 18:03:26 +00002188 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002189 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002190
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002191 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002192 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002193 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002194
Guido van Rossumd57fd912000-03-10 22:53:23 +00002195 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002196 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002197 *p++ = (char)(0xc0 | (ch >> 6));
2198 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002199 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002200 else {
Tim Peters602f7402002-04-27 18:03:26 +00002201 /* Encode UCS2 Unicode ordinals */
2202 if (ch < 0x10000) {
2203 /* Special case: check for high surrogate */
2204 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2205 Py_UCS4 ch2 = s[i];
2206 /* Check for low surrogate and combine the two to
2207 form a UCS4 value */
2208 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002209 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002210 i++;
2211 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002212 }
Tim Peters602f7402002-04-27 18:03:26 +00002213 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002214 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002215 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002216 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2217 *p++ = (char)(0x80 | (ch & 0x3f));
2218 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00002219 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002220 encodeUCS4:
Tim Peters602f7402002-04-27 18:03:26 +00002221 /* Encode UCS4 Unicode ordinals */
2222 *p++ = (char)(0xf0 | (ch >> 18));
2223 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2224 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2225 *p++ = (char)(0x80 | (ch & 0x3f));
2226 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002227 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002228
Tim Peters602f7402002-04-27 18:03:26 +00002229 if (v == NULL) {
2230 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002231 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002232 assert(nneeded <= nallocated);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002233 v = PyString_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002234 }
2235 else {
Benjamin Peterson857ce152009-01-31 16:29:18 +00002236 /* Cut back to size actually needed. */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002237 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002238 assert(nneeded <= nallocated);
Benjamin Petersonbea424a2010-04-03 00:57:33 +00002239 if (_PyString_Resize(&v, nneeded))
2240 return NULL;
Tim Peters602f7402002-04-27 18:03:26 +00002241 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002242 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002243
Tim Peters602f7402002-04-27 18:03:26 +00002244#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002245}
2246
Guido van Rossumd57fd912000-03-10 22:53:23 +00002247PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2248{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002249 if (!PyUnicode_Check(unicode)) {
2250 PyErr_BadArgument();
2251 return NULL;
2252 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002253 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002254 PyUnicode_GET_SIZE(unicode),
2255 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002256}
2257
Walter Dörwald6e390802007-08-17 16:41:28 +00002258/* --- UTF-32 Codec ------------------------------------------------------- */
2259
2260PyObject *
2261PyUnicode_DecodeUTF32(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002262 Py_ssize_t size,
2263 const char *errors,
2264 int *byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002265{
2266 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2267}
2268
2269PyObject *
2270PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002271 Py_ssize_t size,
2272 const char *errors,
2273 int *byteorder,
2274 Py_ssize_t *consumed)
Walter Dörwald6e390802007-08-17 16:41:28 +00002275{
2276 const char *starts = s;
2277 Py_ssize_t startinpos;
2278 Py_ssize_t endinpos;
2279 Py_ssize_t outpos;
2280 PyUnicodeObject *unicode;
2281 Py_UNICODE *p;
2282#ifndef Py_UNICODE_WIDE
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002283 int pairs = 0;
Benjamin Peterson8e5effa2010-06-12 17:47:06 +00002284 const unsigned char *qq;
Walter Dörwald6e390802007-08-17 16:41:28 +00002285#else
2286 const int pairs = 0;
2287#endif
Benjamin Peterson8e5effa2010-06-12 17:47:06 +00002288 const unsigned char *q, *e;
Walter Dörwald6e390802007-08-17 16:41:28 +00002289 int bo = 0; /* assume native ordering by default */
2290 const char *errmsg = "";
Walter Dörwald20b40d32007-08-17 16:52:50 +00002291 /* Offsets from q for retrieving bytes in the right order. */
2292#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2293 int iorder[] = {0, 1, 2, 3};
2294#else
2295 int iorder[] = {3, 2, 1, 0};
2296#endif
Walter Dörwald9ab80a92007-08-17 16:58:43 +00002297 PyObject *errorHandler = NULL;
2298 PyObject *exc = NULL;
Chris Jerdonekad4b0002012-10-07 20:37:54 -07002299
Walter Dörwald6e390802007-08-17 16:41:28 +00002300 q = (unsigned char *)s;
2301 e = q + size;
2302
2303 if (byteorder)
2304 bo = *byteorder;
2305
2306 /* Check for BOM marks (U+FEFF) in the input and adjust current
2307 byte order setting accordingly. In native mode, the leading BOM
2308 mark is skipped, in all other modes, it is copied to the output
2309 stream as-is (giving a ZWNBSP character). */
2310 if (bo == 0) {
2311 if (size >= 4) {
2312 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002313 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002314#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002315 if (bom == 0x0000FEFF) {
2316 q += 4;
2317 bo = -1;
2318 }
2319 else if (bom == 0xFFFE0000) {
2320 q += 4;
2321 bo = 1;
2322 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002323#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002324 if (bom == 0x0000FEFF) {
2325 q += 4;
2326 bo = 1;
2327 }
2328 else if (bom == 0xFFFE0000) {
2329 q += 4;
2330 bo = -1;
2331 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002332#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002333 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002334 }
2335
2336 if (bo == -1) {
2337 /* force LE */
2338 iorder[0] = 0;
2339 iorder[1] = 1;
2340 iorder[2] = 2;
2341 iorder[3] = 3;
2342 }
2343 else if (bo == 1) {
2344 /* force BE */
2345 iorder[0] = 3;
2346 iorder[1] = 2;
2347 iorder[2] = 1;
2348 iorder[3] = 0;
2349 }
2350
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002351 /* On narrow builds we split characters outside the BMP into two
Serhiy Storchakae8c9e142015-01-18 11:42:50 +02002352 code points => count how much extra space we need. */
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002353#ifndef Py_UNICODE_WIDE
Serhiy Storchakac9631a12013-01-08 22:43:18 +02002354 for (qq = q; e - qq >= 4; qq += 4)
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002355 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2356 pairs++;
2357#endif
2358
2359 /* This might be one to much, because of a BOM */
2360 unicode = _PyUnicode_New((size+3)/4+pairs);
2361 if (!unicode)
2362 return NULL;
2363 if (size == 0)
2364 return (PyObject *)unicode;
2365
2366 /* Unpack UTF-32 encoded data */
2367 p = unicode->str;
2368
Walter Dörwald6e390802007-08-17 16:41:28 +00002369 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002370 Py_UCS4 ch;
2371 /* remaining bytes at the end? (size should be divisible by 4) */
2372 if (e-q<4) {
2373 if (consumed)
2374 break;
2375 errmsg = "truncated data";
2376 startinpos = ((const char *)q)-starts;
2377 endinpos = ((const char *)e)-starts;
2378 goto utf32Error;
2379 /* The remaining input chars are ignored if the callback
2380 chooses to skip the input */
2381 }
2382 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2383 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002384
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002385 if (ch >= 0x110000)
2386 {
Serhiy Storchakae8c9e142015-01-18 11:42:50 +02002387 errmsg = "code point not in range(0x110000)";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002388 startinpos = ((const char *)q)-starts;
2389 endinpos = startinpos+4;
2390 goto utf32Error;
2391 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002392#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002393 if (ch >= 0x10000)
2394 {
2395 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2396 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2397 }
2398 else
Walter Dörwald6e390802007-08-17 16:41:28 +00002399#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002400 *p++ = ch;
2401 q += 4;
2402 continue;
2403 utf32Error:
2404 outpos = p-PyUnicode_AS_UNICODE(unicode);
2405 if (unicode_decode_call_errorhandler(
2406 errors, &errorHandler,
2407 "utf32", errmsg,
Georg Brandle9741f32009-09-17 11:28:09 +00002408 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002409 &unicode, &outpos, &p))
2410 goto onError;
Walter Dörwald6e390802007-08-17 16:41:28 +00002411 }
2412
2413 if (byteorder)
2414 *byteorder = bo;
2415
2416 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002417 *consumed = (const char *)q-starts;
Walter Dörwald6e390802007-08-17 16:41:28 +00002418
2419 /* Adjust length */
2420 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2421 goto onError;
2422
2423 Py_XDECREF(errorHandler);
2424 Py_XDECREF(exc);
2425 return (PyObject *)unicode;
2426
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002427 onError:
Walter Dörwald6e390802007-08-17 16:41:28 +00002428 Py_DECREF(unicode);
2429 Py_XDECREF(errorHandler);
2430 Py_XDECREF(exc);
2431 return NULL;
2432}
2433
2434PyObject *
2435PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002436 Py_ssize_t size,
2437 const char *errors,
2438 int byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002439{
2440 PyObject *v;
2441 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002442 Py_ssize_t nsize, bytesize;
Walter Dörwald6e390802007-08-17 16:41:28 +00002443#ifndef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002444 Py_ssize_t i, pairs;
Walter Dörwald6e390802007-08-17 16:41:28 +00002445#else
2446 const int pairs = 0;
2447#endif
2448 /* Offsets from p for storing byte pairs in the right order. */
2449#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2450 int iorder[] = {0, 1, 2, 3};
2451#else
2452 int iorder[] = {3, 2, 1, 0};
2453#endif
2454
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002455#define STORECHAR(CH) \
2456 do { \
2457 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2458 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2459 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2460 p[iorder[0]] = (CH) & 0xff; \
2461 p += 4; \
Walter Dörwald6e390802007-08-17 16:41:28 +00002462 } while(0)
2463
Serhiy Storchakae8c9e142015-01-18 11:42:50 +02002464 /* In narrow builds we can output surrogate pairs as one code point,
Walter Dörwald6e390802007-08-17 16:41:28 +00002465 so we need less space. */
2466#ifndef Py_UNICODE_WIDE
2467 for (i = pairs = 0; i < size-1; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002468 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2469 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2470 pairs++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002471#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002472 nsize = (size - pairs + (byteorder == 0));
2473 bytesize = nsize * 4;
2474 if (bytesize / 4 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002475 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002476 v = PyString_FromStringAndSize(NULL, bytesize);
Walter Dörwald6e390802007-08-17 16:41:28 +00002477 if (v == NULL)
2478 return NULL;
2479
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002480 p = (unsigned char *)PyString_AS_STRING(v);
Walter Dörwald6e390802007-08-17 16:41:28 +00002481 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002482 STORECHAR(0xFEFF);
Walter Dörwald6e390802007-08-17 16:41:28 +00002483 if (size == 0)
2484 return v;
2485
2486 if (byteorder == -1) {
2487 /* force LE */
2488 iorder[0] = 0;
2489 iorder[1] = 1;
2490 iorder[2] = 2;
2491 iorder[3] = 3;
2492 }
2493 else if (byteorder == 1) {
2494 /* force BE */
2495 iorder[0] = 3;
2496 iorder[1] = 2;
2497 iorder[2] = 1;
2498 iorder[3] = 0;
2499 }
2500
2501 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002502 Py_UCS4 ch = *s++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002503#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002504 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2505 Py_UCS4 ch2 = *s;
2506 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2507 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2508 s++;
2509 size--;
2510 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00002511 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002512#endif
2513 STORECHAR(ch);
2514 }
2515 return v;
2516#undef STORECHAR
2517}
2518
2519PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2520{
2521 if (!PyUnicode_Check(unicode)) {
2522 PyErr_BadArgument();
2523 return NULL;
2524 }
2525 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002526 PyUnicode_GET_SIZE(unicode),
2527 NULL,
2528 0);
Walter Dörwald6e390802007-08-17 16:41:28 +00002529}
2530
Guido van Rossumd57fd912000-03-10 22:53:23 +00002531/* --- UTF-16 Codec ------------------------------------------------------- */
2532
Tim Peters772747b2001-08-09 22:21:55 +00002533PyObject *
2534PyUnicode_DecodeUTF16(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002535 Py_ssize_t size,
2536 const char *errors,
2537 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002538{
Walter Dörwald69652032004-09-07 20:24:22 +00002539 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2540}
2541
2542PyObject *
2543PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002544 Py_ssize_t size,
2545 const char *errors,
2546 int *byteorder,
2547 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002548{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002549 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002550 Py_ssize_t startinpos;
2551 Py_ssize_t endinpos;
2552 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002553 PyUnicodeObject *unicode;
2554 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002555 const unsigned char *q, *e;
2556 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002557 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002558 /* Offsets from q for retrieving byte pairs in the right order. */
2559#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2560 int ihi = 1, ilo = 0;
2561#else
2562 int ihi = 0, ilo = 1;
2563#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002564 PyObject *errorHandler = NULL;
2565 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002566
2567 /* Note: size will always be longer than the resulting Unicode
2568 character count */
2569 unicode = _PyUnicode_New(size);
2570 if (!unicode)
2571 return NULL;
2572 if (size == 0)
2573 return (PyObject *)unicode;
2574
2575 /* Unpack UTF-16 encoded data */
2576 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002577 q = (unsigned char *)s;
2578 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002579
2580 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002581 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002582
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002583 /* Check for BOM marks (U+FEFF) in the input and adjust current
2584 byte order setting accordingly. In native mode, the leading BOM
2585 mark is skipped, in all other modes, it is copied to the output
2586 stream as-is (giving a ZWNBSP character). */
2587 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002588 if (size >= 2) {
2589 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002590#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002591 if (bom == 0xFEFF) {
2592 q += 2;
2593 bo = -1;
2594 }
2595 else if (bom == 0xFFFE) {
2596 q += 2;
2597 bo = 1;
2598 }
Tim Petersced69f82003-09-16 20:30:58 +00002599#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002600 if (bom == 0xFEFF) {
2601 q += 2;
2602 bo = 1;
2603 }
2604 else if (bom == 0xFFFE) {
2605 q += 2;
2606 bo = -1;
2607 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002608#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002609 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002610 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002611
Tim Peters772747b2001-08-09 22:21:55 +00002612 if (bo == -1) {
2613 /* force LE */
2614 ihi = 1;
2615 ilo = 0;
2616 }
2617 else if (bo == 1) {
2618 /* force BE */
2619 ihi = 0;
2620 ilo = 1;
2621 }
2622
2623 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002624 Py_UNICODE ch;
2625 /* remaining bytes at the end? (size should be even) */
2626 if (e-q<2) {
2627 if (consumed)
2628 break;
2629 errmsg = "truncated data";
2630 startinpos = ((const char *)q)-starts;
2631 endinpos = ((const char *)e)-starts;
2632 goto utf16Error;
2633 /* The remaining input chars are ignored if the callback
2634 chooses to skip the input */
2635 }
2636 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002637
Benjamin Peterson857ce152009-01-31 16:29:18 +00002638 q += 2;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002639
2640 if (ch < 0xD800 || ch > 0xDFFF) {
2641 *p++ = ch;
2642 continue;
2643 }
2644
2645 /* UTF-16 code pair: */
Antoine Pitrou715a63b2012-07-21 00:52:06 +02002646 if (e - q < 2) {
Serhiy Storchakac4b82c02013-01-08 23:12:00 +02002647 q -= 2;
2648 if (consumed)
2649 break;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002650 errmsg = "unexpected end of data";
Serhiy Storchakac4b82c02013-01-08 23:12:00 +02002651 startinpos = ((const char *)q)-starts;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002652 endinpos = ((const char *)e)-starts;
2653 goto utf16Error;
2654 }
2655 if (0xD800 <= ch && ch <= 0xDBFF) {
2656 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2657 q += 2;
2658 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002659#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002660 *p++ = ch;
2661 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002662#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002663 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002664#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002665 continue;
2666 }
2667 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002668 errmsg = "illegal UTF-16 surrogate";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002669 startinpos = (((const char *)q)-4)-starts;
2670 endinpos = startinpos+2;
2671 goto utf16Error;
2672 }
2673
Benjamin Peterson857ce152009-01-31 16:29:18 +00002674 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002675 errmsg = "illegal encoding";
2676 startinpos = (((const char *)q)-2)-starts;
2677 endinpos = startinpos+2;
2678 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002679
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002680 utf16Error:
2681 outpos = p-PyUnicode_AS_UNICODE(unicode);
2682 if (unicode_decode_call_errorhandler(
2683 errors, &errorHandler,
2684 "utf16", errmsg,
2685 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2686 &unicode, &outpos, &p))
2687 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002688 }
2689
2690 if (byteorder)
2691 *byteorder = bo;
2692
Walter Dörwald69652032004-09-07 20:24:22 +00002693 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002694 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00002695
Guido van Rossumd57fd912000-03-10 22:53:23 +00002696 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002697 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002698 goto onError;
2699
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002700 Py_XDECREF(errorHandler);
2701 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002702 return (PyObject *)unicode;
2703
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002704 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002705 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002706 Py_XDECREF(errorHandler);
2707 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002708 return NULL;
2709}
2710
Tim Peters772747b2001-08-09 22:21:55 +00002711PyObject *
2712PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002713 Py_ssize_t size,
2714 const char *errors,
2715 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002716{
2717 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002718 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002719 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002720#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002721 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002722#else
2723 const int pairs = 0;
2724#endif
Tim Peters772747b2001-08-09 22:21:55 +00002725 /* Offsets from p for storing byte pairs in the right order. */
2726#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2727 int ihi = 1, ilo = 0;
2728#else
2729 int ihi = 0, ilo = 1;
2730#endif
2731
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002732#define STORECHAR(CH) \
2733 do { \
2734 p[ihi] = ((CH) >> 8) & 0xff; \
2735 p[ilo] = (CH) & 0xff; \
2736 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00002737 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002738
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002739#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002740 for (i = pairs = 0; i < size; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002741 if (s[i] >= 0x10000)
2742 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002743#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002744 /* 2 * (size + pairs + (byteorder == 0)) */
2745 if (size > PY_SSIZE_T_MAX ||
2746 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002747 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002748 nsize = size + pairs + (byteorder == 0);
2749 bytesize = nsize * 2;
2750 if (bytesize / 2 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002751 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002752 v = PyString_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002753 if (v == NULL)
2754 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002755
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002756 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002757 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002758 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002759 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002760 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002761
2762 if (byteorder == -1) {
2763 /* force LE */
2764 ihi = 1;
2765 ilo = 0;
2766 }
2767 else if (byteorder == 1) {
2768 /* force BE */
2769 ihi = 0;
2770 ilo = 1;
2771 }
2772
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002773 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002774 Py_UNICODE ch = *s++;
2775 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002776#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002777 if (ch >= 0x10000) {
2778 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2779 ch = 0xD800 | ((ch-0x10000) >> 10);
2780 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002781#endif
Tim Peters772747b2001-08-09 22:21:55 +00002782 STORECHAR(ch);
2783 if (ch2)
2784 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002785 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002786 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002787#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002788}
2789
2790PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2791{
2792 if (!PyUnicode_Check(unicode)) {
2793 PyErr_BadArgument();
2794 return NULL;
2795 }
2796 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002797 PyUnicode_GET_SIZE(unicode),
2798 NULL,
2799 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002800}
2801
2802/* --- Unicode Escape Codec ----------------------------------------------- */
2803
Fredrik Lundh06d12682001-01-24 07:59:11 +00002804static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002805
Guido van Rossumd57fd912000-03-10 22:53:23 +00002806PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002807 Py_ssize_t size,
2808 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002809{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002810 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002811 Py_ssize_t startinpos;
2812 Py_ssize_t endinpos;
2813 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002814 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002815 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002816 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002817 char* message;
2818 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002819 PyObject *errorHandler = NULL;
2820 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002821
Guido van Rossumd57fd912000-03-10 22:53:23 +00002822 /* Escaped strings will always be longer than the resulting
2823 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002824 length after conversion to the true value.
2825 (but if the error callback returns a long replacement string
2826 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002827 v = _PyUnicode_New(size);
2828 if (v == NULL)
2829 goto onError;
2830 if (size == 0)
2831 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002832
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002833 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002834 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002835
Guido van Rossumd57fd912000-03-10 22:53:23 +00002836 while (s < end) {
2837 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002838 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002839 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002840
2841 /* Non-escape characters are interpreted as Unicode ordinals */
2842 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002843 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002844 continue;
2845 }
2846
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002847 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002848 /* \ - Escapes */
2849 s++;
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002850 c = *s++;
2851 if (s > end)
2852 c = '\0'; /* Invalid after \ */
2853 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002854
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002855 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002856 case '\n': break;
2857 case '\\': *p++ = '\\'; break;
2858 case '\'': *p++ = '\''; break;
2859 case '\"': *p++ = '\"'; break;
2860 case 'b': *p++ = '\b'; break;
2861 case 'f': *p++ = '\014'; break; /* FF */
2862 case 't': *p++ = '\t'; break;
2863 case 'n': *p++ = '\n'; break;
2864 case 'r': *p++ = '\r'; break;
2865 case 'v': *p++ = '\013'; break; /* VT */
2866 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2867
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002868 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002869 case '0': case '1': case '2': case '3':
2870 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002871 x = s[-1] - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002872 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002873 x = (x<<3) + *s++ - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002874 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002875 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002876 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002877 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002878 break;
2879
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002880 /* hex escapes */
2881 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002882 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002883 digits = 2;
2884 message = "truncated \\xXX escape";
2885 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002886
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002887 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002888 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002889 digits = 4;
2890 message = "truncated \\uXXXX escape";
2891 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002892
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002893 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002894 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002895 digits = 8;
2896 message = "truncated \\UXXXXXXXX escape";
2897 hexescape:
2898 chr = 0;
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002899 if (end - s < digits) {
2900 /* count only hex digits */
2901 for (; s < end; ++s) {
2902 c = (unsigned char)*s;
2903 if (!Py_ISXDIGIT(c))
2904 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002905 }
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002906 goto error;
2907 }
2908 for (; digits--; ++s) {
2909 c = (unsigned char)*s;
2910 if (!Py_ISXDIGIT(c))
2911 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002912 chr = (chr<<4) & ~0xF;
2913 if (c >= '0' && c <= '9')
2914 chr += c - '0';
2915 else if (c >= 'a' && c <= 'f')
2916 chr += 10 + c - 'a';
2917 else
2918 chr += 10 + c - 'A';
2919 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002920 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002921 /* _decoding_error will have already written into the
2922 target buffer. */
2923 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002924 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002925 /* when we get here, chr is a 32-bit unicode character */
2926 if (chr <= 0xffff)
2927 /* UCS-2 character */
2928 *p++ = (Py_UNICODE) chr;
2929 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002930 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002931 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002932#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002933 *p++ = chr;
2934#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002935 chr -= 0x10000L;
2936 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002937 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002938#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002939 } else {
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002940 message = "illegal Unicode character";
2941 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002942 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002943 break;
2944
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002945 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00002946 case 'N':
2947 message = "malformed \\N character escape";
2948 if (ucnhash_CAPI == NULL) {
2949 /* load the unicode data module */
Larry Hastings402b73f2010-03-25 00:54:54 +00002950 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002951 if (ucnhash_CAPI == NULL)
2952 goto ucnhashError;
2953 }
2954 if (*s == '{') {
2955 const char *start = s+1;
2956 /* look for the closing brace */
2957 while (*s != '}' && s < end)
2958 s++;
2959 if (s > start && s < end && *s == '}') {
2960 /* found a name. look it up in the unicode database */
2961 message = "unknown Unicode character name";
2962 s++;
Serhiy Storchaka1d3acd42013-01-21 11:48:24 +02002963 if (s - start - 1 <= INT_MAX &&
2964 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002965 goto store;
2966 }
2967 }
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002968 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002969
2970 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002971 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002972 message = "\\ at end of string";
2973 s--;
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002974 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00002975 }
2976 else {
2977 *p++ = '\\';
2978 *p++ = (unsigned char)s[-1];
2979 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002980 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002981 }
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002982 continue;
2983
2984 error:
2985 endinpos = s-starts;
2986 outpos = p-PyUnicode_AS_UNICODE(v);
2987 if (unicode_decode_call_errorhandler(
2988 errors, &errorHandler,
2989 "unicodeescape", message,
2990 starts, size, &startinpos, &endinpos, &exc, &s,
2991 &v, &outpos, &p))
2992 goto onError;
2993 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002994 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002995 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002996 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002997 Py_XDECREF(errorHandler);
2998 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002999 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003000
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003001 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003002 PyErr_SetString(
3003 PyExc_UnicodeError,
3004 "\\N escapes not supported (can't load unicodedata module)"
3005 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003006 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003007 Py_XDECREF(errorHandler);
3008 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003009 return NULL;
3010
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003011 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003012 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003013 Py_XDECREF(errorHandler);
3014 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003015 return NULL;
3016}
3017
/* unicodeescape_string() (defined below, after the findchar() helper)
   returns a Unicode-Escape string version of the Unicode object.

   If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate.

*/
3024
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00003025Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003026 Py_ssize_t size,
3027 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00003028{
3029 /* like wcschr, but doesn't stop at NULL characters */
3030
3031 while (size-- > 0) {
3032 if (*s == ch)
3033 return s;
3034 s++;
3035 }
3036
3037 return NULL;
3038}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003039
Guido van Rossumd57fd912000-03-10 22:53:23 +00003040static
3041PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003042 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003043 int quotes)
3044{
3045 PyObject *repr;
3046 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003047
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003048 static const char *hexdigit = "0123456789abcdef";
Neal Norwitze7d8be82008-07-31 17:17:14 +00003049#ifdef Py_UNICODE_WIDE
3050 const Py_ssize_t expandsize = 10;
3051#else
3052 const Py_ssize_t expandsize = 6;
3053#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003054
Neal Norwitz17753ec2006-08-21 22:21:19 +00003055 /* XXX(nnorwitz): rather than over-allocating, it would be
3056 better to choose a different scheme. Perhaps scan the
3057 first N-chars of the string and allocate based on that size.
3058 */
3059 /* Initial allocation is based on the longest-possible unichr
3060 escape.
3061
3062 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3063 unichr, so in this case it's the longest unichr escape. In
3064 narrow (UTF-16) builds this is five chars per source unichr
3065 since there are two unichrs in the surrogate pair, so in narrow
3066 (UTF-16) builds it's not the longest unichr escape.
3067
3068 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3069 so in the narrow (UTF-16) build case it's the longest unichr
3070 escape.
3071 */
3072
Neal Norwitze7d8be82008-07-31 17:17:14 +00003073 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003074 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00003075
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003076 repr = PyString_FromStringAndSize(NULL,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003077 2
3078 + expandsize*size
3079 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003080 if (repr == NULL)
3081 return NULL;
3082
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003083 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003084
3085 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003086 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00003087 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00003088 !findchar(s, size, '"')) ? '"' : '\'';
3089 }
3090 while (size-- > 0) {
3091 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003092
Hye-Shik Chang835b2432005-12-17 04:38:31 +00003093 /* Escape quotes and backslashes */
3094 if ((quotes &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003095 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003096 *p++ = '\\';
3097 *p++ = (char) ch;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003098 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003099 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003100
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003101#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003102 /* Map 21-bit characters to '\U00xxxxxx' */
3103 else if (ch >= 0x10000) {
3104 *p++ = '\\';
3105 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003106 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
3107 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
3108 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
3109 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
3110 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
3111 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
3112 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003113 *p++ = hexdigit[ch & 0x0000000F];
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003114 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003115 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003116#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003117 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3118 else if (ch >= 0xD800 && ch < 0xDC00) {
3119 Py_UNICODE ch2;
3120 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003121
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003122 ch2 = *s++;
3123 size--;
Georg Brandle27d0442010-08-01 20:54:30 +00003124 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003125 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3126 *p++ = '\\';
3127 *p++ = 'U';
3128 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
3129 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
3130 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
3131 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
3132 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
3133 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
3134 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
3135 *p++ = hexdigit[ucs & 0x0000000F];
3136 continue;
3137 }
3138 /* Fall through: isolated surrogates are copied as-is */
3139 s--;
3140 size++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003141 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003142#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003143
Guido van Rossumd57fd912000-03-10 22:53:23 +00003144 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003145 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003146 *p++ = '\\';
3147 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003148 *p++ = hexdigit[(ch >> 12) & 0x000F];
3149 *p++ = hexdigit[(ch >> 8) & 0x000F];
3150 *p++ = hexdigit[(ch >> 4) & 0x000F];
3151 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003152 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003153
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003154 /* Map special whitespace to '\t', \n', '\r' */
3155 else if (ch == '\t') {
3156 *p++ = '\\';
3157 *p++ = 't';
3158 }
3159 else if (ch == '\n') {
3160 *p++ = '\\';
3161 *p++ = 'n';
3162 }
3163 else if (ch == '\r') {
3164 *p++ = '\\';
3165 *p++ = 'r';
3166 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003167
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003168 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003169 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003170 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003171 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003172 *p++ = hexdigit[(ch >> 4) & 0x000F];
3173 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003174 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003175
Guido van Rossumd57fd912000-03-10 22:53:23 +00003176 /* Copy everything else as-is */
3177 else
3178 *p++ = (char) ch;
3179 }
3180 if (quotes)
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003181 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003182
3183 *p = '\0';
Benjamin Petersonbea424a2010-04-03 00:57:33 +00003184 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
3185 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003186 return repr;
3187}
3188
3189PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003190 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003191{
3192 return unicodeescape_string(s, size, 0);
3193}
3194
3195PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3196{
3197 if (!PyUnicode_Check(unicode)) {
3198 PyErr_BadArgument();
3199 return NULL;
3200 }
3201 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003202 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003203}
3204
3205/* --- Raw Unicode Escape Codec ------------------------------------------- */
3206
3207PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003208 Py_ssize_t size,
3209 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003210{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003211 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003212 Py_ssize_t startinpos;
3213 Py_ssize_t endinpos;
3214 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003215 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003216 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003217 const char *end;
3218 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003219 PyObject *errorHandler = NULL;
3220 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003221
Guido van Rossumd57fd912000-03-10 22:53:23 +00003222 /* Escaped strings will always be longer than the resulting
3223 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003224 length after conversion to the true value. (But decoding error
3225 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003226 v = _PyUnicode_New(size);
3227 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003228 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003229 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003230 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003231 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003232 end = s + size;
3233 while (s < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003234 unsigned char c;
3235 Py_UCS4 x;
3236 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003237 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003238
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003239 /* Non-escape characters are interpreted as Unicode ordinals */
3240 if (*s != '\\') {
3241 *p++ = (unsigned char)*s++;
3242 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003243 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003244 startinpos = s-starts;
3245
3246 /* \u-escapes are only interpreted iff the number of leading
3247 backslashes if odd */
3248 bs = s;
3249 for (;s < end;) {
3250 if (*s != '\\')
3251 break;
3252 *p++ = (unsigned char)*s++;
3253 }
3254 if (((s - bs) & 1) == 0 ||
3255 s >= end ||
3256 (*s != 'u' && *s != 'U')) {
3257 continue;
3258 }
3259 p--;
3260 count = *s=='u' ? 4 : 8;
3261 s++;
3262
3263 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3264 outpos = p-PyUnicode_AS_UNICODE(v);
3265 for (x = 0, i = 0; i < count; ++i, ++s) {
3266 c = (unsigned char)*s;
3267 if (!isxdigit(c)) {
3268 endinpos = s-starts;
3269 if (unicode_decode_call_errorhandler(
3270 errors, &errorHandler,
3271 "rawunicodeescape", "truncated \\uXXXX",
3272 starts, size, &startinpos, &endinpos, &exc, &s,
3273 &v, &outpos, &p))
3274 goto onError;
3275 goto nextByte;
3276 }
3277 x = (x<<4) & ~0xF;
3278 if (c >= '0' && c <= '9')
3279 x += c - '0';
3280 else if (c >= 'a' && c <= 'f')
3281 x += 10 + c - 'a';
3282 else
3283 x += 10 + c - 'A';
3284 }
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003285 if (x <= 0xffff)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003286 /* UCS-2 character */
3287 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003288 else if (x <= 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003289 /* UCS-4 character. Either store directly, or as
3290 surrogate pair. */
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003291#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003292 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003293#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003294 x -= 0x10000L;
3295 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3296 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003297#endif
3298 } else {
3299 endinpos = s-starts;
3300 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003301 if (unicode_decode_call_errorhandler(
3302 errors, &errorHandler,
3303 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003304 starts, size, &startinpos, &endinpos, &exc, &s,
3305 &v, &outpos, &p))
3306 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003307 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003308 nextByte:
3309 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003310 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00003311 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003312 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003313 Py_XDECREF(errorHandler);
3314 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003315 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003316
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003317 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003318 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003319 Py_XDECREF(errorHandler);
3320 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003321 return NULL;
3322}
3323
3324PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003325 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003326{
3327 PyObject *repr;
3328 char *p;
3329 char *q;
3330
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003331 static const char *hexdigit = "0123456789abcdef";
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003332#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00003333 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003334#else
Neal Norwitze7d8be82008-07-31 17:17:14 +00003335 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003336#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00003337
Neal Norwitze7d8be82008-07-31 17:17:14 +00003338 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003339 return PyErr_NoMemory();
Benjamin Peterson857ce152009-01-31 16:29:18 +00003340
Neal Norwitze7d8be82008-07-31 17:17:14 +00003341 repr = PyString_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003342 if (repr == NULL)
3343 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003344 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003345 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003346
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003347 p = q = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003348 while (size-- > 0) {
3349 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003350#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003351 /* Map 32-bit characters to '\Uxxxxxxxx' */
3352 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003353 *p++ = '\\';
3354 *p++ = 'U';
3355 *p++ = hexdigit[(ch >> 28) & 0xf];
3356 *p++ = hexdigit[(ch >> 24) & 0xf];
3357 *p++ = hexdigit[(ch >> 20) & 0xf];
3358 *p++ = hexdigit[(ch >> 16) & 0xf];
3359 *p++ = hexdigit[(ch >> 12) & 0xf];
3360 *p++ = hexdigit[(ch >> 8) & 0xf];
3361 *p++ = hexdigit[(ch >> 4) & 0xf];
3362 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003363 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003364 else
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003365#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003366 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3367 if (ch >= 0xD800 && ch < 0xDC00) {
3368 Py_UNICODE ch2;
3369 Py_UCS4 ucs;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003370
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003371 ch2 = *s++;
3372 size--;
Georg Brandle27d0442010-08-01 20:54:30 +00003373 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003374 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3375 *p++ = '\\';
3376 *p++ = 'U';
3377 *p++ = hexdigit[(ucs >> 28) & 0xf];
3378 *p++ = hexdigit[(ucs >> 24) & 0xf];
3379 *p++ = hexdigit[(ucs >> 20) & 0xf];
3380 *p++ = hexdigit[(ucs >> 16) & 0xf];
3381 *p++ = hexdigit[(ucs >> 12) & 0xf];
3382 *p++ = hexdigit[(ucs >> 8) & 0xf];
3383 *p++ = hexdigit[(ucs >> 4) & 0xf];
3384 *p++ = hexdigit[ucs & 0xf];
3385 continue;
3386 }
3387 /* Fall through: isolated surrogates are copied as-is */
3388 s--;
3389 size++;
3390 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003391#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003392 /* Map 16-bit characters to '\uxxxx' */
3393 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003394 *p++ = '\\';
3395 *p++ = 'u';
3396 *p++ = hexdigit[(ch >> 12) & 0xf];
3397 *p++ = hexdigit[(ch >> 8) & 0xf];
3398 *p++ = hexdigit[(ch >> 4) & 0xf];
3399 *p++ = hexdigit[ch & 15];
3400 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003401 /* Copy everything else as-is */
3402 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003403 *p++ = (char) ch;
3404 }
3405 *p = '\0';
Benjamin Petersonbea424a2010-04-03 00:57:33 +00003406 if (_PyString_Resize(&repr, p - q))
3407 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003408 return repr;
3409}
3410
3411PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3412{
3413 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003414 PyErr_BadArgument();
3415 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003416 }
3417 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003418 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003419}
3420
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003421/* --- Unicode Internal Codec ------------------------------------------- */
3422
3423PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003424 Py_ssize_t size,
3425 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003426{
3427 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003428 Py_ssize_t startinpos;
3429 Py_ssize_t endinpos;
3430 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003431 PyUnicodeObject *v;
3432 Py_UNICODE *p;
3433 const char *end;
3434 const char *reason;
3435 PyObject *errorHandler = NULL;
3436 PyObject *exc = NULL;
3437
Neal Norwitzd43069c2006-01-08 01:12:10 +00003438#ifdef Py_UNICODE_WIDE
3439 Py_UNICODE unimax = PyUnicode_GetMax();
3440#endif
3441
Armin Rigo7ccbca92006-10-04 12:17:45 +00003442 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003443 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3444 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003445 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003446 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003447 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003448 p = PyUnicode_AS_UNICODE(v);
3449 end = s + size;
3450
3451 while (s < end) {
Serhiy Storchakad5327d92013-02-07 16:23:11 +02003452 if (end-s < Py_UNICODE_SIZE) {
3453 endinpos = end-starts;
3454 reason = "truncated input";
3455 goto error;
3456 }
Neal Norwitz1004a532006-05-15 07:17:23 +00003457 memcpy(p, s, sizeof(Py_UNICODE));
Serhiy Storchakad5327d92013-02-07 16:23:11 +02003458#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003459 /* We have to sanity check the raw data, otherwise doom looms for
3460 some malformed UCS-4 data. */
Serhiy Storchakad5327d92013-02-07 16:23:11 +02003461 if (*p > unimax || *p < 0) {
3462 endinpos = s - starts + Py_UNICODE_SIZE;
3463 reason = "illegal code point (> 0x10FFFF)";
3464 goto error;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003465 }
Serhiy Storchakad5327d92013-02-07 16:23:11 +02003466#endif
3467 p++;
3468 s += Py_UNICODE_SIZE;
3469 continue;
3470
3471 error:
3472 startinpos = s - starts;
3473 outpos = p - PyUnicode_AS_UNICODE(v);
3474 if (unicode_decode_call_errorhandler(
3475 errors, &errorHandler,
3476 "unicode_internal", reason,
3477 starts, size, &startinpos, &endinpos, &exc, &s,
3478 &v, &outpos, &p)) {
3479 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003480 }
3481 }
3482
Martin v. Löwis412fb672006-04-13 06:34:32 +00003483 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003484 goto onError;
3485 Py_XDECREF(errorHandler);
3486 Py_XDECREF(exc);
3487 return (PyObject *)v;
3488
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003489 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003490 Py_XDECREF(v);
3491 Py_XDECREF(errorHandler);
3492 Py_XDECREF(exc);
3493 return NULL;
3494}
3495
Guido van Rossumd57fd912000-03-10 22:53:23 +00003496/* --- Latin-1 Codec ------------------------------------------------------ */
3497
3498PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003499 Py_ssize_t size,
3500 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003501{
3502 PyUnicodeObject *v;
3503 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003504
Guido van Rossumd57fd912000-03-10 22:53:23 +00003505 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003506 if (size == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003507 Py_UNICODE r = *(unsigned char*)s;
3508 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003509 }
3510
Guido van Rossumd57fd912000-03-10 22:53:23 +00003511 v = _PyUnicode_New(size);
3512 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003513 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003514 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003515 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003516 p = PyUnicode_AS_UNICODE(v);
3517 while (size-- > 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003518 *p++ = (unsigned char)*s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003519 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003520
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003521 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003522 Py_XDECREF(v);
3523 return NULL;
3524}
3525
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003526/* create or adjust a UnicodeEncodeError */
3527static void make_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003528 const char *encoding,
3529 const Py_UNICODE *unicode, Py_ssize_t size,
3530 Py_ssize_t startpos, Py_ssize_t endpos,
3531 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003532{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003533 if (*exceptionObject == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003534 *exceptionObject = PyUnicodeEncodeError_Create(
3535 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003536 }
3537 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003538 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3539 goto onError;
3540 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3541 goto onError;
3542 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3543 goto onError;
3544 return;
3545 onError:
Serhiy Storchaka98a97222014-02-09 13:14:04 +02003546 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003547 }
3548}
3549
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003550/* raises a UnicodeEncodeError */
3551static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003552 const char *encoding,
3553 const Py_UNICODE *unicode, Py_ssize_t size,
3554 Py_ssize_t startpos, Py_ssize_t endpos,
3555 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003556{
3557 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003558 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003559 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003560 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003561}
3562
3563/* error handling callback helper:
3564 build arguments, call the callback and check the arguments,
3565 put the result into newpos and return the replacement string, which
3566 has to be freed by the caller */
3567static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003568 PyObject **errorHandler,
3569 const char *encoding, const char *reason,
3570 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3571 Py_ssize_t startpos, Py_ssize_t endpos,
3572 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003573{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003574 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003575
3576 PyObject *restuple;
3577 PyObject *resunicode;
3578
3579 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003580 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003581 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003582 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003583 }
3584
3585 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003586 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003587 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003588 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003589
3590 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003591 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003592 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003593 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003594 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00003595 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003596 Py_DECREF(restuple);
3597 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003598 }
3599 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003600 &resunicode, newpos)) {
3601 Py_DECREF(restuple);
3602 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003603 }
3604 if (*newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003605 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003606 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003607 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3608 Py_DECREF(restuple);
3609 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003610 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003611 Py_INCREF(resunicode);
3612 Py_DECREF(restuple);
3613 return resunicode;
3614}
3615
3616static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003617 Py_ssize_t size,
3618 const char *errors,
3619 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003620{
3621 /* output object */
3622 PyObject *res;
3623 /* pointers to the beginning and end+1 of input */
3624 const Py_UNICODE *startp = p;
3625 const Py_UNICODE *endp = p + size;
3626 /* pointer to the beginning of the unencodable characters */
3627 /* const Py_UNICODE *badp = NULL; */
3628 /* pointer into the output */
3629 char *str;
3630 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003631 Py_ssize_t respos = 0;
3632 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00003633 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3634 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003635 PyObject *errorHandler = NULL;
3636 PyObject *exc = NULL;
3637 /* the following variable is used for caching string comparisons
3638 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3639 int known_errorHandler = -1;
3640
3641 /* allocate enough for a simple encoding without
3642 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003643 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003644 if (res == NULL)
3645 goto onError;
3646 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003647 return res;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003648 str = PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003649 ressize = size;
3650
3651 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003652 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003653
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003654 /* can we encode this? */
3655 if (c<limit) {
3656 /* no overflow check, because we know that the space is enough */
3657 *str++ = (char)c;
3658 ++p;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003659 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003660 else {
3661 Py_ssize_t unicodepos = p-startp;
3662 Py_ssize_t requiredsize;
3663 PyObject *repunicode;
3664 Py_ssize_t repsize;
3665 Py_ssize_t newpos;
3666 Py_ssize_t respos;
3667 Py_UNICODE *uni2;
3668 /* startpos for collecting unencodable chars */
3669 const Py_UNICODE *collstart = p;
3670 const Py_UNICODE *collend = p;
3671 /* find all unecodable characters */
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003672 while ((collend < endp) && ((*collend) >= limit))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003673 ++collend;
3674 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3675 if (known_errorHandler==-1) {
3676 if ((errors==NULL) || (!strcmp(errors, "strict")))
3677 known_errorHandler = 1;
3678 else if (!strcmp(errors, "replace"))
3679 known_errorHandler = 2;
3680 else if (!strcmp(errors, "ignore"))
3681 known_errorHandler = 3;
3682 else if (!strcmp(errors, "xmlcharrefreplace"))
3683 known_errorHandler = 4;
3684 else
3685 known_errorHandler = 0;
3686 }
3687 switch (known_errorHandler) {
3688 case 1: /* strict */
3689 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3690 goto onError;
3691 case 2: /* replace */
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003692 while (collstart++ < collend)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003693 *str++ = '?'; /* fall through */
3694 case 3: /* ignore */
3695 p = collend;
3696 break;
3697 case 4: /* xmlcharrefreplace */
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003698 respos = str - PyString_AS_STRING(res);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003699 /* determine replacement size (temporarily (mis)uses p) */
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003700 requiredsize = respos;
3701 for (p = collstart; p < collend;) {
Serhiy Storchakae822b032013-08-06 16:56:26 +03003702 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003703 Py_ssize_t incr;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003704 if (ch < 10)
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003705 incr = 2+1+1;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003706 else if (ch < 100)
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003707 incr = 2+2+1;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003708 else if (ch < 1000)
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003709 incr = 2+3+1;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003710 else if (ch < 10000)
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003711 incr = 2+4+1;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003712 else if (ch < 100000)
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003713 incr = 2+5+1;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003714 else if (ch < 1000000)
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003715 incr = 2+6+1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003716 else
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003717 incr = 2+7+1;
3718 if (requiredsize > PY_SSIZE_T_MAX - incr)
3719 goto overflow;
3720 requiredsize += incr;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003721 }
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003722 if (requiredsize > PY_SSIZE_T_MAX - (endp - collend))
3723 goto overflow;
3724 requiredsize += endp - collend;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003725 if (requiredsize > ressize) {
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003726 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003727 requiredsize = 2*ressize;
3728 if (_PyString_Resize(&res, requiredsize))
3729 goto onError;
3730 str = PyString_AS_STRING(res) + respos;
3731 ressize = requiredsize;
3732 }
3733 /* generate replacement (temporarily (mis)uses p) */
Serhiy Storchakae822b032013-08-06 16:56:26 +03003734 for (p = collstart; p < collend;) {
3735 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
3736 str += sprintf(str, "&#%d;", (int)ch);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003737 }
3738 p = collend;
3739 break;
3740 default:
3741 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3742 encoding, reason, startp, size, &exc,
3743 collstart-startp, collend-startp, &newpos);
3744 if (repunicode == NULL)
3745 goto onError;
3746 /* need more space? (at least enough for what we have+the
3747 replacement+the rest of the string, so we won't have to
3748 check space for encodable characters) */
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003749 respos = str - PyString_AS_STRING(res);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003750 repsize = PyUnicode_GET_SIZE(repunicode);
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003751 if (respos > PY_SSIZE_T_MAX - repsize)
3752 goto overflow;
3753 requiredsize = respos + repsize;
3754 if (requiredsize > PY_SSIZE_T_MAX - (endp - collend))
3755 goto overflow;
3756 requiredsize += endp - collend;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003757 if (requiredsize > ressize) {
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003758 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003759 requiredsize = 2*ressize;
3760 if (_PyString_Resize(&res, requiredsize)) {
3761 Py_DECREF(repunicode);
3762 goto onError;
3763 }
3764 str = PyString_AS_STRING(res) + respos;
3765 ressize = requiredsize;
3766 }
3767 /* check if there is anything unencodable in the replacement
3768 and copy it to the output */
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003769 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2, ++str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003770 c = *uni2;
3771 if (c >= limit) {
3772 raise_encode_exception(&exc, encoding, startp, size,
3773 unicodepos, unicodepos+1, reason);
3774 Py_DECREF(repunicode);
3775 goto onError;
3776 }
3777 *str = (char)c;
3778 }
3779 p = startp + newpos;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003780 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00003781 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00003782 }
3783 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003784 /* Resize if we allocated to much */
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003785 respos = str - PyString_AS_STRING(res);
3786 if (respos < ressize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003787 /* If this falls res will be NULL */
3788 _PyString_Resize(&res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003789 Py_XDECREF(errorHandler);
3790 Py_XDECREF(exc);
3791 return res;
3792
Benjamin Petersoned4c1302014-09-29 18:18:57 -04003793 overflow:
3794 PyErr_SetString(PyExc_OverflowError,
3795 "encoded result is too long for a Python string");
3796
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003797 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003798 Py_XDECREF(res);
3799 Py_XDECREF(errorHandler);
3800 Py_XDECREF(exc);
3801 return NULL;
3802}
3803
Guido van Rossumd57fd912000-03-10 22:53:23 +00003804PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003805 Py_ssize_t size,
3806 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003807{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003808 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003809}
3810
3811PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3812{
3813 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003814 PyErr_BadArgument();
3815 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003816 }
3817 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003818 PyUnicode_GET_SIZE(unicode),
3819 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003820}
3821
3822/* --- 7-bit ASCII Codec -------------------------------------------------- */
3823
Guido van Rossumd57fd912000-03-10 22:53:23 +00003824PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003825 Py_ssize_t size,
3826 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003827{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003828 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003829 PyUnicodeObject *v;
3830 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003831 Py_ssize_t startinpos;
3832 Py_ssize_t endinpos;
3833 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003834 const char *e;
3835 PyObject *errorHandler = NULL;
3836 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003837
Guido van Rossumd57fd912000-03-10 22:53:23 +00003838 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003839 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003840 Py_UNICODE r = *(unsigned char*)s;
3841 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003842 }
Tim Petersced69f82003-09-16 20:30:58 +00003843
Guido van Rossumd57fd912000-03-10 22:53:23 +00003844 v = _PyUnicode_New(size);
3845 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003846 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003847 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003848 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003849 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003850 e = s + size;
3851 while (s < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003852 register unsigned char c = (unsigned char)*s;
3853 if (c < 128) {
3854 *p++ = c;
3855 ++s;
3856 }
3857 else {
3858 startinpos = s-starts;
3859 endinpos = startinpos + 1;
3860 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3861 if (unicode_decode_call_errorhandler(
3862 errors, &errorHandler,
3863 "ascii", "ordinal not in range(128)",
3864 starts, size, &startinpos, &endinpos, &exc, &s,
3865 &v, &outpos, &p))
3866 goto onError;
3867 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003868 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003869 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003870 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3871 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003872 Py_XDECREF(errorHandler);
3873 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003874 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003875
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003876 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003877 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003878 Py_XDECREF(errorHandler);
3879 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003880 return NULL;
3881}
3882
Guido van Rossumd57fd912000-03-10 22:53:23 +00003883PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003884 Py_ssize_t size,
3885 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003886{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003887 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003888}
3889
3890PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3891{
3892 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003893 PyErr_BadArgument();
3894 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003895 }
3896 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003897 PyUnicode_GET_SIZE(unicode),
3898 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003899}
3900
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003901#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003902
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003903/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003904
Hirokazu Yamamoto52a34922009-03-21 10:32:52 +00003905#if SIZEOF_INT < SIZEOF_SIZE_T
Martin v. Löwisd8251432006-06-14 05:21:04 +00003906#define NEED_RETRY
3907#endif
3908
3909/* XXX This code is limited to "true" double-byte encodings, as
3910 a) it assumes an incomplete character consists of a single byte, and
3911 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003912 encodings, see IsDBCSLeadByteEx documentation. */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003913
/* Return 1 if the byte at s[offset] is treated as a DBCS lead byte,
   0 otherwise.

   The byte itself must satisfy IsDBCSLeadByte().  It is then accepted
   when CharPrev() cannot step back (it returns the same pointer), when
   the preceding character does not start with a lead byte, or when the
   preceding character is two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return (curr - prev == 2);
}
3924
3925/*
3926 * Decode MBCS string into unicode object. If 'final' is set, converts
3927 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3928 */
3929static int decode_mbcs(PyUnicodeObject **v,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003930 const char *s, /* MBCS string */
3931 int size, /* sizeof MBCS string */
3932 int final)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003933{
3934 Py_UNICODE *p;
3935 Py_ssize_t n = 0;
3936 int usize = 0;
3937
3938 assert(size >= 0);
3939
3940 /* Skip trailing lead-byte unless 'final' is set */
3941 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003942 --size;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003943
3944 /* First get the size of the result */
3945 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003946 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3947 if (usize == 0) {
3948 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3949 return -1;
3950 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003951 }
3952
3953 if (*v == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003954 /* Create unicode object */
3955 *v = _PyUnicode_New(usize);
3956 if (*v == NULL)
3957 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003958 }
3959 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003960 /* Extend unicode object */
3961 n = PyUnicode_GET_SIZE(*v);
3962 if (_PyUnicode_Resize(v, n + usize) < 0)
3963 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003964 }
3965
3966 /* Do the conversion */
3967 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003968 p = PyUnicode_AS_UNICODE(*v) + n;
3969 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3970 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3971 return -1;
3972 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003973 }
3974
3975 return size;
3976}
3977
3978PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003979 Py_ssize_t size,
3980 const char *errors,
3981 Py_ssize_t *consumed)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003982{
3983 PyUnicodeObject *v = NULL;
3984 int done;
3985
3986 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003987 *consumed = 0;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003988
3989#ifdef NEED_RETRY
3990 retry:
3991 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003992 done = decode_mbcs(&v, s, INT_MAX, 0);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003993 else
3994#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003995 done = decode_mbcs(&v, s, (int)size, !consumed);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003996
3997 if (done < 0) {
3998 Py_XDECREF(v);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003999 return NULL;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004000 }
4001
4002 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004003 *consumed += done;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004004
4005#ifdef NEED_RETRY
4006 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004007 s += done;
4008 size -= done;
4009 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004010 }
4011#endif
4012
4013 return (PyObject *)v;
4014}
4015
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004016PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004017 Py_ssize_t size,
4018 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004019{
Martin v. Löwisd8251432006-06-14 05:21:04 +00004020 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4021}
4022
4023/*
4024 * Convert unicode into string object (MBCS).
4025 * Returns 0 if succeed, -1 otherwise.
4026 */
4027static int encode_mbcs(PyObject **repr,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004028 const Py_UNICODE *p, /* unicode */
4029 int size) /* size of unicode */
Martin v. Löwisd8251432006-06-14 05:21:04 +00004030{
4031 int mbcssize = 0;
4032 Py_ssize_t n = 0;
4033
4034 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004035
4036 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00004037 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004038 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4039 if (mbcssize == 0) {
4040 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4041 return -1;
4042 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004043 }
4044
Martin v. Löwisd8251432006-06-14 05:21:04 +00004045 if (*repr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004046 /* Create string object */
4047 *repr = PyString_FromStringAndSize(NULL, mbcssize);
4048 if (*repr == NULL)
4049 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004050 }
4051 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004052 /* Extend string object */
4053 n = PyString_Size(*repr);
4054 if (_PyString_Resize(repr, n + mbcssize) < 0)
4055 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004056 }
4057
4058 /* Do the conversion */
4059 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004060 char *s = PyString_AS_STRING(*repr) + n;
4061 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4062 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4063 return -1;
4064 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00004065 }
4066
4067 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004068}
4069
4070PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004071 Py_ssize_t size,
4072 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004073{
Martin v. Löwisd8251432006-06-14 05:21:04 +00004074 PyObject *repr = NULL;
4075 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004076
Martin v. Löwisd8251432006-06-14 05:21:04 +00004077#ifdef NEED_RETRY
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004078 retry:
Martin v. Löwisd8251432006-06-14 05:21:04 +00004079 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004080 ret = encode_mbcs(&repr, p, INT_MAX);
Martin v. Löwisd8251432006-06-14 05:21:04 +00004081 else
4082#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004083 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004084
Martin v. Löwisd8251432006-06-14 05:21:04 +00004085 if (ret < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004086 Py_XDECREF(repr);
4087 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004088 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00004089
4090#ifdef NEED_RETRY
4091 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004092 p += INT_MAX;
4093 size -= INT_MAX;
4094 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004095 }
4096#endif
4097
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004098 return repr;
4099}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004100
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004101PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4102{
4103 if (!PyUnicode_Check(unicode)) {
4104 PyErr_BadArgument();
4105 return NULL;
4106 }
4107 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004108 PyUnicode_GET_SIZE(unicode),
4109 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004110}
4111
Martin v. Löwisd8251432006-06-14 05:21:04 +00004112#undef NEED_RETRY
4113
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004114#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004115
Guido van Rossumd57fd912000-03-10 22:53:23 +00004116/* --- Character Mapping Codec -------------------------------------------- */
4117
Guido van Rossumd57fd912000-03-10 22:53:23 +00004118PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004119 Py_ssize_t size,
4120 PyObject *mapping,
4121 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004122{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004123 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004124 Py_ssize_t startinpos;
4125 Py_ssize_t endinpos;
4126 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004127 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004128 PyUnicodeObject *v;
4129 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004130 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004131 PyObject *errorHandler = NULL;
4132 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004133 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004134 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004135
Guido van Rossumd57fd912000-03-10 22:53:23 +00004136 /* Default to Latin-1 */
4137 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004138 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004139
4140 v = _PyUnicode_New(size);
4141 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004142 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004143 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004144 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004145 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004146 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004147 if (PyUnicode_CheckExact(mapping)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004148 mapstring = PyUnicode_AS_UNICODE(mapping);
4149 maplen = PyUnicode_GET_SIZE(mapping);
4150 while (s < e) {
4151 unsigned char ch = *s;
4152 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004153
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004154 if (ch < maplen)
4155 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004156
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004157 if (x == 0xfffe) {
4158 /* undefined mapping */
4159 outpos = p-PyUnicode_AS_UNICODE(v);
4160 startinpos = s-starts;
4161 endinpos = startinpos+1;
4162 if (unicode_decode_call_errorhandler(
4163 errors, &errorHandler,
4164 "charmap", "character maps to <undefined>",
4165 starts, size, &startinpos, &endinpos, &exc, &s,
4166 &v, &outpos, &p)) {
4167 goto onError;
4168 }
4169 continue;
4170 }
4171 *p++ = x;
4172 ++s;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004173 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004174 }
4175 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004176 while (s < e) {
4177 unsigned char ch = *s;
4178 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004179
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004180 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4181 w = PyInt_FromLong((long)ch);
4182 if (w == NULL)
4183 goto onError;
4184 x = PyObject_GetItem(mapping, w);
4185 Py_DECREF(w);
4186 if (x == NULL) {
4187 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4188 /* No mapping found means: mapping is undefined. */
4189 PyErr_Clear();
Serhiy Storchaka95997452013-01-15 14:42:59 +02004190 goto Undefined;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004191 } else
4192 goto onError;
4193 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004194
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004195 /* Apply mapping */
Serhiy Storchaka95997452013-01-15 14:42:59 +02004196 if (x == Py_None)
4197 goto Undefined;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004198 if (PyInt_Check(x)) {
4199 long value = PyInt_AS_LONG(x);
Serhiy Storchaka95997452013-01-15 14:42:59 +02004200 if (value == 0xFFFE)
4201 goto Undefined;
Antoine Pitroue3ae3212012-11-17 21:14:58 +01004202 if (value < 0 || value > 0x10FFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004203 PyErr_SetString(PyExc_TypeError,
Antoine Pitroue3ae3212012-11-17 21:14:58 +01004204 "character mapping must be in range(0x110000)");
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004205 Py_DECREF(x);
4206 goto onError;
4207 }
Antoine Pitroue3ae3212012-11-17 21:14:58 +01004208
4209#ifndef Py_UNICODE_WIDE
4210 if (value > 0xFFFF) {
4211 /* see the code for 1-n mapping below */
4212 if (extrachars < 2) {
4213 /* resize first */
4214 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4215 Py_ssize_t needed = 10 - extrachars;
4216 extrachars += needed;
4217 /* XXX overflow detection missing */
4218 if (_PyUnicode_Resize(&v,
4219 PyUnicode_GET_SIZE(v) + needed) < 0) {
4220 Py_DECREF(x);
4221 goto onError;
4222 }
4223 p = PyUnicode_AS_UNICODE(v) + oldpos;
4224 }
4225 value -= 0x10000;
4226 *p++ = 0xD800 | (value >> 10);
4227 *p++ = 0xDC00 | (value & 0x3FF);
4228 extrachars -= 2;
4229 }
4230 else
4231#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004232 *p++ = (Py_UNICODE)value;
4233 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004234 else if (PyUnicode_Check(x)) {
4235 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004236
Serhiy Storchaka95997452013-01-15 14:42:59 +02004237 if (targetsize == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004238 /* 1-1 mapping */
Serhiy Storchaka95997452013-01-15 14:42:59 +02004239 Py_UNICODE value = *PyUnicode_AS_UNICODE(x);
4240 if (value == 0xFFFE)
4241 goto Undefined;
4242 *p++ = value;
4243 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004244 else if (targetsize > 1) {
4245 /* 1-n mapping */
4246 if (targetsize > extrachars) {
4247 /* resize first */
4248 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4249 Py_ssize_t needed = (targetsize - extrachars) + \
4250 (targetsize << 2);
4251 extrachars += needed;
4252 /* XXX overflow detection missing */
4253 if (_PyUnicode_Resize(&v,
4254 PyUnicode_GET_SIZE(v) + needed) < 0) {
4255 Py_DECREF(x);
4256 goto onError;
4257 }
4258 p = PyUnicode_AS_UNICODE(v) + oldpos;
4259 }
4260 Py_UNICODE_COPY(p,
4261 PyUnicode_AS_UNICODE(x),
4262 targetsize);
4263 p += targetsize;
4264 extrachars -= targetsize;
4265 }
4266 /* 1-0 mapping: skip the character */
4267 }
4268 else {
4269 /* wrong return value */
4270 PyErr_SetString(PyExc_TypeError,
4271 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004272 Py_DECREF(x);
4273 goto onError;
4274 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004275 Py_DECREF(x);
4276 ++s;
Serhiy Storchaka95997452013-01-15 14:42:59 +02004277 continue;
4278Undefined:
4279 /* undefined mapping */
4280 Py_XDECREF(x);
4281 outpos = p-PyUnicode_AS_UNICODE(v);
4282 startinpos = s-starts;
4283 endinpos = startinpos+1;
4284 if (unicode_decode_call_errorhandler(
4285 errors, &errorHandler,
4286 "charmap", "character maps to <undefined>",
4287 starts, size, &startinpos, &endinpos, &exc, &s,
4288 &v, &outpos, &p)) {
4289 goto onError;
4290 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004291 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004292 }
4293 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004294 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4295 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004296 Py_XDECREF(errorHandler);
4297 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004298 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004299
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004300 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004301 Py_XDECREF(errorHandler);
4302 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004303 Py_XDECREF(v);
4304 return NULL;
4305}
4306
Martin v. Löwis3f767792006-06-04 19:36:28 +00004307/* Charmap encoding: the lookup table */
4308
4309struct encoding_map{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004310 PyObject_HEAD
4311 unsigned char level1[32];
4312 int count2, count3;
4313 unsigned char level23[1];
Martin v. Löwis3f767792006-06-04 19:36:28 +00004314};
4315
4316static PyObject*
4317encoding_map_size(PyObject *obj, PyObject* args)
4318{
4319 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004320 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Martin v. Löwis3f767792006-06-04 19:36:28 +00004321 128*map->count3);
4322}
4323
4324static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004325 {"size", encoding_map_size, METH_NOARGS,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004326 PyDoc_STR("Return the size (in bytes) of this object") },
4327 { 0 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004328};
4329
4330static void
4331encoding_map_dealloc(PyObject* o)
4332{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004333 PyObject_FREE(o);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004334}
4335
4336static PyTypeObject EncodingMapType = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004337 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004338 "EncodingMap", /*tp_name*/
4339 sizeof(struct encoding_map), /*tp_basicsize*/
4340 0, /*tp_itemsize*/
4341 /* methods */
4342 encoding_map_dealloc, /*tp_dealloc*/
4343 0, /*tp_print*/
4344 0, /*tp_getattr*/
4345 0, /*tp_setattr*/
4346 0, /*tp_compare*/
4347 0, /*tp_repr*/
4348 0, /*tp_as_number*/
4349 0, /*tp_as_sequence*/
4350 0, /*tp_as_mapping*/
4351 0, /*tp_hash*/
4352 0, /*tp_call*/
4353 0, /*tp_str*/
4354 0, /*tp_getattro*/
4355 0, /*tp_setattro*/
4356 0, /*tp_as_buffer*/
4357 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4358 0, /*tp_doc*/
4359 0, /*tp_traverse*/
4360 0, /*tp_clear*/
4361 0, /*tp_richcompare*/
4362 0, /*tp_weaklistoffset*/
4363 0, /*tp_iter*/
4364 0, /*tp_iternext*/
4365 encoding_map_methods, /*tp_methods*/
4366 0, /*tp_members*/
4367 0, /*tp_getset*/
4368 0, /*tp_base*/
4369 0, /*tp_dict*/
4370 0, /*tp_descr_get*/
4371 0, /*tp_descr_set*/
4372 0, /*tp_dictoffset*/
4373 0, /*tp_init*/
4374 0, /*tp_alloc*/
4375 0, /*tp_new*/
4376 0, /*tp_free*/
4377 0, /*tp_is_gc*/
Martin v. Löwis3f767792006-06-04 19:36:28 +00004378};
4379
4380PyObject*
4381PyUnicode_BuildEncodingMap(PyObject* string)
4382{
4383 Py_UNICODE *decode;
4384 PyObject *result;
4385 struct encoding_map *mresult;
4386 int i;
4387 int need_dict = 0;
4388 unsigned char level1[32];
4389 unsigned char level2[512];
4390 unsigned char *mlevel1, *mlevel2, *mlevel3;
4391 int count2 = 0, count3 = 0;
4392
4393 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4394 PyErr_BadArgument();
4395 return NULL;
4396 }
4397 decode = PyUnicode_AS_UNICODE(string);
4398 memset(level1, 0xFF, sizeof level1);
4399 memset(level2, 0xFF, sizeof level2);
4400
4401 /* If there isn't a one-to-one mapping of NULL to \0,
4402 or if there are non-BMP characters, we need to use
4403 a mapping dictionary. */
4404 if (decode[0] != 0)
4405 need_dict = 1;
4406 for (i = 1; i < 256; i++) {
4407 int l1, l2;
4408 if (decode[i] == 0
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004409#ifdef Py_UNICODE_WIDE
Martin v. Löwis3f767792006-06-04 19:36:28 +00004410 || decode[i] > 0xFFFF
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004411#endif
4412 ) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004413 need_dict = 1;
4414 break;
4415 }
4416 if (decode[i] == 0xFFFE)
4417 /* unmapped character */
4418 continue;
4419 l1 = decode[i] >> 11;
4420 l2 = decode[i] >> 7;
4421 if (level1[l1] == 0xFF)
4422 level1[l1] = count2++;
4423 if (level2[l2] == 0xFF)
Benjamin Peterson857ce152009-01-31 16:29:18 +00004424 level2[l2] = count3++;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004425 }
4426
4427 if (count2 >= 0xFF || count3 >= 0xFF)
4428 need_dict = 1;
4429
4430 if (need_dict) {
4431 PyObject *result = PyDict_New();
4432 PyObject *key, *value;
4433 if (!result)
4434 return NULL;
4435 for (i = 0; i < 256; i++) {
Brett Cannona7f13ee2010-05-04 01:16:51 +00004436 value = NULL;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004437 key = PyInt_FromLong(decode[i]);
4438 value = PyInt_FromLong(i);
4439 if (!key || !value)
4440 goto failed1;
4441 if (PyDict_SetItem(result, key, value) == -1)
4442 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00004443 Py_DECREF(key);
4444 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004445 }
4446 return result;
4447 failed1:
4448 Py_XDECREF(key);
4449 Py_XDECREF(value);
4450 Py_DECREF(result);
4451 return NULL;
4452 }
4453
4454 /* Create a three-level trie */
4455 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4456 16*count2 + 128*count3 - 1);
4457 if (!result)
4458 return PyErr_NoMemory();
4459 PyObject_Init(result, &EncodingMapType);
4460 mresult = (struct encoding_map*)result;
4461 mresult->count2 = count2;
4462 mresult->count3 = count3;
4463 mlevel1 = mresult->level1;
4464 mlevel2 = mresult->level23;
4465 mlevel3 = mresult->level23 + 16*count2;
4466 memcpy(mlevel1, level1, 32);
4467 memset(mlevel2, 0xFF, 16*count2);
4468 memset(mlevel3, 0, 128*count3);
4469 count3 = 0;
4470 for (i = 1; i < 256; i++) {
4471 int o1, o2, o3, i2, i3;
4472 if (decode[i] == 0xFFFE)
4473 /* unmapped character */
4474 continue;
4475 o1 = decode[i]>>11;
4476 o2 = (decode[i]>>7) & 0xF;
4477 i2 = 16*mlevel1[o1] + o2;
4478 if (mlevel2[i2] == 0xFF)
4479 mlevel2[i2] = count3++;
4480 o3 = decode[i] & 0x7F;
4481 i3 = 128*mlevel2[i2] + o3;
4482 mlevel3[i3] = i;
4483 }
4484 return result;
4485}
4486
4487static int
4488encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4489{
4490 struct encoding_map *map = (struct encoding_map*)mapping;
4491 int l1 = c>>11;
4492 int l2 = (c>>7) & 0xF;
4493 int l3 = c & 0x7F;
4494 int i;
4495
4496#ifdef Py_UNICODE_WIDE
4497 if (c > 0xFFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004498 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004499 }
4500#endif
4501 if (c == 0)
4502 return 0;
4503 /* level 1*/
4504 i = map->level1[l1];
4505 if (i == 0xFF) {
4506 return -1;
4507 }
4508 /* level 2*/
4509 i = map->level23[16*i+l2];
4510 if (i == 0xFF) {
4511 return -1;
4512 }
4513 /* level 3 */
4514 i = map->level23[16*map->count2 + 128*i + l3];
4515 if (i == 0) {
4516 return -1;
4517 }
4518 return i;
4519}
4520
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004521/* Lookup the character ch in the mapping. If the character
4522 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004523 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004524static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004525{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004526 PyObject *w = PyInt_FromLong((long)c);
4527 PyObject *x;
4528
4529 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004530 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004531 x = PyObject_GetItem(mapping, w);
4532 Py_DECREF(w);
4533 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004534 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4535 /* No mapping found means: mapping is undefined. */
4536 PyErr_Clear();
4537 x = Py_None;
4538 Py_INCREF(x);
4539 return x;
4540 } else
4541 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004542 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004543 else if (x == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004544 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004545 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004546 long value = PyInt_AS_LONG(x);
4547 if (value < 0 || value > 255) {
4548 PyErr_SetString(PyExc_TypeError,
4549 "character mapping must be in range(256)");
4550 Py_DECREF(x);
4551 return NULL;
4552 }
4553 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004554 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004555 else if (PyString_Check(x))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004556 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004557 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004558 /* wrong return value */
4559 PyErr_SetString(PyExc_TypeError,
4560 "character mapping must return integer, None or str");
4561 Py_DECREF(x);
4562 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004563 }
4564}
4565
Martin v. Löwis3f767792006-06-04 19:36:28 +00004566static int
4567charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4568{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004569 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4570 /* exponentially overallocate to minimize reallocations */
4571 if (requiredsize < 2*outsize)
4572 requiredsize = 2*outsize;
4573 if (_PyString_Resize(outobj, requiredsize)) {
4574 return 0;
4575 }
4576 return 1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004577}
4578
/* Outcome of trying to encode one character with charmapencode_output() */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* a lookup or resize error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004582/* lookup the character, put the result in the output string and adjust
4583 various state variables. Reallocate the output string if not enough
4584 space is available. Return a new reference to the object that
4585 was put in the output buffer, or Py_None, if the mapping was undefined
4586 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004587 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004588static
Martin v. Löwis3f767792006-06-04 19:36:28 +00004589charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004590 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004591{
Martin v. Löwis3f767792006-06-04 19:36:28 +00004592 PyObject *rep;
4593 char *outstart;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004594 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004595
Christian Heimese93237d2007-12-19 02:37:44 +00004596 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004597 int res = encoding_map_lookup(c, mapping);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004598 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004599 if (res == -1)
4600 return enc_FAILED;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004601 if (outsize<requiredsize)
4602 if (!charmapencode_resize(outobj, outpos, requiredsize))
4603 return enc_EXCEPTION;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004604 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004605 outstart[(*outpos)++] = (char)res;
4606 return enc_SUCCESS;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004607 }
4608
4609 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004610 if (rep==NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004611 return enc_EXCEPTION;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004612 else if (rep==Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004613 Py_DECREF(rep);
4614 return enc_FAILED;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004615 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004616 if (PyInt_Check(rep)) {
4617 Py_ssize_t requiredsize = *outpos+1;
4618 if (outsize<requiredsize)
4619 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4620 Py_DECREF(rep);
4621 return enc_EXCEPTION;
4622 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004623 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004624 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004625 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004626 else {
4627 const char *repchars = PyString_AS_STRING(rep);
4628 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4629 Py_ssize_t requiredsize = *outpos+repsize;
4630 if (outsize<requiredsize)
4631 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4632 Py_DECREF(rep);
4633 return enc_EXCEPTION;
4634 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004635 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004636 memcpy(outstart + *outpos, repchars, repsize);
4637 *outpos += repsize;
4638 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004639 }
Georg Brandl9f167602006-06-04 21:46:16 +00004640 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004641 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004642}
4643
4644/* handle an error in PyUnicode_EncodeCharmap
4645 Return 0 on success, -1 on error */
4646static
4647int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004648 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004649 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004650 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004651 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004652{
4653 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004654 Py_ssize_t repsize;
4655 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004656 Py_UNICODE *uni2;
4657 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004658 Py_ssize_t collstartpos = *inpos;
4659 Py_ssize_t collendpos = *inpos+1;
4660 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004661 char *encoding = "charmap";
4662 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00004663 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004664
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004665 /* find all unencodable characters */
4666 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004667 PyObject *rep;
Christian Heimese93237d2007-12-19 02:37:44 +00004668 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004669 int res = encoding_map_lookup(p[collendpos], mapping);
4670 if (res != -1)
4671 break;
4672 ++collendpos;
4673 continue;
4674 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004675
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004676 rep = charmapencode_lookup(p[collendpos], mapping);
4677 if (rep==NULL)
4678 return -1;
4679 else if (rep!=Py_None) {
4680 Py_DECREF(rep);
4681 break;
4682 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004683 Py_DECREF(rep);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004684 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004685 }
4686 /* cache callback name lookup
4687 * (if not done yet, i.e. it's the first error) */
4688 if (*known_errorHandler==-1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004689 if ((errors==NULL) || (!strcmp(errors, "strict")))
4690 *known_errorHandler = 1;
4691 else if (!strcmp(errors, "replace"))
4692 *known_errorHandler = 2;
4693 else if (!strcmp(errors, "ignore"))
4694 *known_errorHandler = 3;
4695 else if (!strcmp(errors, "xmlcharrefreplace"))
4696 *known_errorHandler = 4;
4697 else
4698 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004699 }
4700 switch (*known_errorHandler) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004701 case 1: /* strict */
4702 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4703 return -1;
4704 case 2: /* replace */
4705 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004706 x = charmapencode_output('?', mapping, res, respos);
4707 if (x==enc_EXCEPTION) {
4708 return -1;
4709 }
4710 else if (x==enc_FAILED) {
4711 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4712 return -1;
4713 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004714 }
4715 /* fall through */
4716 case 3: /* ignore */
4717 *inpos = collendpos;
4718 break;
4719 case 4: /* xmlcharrefreplace */
Serhiy Storchakae822b032013-08-06 16:56:26 +03004720 /* generate replacement */
4721 for (collpos = collstartpos; collpos < collendpos;) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004722 char buffer[2+29+1+1];
4723 char *cp;
Serhiy Storchakae822b032013-08-06 16:56:26 +03004724 Py_UCS4 ch = p[collpos++];
4725#ifndef Py_UNICODE_WIDE
4726 if ((0xD800 <= ch && ch <= 0xDBFF) &&
4727 (collpos < collendpos) &&
4728 (0xDC00 <= p[collpos] && p[collpos] <= 0xDFFF)) {
4729 ch = ((((ch & 0x03FF) << 10) |
4730 ((Py_UCS4)p[collpos++] & 0x03FF)) + 0x10000);
4731 }
4732#endif
4733 sprintf(buffer, "&#%d;", (int)ch);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004734 for (cp = buffer; *cp; ++cp) {
4735 x = charmapencode_output(*cp, mapping, res, respos);
4736 if (x==enc_EXCEPTION)
4737 return -1;
4738 else if (x==enc_FAILED) {
4739 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4740 return -1;
4741 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004742 }
4743 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004744 *inpos = collendpos;
4745 break;
4746 default:
4747 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004748 encoding, reason, p, size, exceptionObject,
4749 collstartpos, collendpos, &newpos);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004750 if (repunicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004751 return -1;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004752 /* generate replacement */
4753 repsize = PyUnicode_GET_SIZE(repunicode);
4754 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004755 x = charmapencode_output(*uni2, mapping, res, respos);
4756 if (x==enc_EXCEPTION) {
4757 return -1;
4758 }
4759 else if (x==enc_FAILED) {
4760 Py_DECREF(repunicode);
4761 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4762 return -1;
4763 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004764 }
4765 *inpos = newpos;
4766 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004767 }
4768 return 0;
4769}
4770
Guido van Rossumd57fd912000-03-10 22:53:23 +00004771PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004772 Py_ssize_t size,
4773 PyObject *mapping,
4774 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004775{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004776 /* output object */
4777 PyObject *res = NULL;
4778 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004779 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004780 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004781 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004782 PyObject *errorHandler = NULL;
4783 PyObject *exc = NULL;
4784 /* the following variable is used for caching string comparisons
4785 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4786 * 3=ignore, 4=xmlcharrefreplace */
4787 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004788
4789 /* Default to Latin-1 */
4790 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004791 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004792
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004793 /* allocate enough for a simple encoding without
4794 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004795 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004796 if (res == NULL)
4797 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004798 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004799 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004800
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004801 while (inpos<size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004802 /* try to encode it */
4803 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4804 if (x==enc_EXCEPTION) /* error */
4805 goto onError;
4806 if (x==enc_FAILED) { /* unencodable character */
4807 if (charmap_encoding_error(p, size, &inpos, mapping,
4808 &exc,
4809 &known_errorHandler, &errorHandler, errors,
4810 &res, &respos)) {
4811 goto onError;
4812 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004813 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004814 else
4815 /* done with this character => adjust input position */
4816 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004817 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004818
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004819 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004820 if (respos<PyString_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004821 if (_PyString_Resize(&res, respos))
4822 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004823 }
4824 Py_XDECREF(exc);
4825 Py_XDECREF(errorHandler);
4826 return res;
4827
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004828 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004829 Py_XDECREF(res);
4830 Py_XDECREF(exc);
4831 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004832 return NULL;
4833}
4834
4835PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004836 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004837{
4838 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004839 PyErr_BadArgument();
4840 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004841 }
4842 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004843 PyUnicode_GET_SIZE(unicode),
4844 mapping,
4845 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004846}
4847
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004848/* create or adjust a UnicodeTranslateError */
4849static void make_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004850 const Py_UNICODE *unicode, Py_ssize_t size,
4851 Py_ssize_t startpos, Py_ssize_t endpos,
4852 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004853{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004854 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004855 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004856 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004857 }
4858 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004859 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4860 goto onError;
4861 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4862 goto onError;
4863 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4864 goto onError;
4865 return;
4866 onError:
Serhiy Storchaka98a97222014-02-09 13:14:04 +02004867 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004868 }
4869}
4870
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004871/* raises a UnicodeTranslateError */
4872static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004873 const Py_UNICODE *unicode, Py_ssize_t size,
4874 Py_ssize_t startpos, Py_ssize_t endpos,
4875 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004876{
4877 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004878 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004879 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004880 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004881}
4882
4883/* error handling callback helper:
4884 build arguments, call the callback and check the arguments,
4885 put the result into newpos and return the replacement string, which
4886 has to be freed by the caller */
4887static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004888 PyObject **errorHandler,
4889 const char *reason,
4890 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4891 Py_ssize_t startpos, Py_ssize_t endpos,
4892 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004893{
Martin v. Löwis412fb672006-04-13 06:34:32 +00004894 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004895
Martin v. Löwis412fb672006-04-13 06:34:32 +00004896 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004897 PyObject *restuple;
4898 PyObject *resunicode;
4899
4900 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004901 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004902 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004903 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004904 }
4905
4906 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004907 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004908 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004909 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004910
4911 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004912 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004913 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004914 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004915 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00004916 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004917 Py_DECREF(restuple);
4918 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004919 }
4920 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004921 &resunicode, &i_newpos)) {
4922 Py_DECREF(restuple);
4923 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004924 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004925 if (i_newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004926 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004927 else
4928 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004929 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004930 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4931 Py_DECREF(restuple);
4932 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004933 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004934 Py_INCREF(resunicode);
4935 Py_DECREF(restuple);
4936 return resunicode;
4937}
4938
4939/* Lookup the character ch in the mapping and put the result in result,
4940 which must be decrefed by the caller.
4941 Return 0 on success, -1 on error */
4942static
4943int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4944{
4945 PyObject *w = PyInt_FromLong((long)c);
4946 PyObject *x;
4947
4948 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004949 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004950 x = PyObject_GetItem(mapping, w);
4951 Py_DECREF(w);
4952 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004953 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4954 /* No mapping found means: use 1:1 mapping. */
4955 PyErr_Clear();
4956 *result = NULL;
4957 return 0;
4958 } else
4959 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004960 }
4961 else if (x == Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004962 *result = x;
4963 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004964 }
4965 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004966 long value = PyInt_AS_LONG(x);
4967 long max = PyUnicode_GetMax();
4968 if (value < 0 || value > max) {
4969 PyErr_Format(PyExc_TypeError,
4970 "character mapping must be in range(0x%lx)", max+1);
4971 Py_DECREF(x);
4972 return -1;
4973 }
4974 *result = x;
4975 return 0;
4976 }
4977 else if (PyUnicode_Check(x)) {
4978 *result = x;
4979 return 0;
4980 }
4981 else {
4982 /* wrong return value */
4983 PyErr_SetString(PyExc_TypeError,
4984 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004985 Py_DECREF(x);
4986 return -1;
4987 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004988}
4989/* ensure that *outobj is at least requiredsize characters long,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004990 if not reallocate and adjust various state variables.
4991 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004992static
Walter Dörwald4894c302003-10-24 14:25:28 +00004993int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004994 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004995{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004996 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004997 if (requiredsize > oldsize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004998 /* remember old output position */
4999 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5000 /* exponentially overallocate to minimize reallocations */
5001 if (requiredsize < 2 * oldsize)
5002 requiredsize = 2 * oldsize;
5003 if (PyUnicode_Resize(outobj, requiredsize) < 0)
5004 return -1;
5005 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005006 }
5007 return 0;
5008}
5009/* lookup the character, put the result in the output string and adjust
5010 various state variables. Return a new reference to the object that
5011 was put in the output buffer in *result, or Py_None, if the mapping was
5012 undefined (in which case no character was written).
5013 The called must decref result.
5014 Return 0 on success, -1 on error. */
5015static
Walter Dörwald4894c302003-10-24 14:25:28 +00005016int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005017 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5018 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005019{
Walter Dörwald4894c302003-10-24 14:25:28 +00005020 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005021 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005022 if (*res==NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005023 /* not found => default to 1:1 mapping */
5024 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005025 }
5026 else if (*res==Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005027 ;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005028 else if (PyInt_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005029 /* no overflow check, because we know that the space is enough */
5030 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005031 }
5032 else if (PyUnicode_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005033 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5034 if (repsize==1) {
5035 /* no overflow check, because we know that the space is enough */
5036 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5037 }
5038 else if (repsize!=0) {
5039 /* more than one character */
5040 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5041 (insize - (curinp-startinp)) +
5042 repsize - 1;
5043 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5044 return -1;
5045 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5046 *outp += repsize;
5047 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005048 }
5049 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005050 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005051 return 0;
5052}
5053
5054PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005055 Py_ssize_t size,
5056 PyObject *mapping,
5057 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005058{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005059 /* output object */
5060 PyObject *res = NULL;
5061 /* pointers to the beginning and end+1 of input */
5062 const Py_UNICODE *startp = p;
5063 const Py_UNICODE *endp = p + size;
5064 /* pointer into the output */
5065 Py_UNICODE *str;
5066 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005067 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005068 char *reason = "character maps to <undefined>";
5069 PyObject *errorHandler = NULL;
5070 PyObject *exc = NULL;
5071 /* the following variable is used for caching string comparisons
5072 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5073 * 3=ignore, 4=xmlcharrefreplace */
5074 int known_errorHandler = -1;
5075
Guido van Rossumd57fd912000-03-10 22:53:23 +00005076 if (mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005077 PyErr_BadArgument();
5078 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005079 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005080
5081 /* allocate enough for a simple 1:1 translation without
5082 replacements, if we need more, we'll resize */
5083 res = PyUnicode_FromUnicode(NULL, size);
5084 if (res == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005085 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005086 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005087 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005088 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005089
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005090 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005091 /* try to encode it */
5092 PyObject *x = NULL;
5093 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5094 Py_XDECREF(x);
5095 goto onError;
5096 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005097 Py_XDECREF(x);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005098 if (x!=Py_None) /* it worked => adjust input pointer */
5099 ++p;
5100 else { /* untranslatable character */
5101 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5102 Py_ssize_t repsize;
5103 Py_ssize_t newpos;
5104 Py_UNICODE *uni2;
5105 /* startpos for collecting untranslatable chars */
5106 const Py_UNICODE *collstart = p;
5107 const Py_UNICODE *collend = p+1;
5108 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005109
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005110 /* find all untranslatable characters */
5111 while (collend < endp) {
5112 if (charmaptranslate_lookup(*collend, mapping, &x))
5113 goto onError;
5114 Py_XDECREF(x);
5115 if (x!=Py_None)
5116 break;
5117 ++collend;
5118 }
5119 /* cache callback name lookup
5120 * (if not done yet, i.e. it's the first error) */
5121 if (known_errorHandler==-1) {
5122 if ((errors==NULL) || (!strcmp(errors, "strict")))
5123 known_errorHandler = 1;
5124 else if (!strcmp(errors, "replace"))
5125 known_errorHandler = 2;
5126 else if (!strcmp(errors, "ignore"))
5127 known_errorHandler = 3;
5128 else if (!strcmp(errors, "xmlcharrefreplace"))
5129 known_errorHandler = 4;
5130 else
5131 known_errorHandler = 0;
5132 }
5133 switch (known_errorHandler) {
5134 case 1: /* strict */
5135 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson857ce152009-01-31 16:29:18 +00005136 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005137 case 2: /* replace */
5138 /* No need to check for space, this is a 1:1 replacement */
5139 for (coll = collstart; coll<collend; ++coll)
5140 *str++ = '?';
5141 /* fall through */
5142 case 3: /* ignore */
5143 p = collend;
5144 break;
5145 case 4: /* xmlcharrefreplace */
5146 /* generate replacement (temporarily (mis)uses p) */
Serhiy Storchakae822b032013-08-06 16:56:26 +03005147 for (p = collstart; p < collend;) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005148 char buffer[2+29+1+1];
5149 char *cp;
Serhiy Storchakae822b032013-08-06 16:56:26 +03005150 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
5151 sprintf(buffer, "&#%d;", (int)ch);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005152 if (charmaptranslate_makespace(&res, &str,
5153 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5154 goto onError;
5155 for (cp = buffer; *cp; ++cp)
5156 *str++ = *cp;
5157 }
5158 p = collend;
5159 break;
5160 default:
5161 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5162 reason, startp, size, &exc,
5163 collstart-startp, collend-startp, &newpos);
5164 if (repunicode == NULL)
5165 goto onError;
5166 /* generate replacement */
5167 repsize = PyUnicode_GET_SIZE(repunicode);
5168 if (charmaptranslate_makespace(&res, &str,
5169 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5170 Py_DECREF(repunicode);
5171 goto onError;
5172 }
5173 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5174 *str++ = *uni2;
5175 p = startp + newpos;
5176 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00005177 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005178 }
5179 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005180 /* Resize if we allocated to much */
5181 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005182 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005183 if (PyUnicode_Resize(&res, respos) < 0)
5184 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005185 }
5186 Py_XDECREF(exc);
5187 Py_XDECREF(errorHandler);
5188 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005189
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005190 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005191 Py_XDECREF(res);
5192 Py_XDECREF(exc);
5193 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005194 return NULL;
5195}
5196
5197PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005198 PyObject *mapping,
5199 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005200{
5201 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005202
Guido van Rossumd57fd912000-03-10 22:53:23 +00005203 str = PyUnicode_FromObject(str);
5204 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005205 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005206 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005207 PyUnicode_GET_SIZE(str),
5208 mapping,
5209 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005210 Py_DECREF(str);
5211 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005212
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005213 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005214 Py_XDECREF(str);
5215 return NULL;
5216}
Tim Petersced69f82003-09-16 20:30:58 +00005217
Guido van Rossum9e896b32000-04-05 20:11:21 +00005218/* --- Decimal Encoder ---------------------------------------------------- */
5219
5220int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005221 Py_ssize_t length,
5222 char *output,
5223 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005224{
5225 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005226 PyObject *errorHandler = NULL;
5227 PyObject *exc = NULL;
5228 const char *encoding = "decimal";
5229 const char *reason = "invalid decimal Unicode string";
5230 /* the following variable is used for caching string comparisons
5231 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5232 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005233
5234 if (output == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005235 PyErr_BadArgument();
5236 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005237 }
5238
5239 p = s;
5240 end = s + length;
5241 while (p < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005242 register Py_UNICODE ch = *p;
5243 int decimal;
5244 PyObject *repunicode;
5245 Py_ssize_t repsize;
5246 Py_ssize_t newpos;
5247 Py_UNICODE *uni2;
5248 Py_UNICODE *collstart;
5249 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005250
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005251 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005252 *output++ = ' ';
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005253 ++p;
5254 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005255 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005256 decimal = Py_UNICODE_TODECIMAL(ch);
5257 if (decimal >= 0) {
5258 *output++ = '0' + decimal;
5259 ++p;
5260 continue;
5261 }
5262 if (0 < ch && ch < 256) {
5263 *output++ = (char)ch;
5264 ++p;
5265 continue;
5266 }
5267 /* All other characters are considered unencodable */
5268 collstart = p;
Victor Stinner975134e2011-11-22 01:54:19 +01005269 for (collend = p+1; collend < end; collend++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005270 if ((0 < *collend && *collend < 256) ||
Victor Stinner975134e2011-11-22 01:54:19 +01005271 Py_UNICODE_ISSPACE(*collend) ||
5272 0 <= Py_UNICODE_TODECIMAL(*collend))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005273 break;
5274 }
5275 /* cache callback name lookup
5276 * (if not done yet, i.e. it's the first error) */
5277 if (known_errorHandler==-1) {
5278 if ((errors==NULL) || (!strcmp(errors, "strict")))
5279 known_errorHandler = 1;
5280 else if (!strcmp(errors, "replace"))
5281 known_errorHandler = 2;
5282 else if (!strcmp(errors, "ignore"))
5283 known_errorHandler = 3;
5284 else if (!strcmp(errors, "xmlcharrefreplace"))
5285 known_errorHandler = 4;
5286 else
5287 known_errorHandler = 0;
5288 }
5289 switch (known_errorHandler) {
5290 case 1: /* strict */
5291 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5292 goto onError;
5293 case 2: /* replace */
5294 for (p = collstart; p < collend; ++p)
5295 *output++ = '?';
5296 /* fall through */
5297 case 3: /* ignore */
5298 p = collend;
5299 break;
5300 case 4: /* xmlcharrefreplace */
5301 /* generate replacement (temporarily (mis)uses p) */
Serhiy Storchakae822b032013-08-06 16:56:26 +03005302 for (p = collstart; p < collend;) {
5303 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
5304 output += sprintf(output, "&#%d;", ch);
5305 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005306 p = collend;
5307 break;
5308 default:
5309 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5310 encoding, reason, s, length, &exc,
5311 collstart-s, collend-s, &newpos);
5312 if (repunicode == NULL)
5313 goto onError;
5314 /* generate replacement */
5315 repsize = PyUnicode_GET_SIZE(repunicode);
5316 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5317 Py_UNICODE ch = *uni2;
5318 if (Py_UNICODE_ISSPACE(ch))
5319 *output++ = ' ';
5320 else {
5321 decimal = Py_UNICODE_TODECIMAL(ch);
5322 if (decimal >= 0)
5323 *output++ = '0' + decimal;
5324 else if (0 < ch && ch < 256)
5325 *output++ = (char)ch;
5326 else {
5327 Py_DECREF(repunicode);
5328 raise_encode_exception(&exc, encoding,
5329 s, length, collstart-s, collend-s, reason);
5330 goto onError;
5331 }
5332 }
5333 }
5334 p = s + newpos;
5335 Py_DECREF(repunicode);
5336 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005337 }
5338 /* 0-terminate the output string */
5339 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005340 Py_XDECREF(exc);
5341 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005342 return 0;
5343
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005344 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005345 Py_XDECREF(exc);
5346 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005347 return -1;
5348}
5349
Guido van Rossumd57fd912000-03-10 22:53:23 +00005350/* --- Helpers ------------------------------------------------------------ */
5351
Eric Smitha9f7d622008-02-17 19:46:49 +00005352#include "stringlib/unicodedefs.h"
Fredrik Lundha50d2012006-05-26 17:04:58 +00005353#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005354
5355#include "stringlib/count.h"
5356#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005357#include "stringlib/partition.h"
Antoine Pitrou64672132010-01-13 07:55:48 +00005358#include "stringlib/split.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005359
Fredrik Lundhc8162812006-05-26 19:33:03 +00005360/* Helper macro to fix up start/end slice values in place.
   `end` is clamped to `len` when it is larger; a negative `end` or `start`
   has `len` added to it and is then clamped to 0 if it is still negative.
   `start` is never compared against `len` or `end`, so callers must cope
   with start > end themselves.
   The expansion is a bare if/else sequence (no do-while wrapper) and the
   arguments are used unparenthesized and more than once: pass plain
   lvalues for start/end, a side-effect-free expression for len, and do not
   use the macro as the unbraced body of an if/else.
   NOTE(review): the included stringlib headers may define this macro with
   the same replacement list -- confirm before changing any token here. */
Antoine Pitrou64672132010-01-13 07:55:48 +00005361#define ADJUST_INDICES(start, end, len) \
5362 if (end > len) \
5363 end = len; \
5364 else if (end < 0) { \
5365 end += len; \
5366 if (end < 0) \
5367 end = 0; \
5368 } \
5369 if (start < 0) { \
5370 start += len; \
5371 if (start < 0) \
5372 start = 0; \
5373 }
Fredrik Lundhc8162812006-05-26 19:33:03 +00005374
Martin v. Löwis18e16552006-02-15 17:27:45 +00005375Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005376 PyObject *substr,
5377 Py_ssize_t start,
5378 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005379{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005380 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005381 PyUnicodeObject* str_obj;
5382 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005383
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005384 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5385 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005386 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005387 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5388 if (!sub_obj) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005389 Py_DECREF(str_obj);
5390 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005391 }
Tim Petersced69f82003-09-16 20:30:58 +00005392
Antoine Pitrou64672132010-01-13 07:55:48 +00005393 ADJUST_INDICES(start, end, str_obj->length);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005394 result = stringlib_count(
Antoine Pitrou64672132010-01-13 07:55:48 +00005395 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
5396 PY_SSIZE_T_MAX
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005397 );
5398
5399 Py_DECREF(sub_obj);
5400 Py_DECREF(str_obj);
5401
Guido van Rossumd57fd912000-03-10 22:53:23 +00005402 return result;
5403}
5404
Martin v. Löwis18e16552006-02-15 17:27:45 +00005405Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005406 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005407 Py_ssize_t start,
5408 Py_ssize_t end,
5409 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005410{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005411 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005412
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005413 str = PyUnicode_FromObject(str);
5414 if (!str)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005415 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005416 sub = PyUnicode_FromObject(sub);
5417 if (!sub) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005418 Py_DECREF(str);
5419 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005420 }
Tim Petersced69f82003-09-16 20:30:58 +00005421
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005422 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005423 result = stringlib_find_slice(
5424 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5425 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5426 start, end
5427 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005428 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005429 result = stringlib_rfind_slice(
5430 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5431 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5432 start, end
5433 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005434
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005435 Py_DECREF(str);
5436 Py_DECREF(sub);
5437
Guido van Rossumd57fd912000-03-10 22:53:23 +00005438 return result;
5439}
5440
Tim Petersced69f82003-09-16 20:30:58 +00005441static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005442int tailmatch(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005443 PyUnicodeObject *substring,
5444 Py_ssize_t start,
5445 Py_ssize_t end,
5446 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005447{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005448 if (substring->length == 0)
5449 return 1;
5450
Antoine Pitrou64672132010-01-13 07:55:48 +00005451 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005452 end -= substring->length;
5453 if (end < start)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005454 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005455
5456 if (direction > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005457 if (Py_UNICODE_MATCH(self, end, substring))
5458 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005459 } else {
5460 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005461 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005462 }
5463
5464 return 0;
5465}
5466
Martin v. Löwis18e16552006-02-15 17:27:45 +00005467Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005468 PyObject *substr,
5469 Py_ssize_t start,
5470 Py_ssize_t end,
5471 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005472{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005473 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005474
Guido van Rossumd57fd912000-03-10 22:53:23 +00005475 str = PyUnicode_FromObject(str);
5476 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005477 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005478 substr = PyUnicode_FromObject(substr);
5479 if (substr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005480 Py_DECREF(str);
5481 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005482 }
Tim Petersced69f82003-09-16 20:30:58 +00005483
Guido van Rossumd57fd912000-03-10 22:53:23 +00005484 result = tailmatch((PyUnicodeObject *)str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005485 (PyUnicodeObject *)substr,
5486 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005487 Py_DECREF(str);
5488 Py_DECREF(substr);
5489 return result;
5490}
5491
Guido van Rossumd57fd912000-03-10 22:53:23 +00005492/* Apply fixfct filter to the Unicode object self and return a
5493 reference to the modified object */
5494
Tim Petersced69f82003-09-16 20:30:58 +00005495static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005496PyObject *fixup(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005497 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005498{
5499
5500 PyUnicodeObject *u;
5501
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005502 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005503 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005504 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005505
5506 Py_UNICODE_COPY(u->str, self->str, self->length);
5507
Tim Peters7a29bd52001-09-12 03:03:31 +00005508 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005509 /* fixfct should return TRUE if it modified the buffer. If
5510 FALSE, return a reference to the original buffer instead
5511 (to save space, not time) */
5512 Py_INCREF(self);
5513 Py_DECREF(u);
5514 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005515 }
5516 return (PyObject*) u;
5517}
5518
Tim Petersced69f82003-09-16 20:30:58 +00005519static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005520int fixupper(PyUnicodeObject *self)
5521{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005522 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005523 Py_UNICODE *s = self->str;
5524 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005525
Guido van Rossumd57fd912000-03-10 22:53:23 +00005526 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005527 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005528
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005529 ch = Py_UNICODE_TOUPPER(*s);
5530 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005531 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005532 *s = ch;
5533 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005534 s++;
5535 }
5536
5537 return status;
5538}
5539
Tim Petersced69f82003-09-16 20:30:58 +00005540static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005541int fixlower(PyUnicodeObject *self)
5542{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005543 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005544 Py_UNICODE *s = self->str;
5545 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005546
Guido van Rossumd57fd912000-03-10 22:53:23 +00005547 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005548 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005549
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005550 ch = Py_UNICODE_TOLOWER(*s);
5551 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005552 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005553 *s = ch;
5554 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005555 s++;
5556 }
5557
5558 return status;
5559}
5560
Tim Petersced69f82003-09-16 20:30:58 +00005561static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005562int fixswapcase(PyUnicodeObject *self)
5563{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005564 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005565 Py_UNICODE *s = self->str;
5566 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005567
Guido van Rossumd57fd912000-03-10 22:53:23 +00005568 while (len-- > 0) {
5569 if (Py_UNICODE_ISUPPER(*s)) {
5570 *s = Py_UNICODE_TOLOWER(*s);
5571 status = 1;
5572 } else if (Py_UNICODE_ISLOWER(*s)) {
5573 *s = Py_UNICODE_TOUPPER(*s);
5574 status = 1;
5575 }
5576 s++;
5577 }
5578
5579 return status;
5580}
5581
Tim Petersced69f82003-09-16 20:30:58 +00005582static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005583int fixcapitalize(PyUnicodeObject *self)
5584{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005585 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005586 Py_UNICODE *s = self->str;
5587 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005588
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005589 if (len == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005590 return 0;
Ezio Melotti15d6b652011-08-15 09:22:24 +03005591 if (!Py_UNICODE_ISUPPER(*s)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005592 *s = Py_UNICODE_TOUPPER(*s);
5593 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005594 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005595 s++;
5596 while (--len > 0) {
Ezio Melotti15d6b652011-08-15 09:22:24 +03005597 if (!Py_UNICODE_ISLOWER(*s)) {
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005598 *s = Py_UNICODE_TOLOWER(*s);
5599 status = 1;
5600 }
5601 s++;
5602 }
5603 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005604}
5605
5606static
5607int fixtitle(PyUnicodeObject *self)
5608{
5609 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5610 register Py_UNICODE *e;
5611 int previous_is_cased;
5612
5613 /* Shortcut for single character strings */
5614 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005615 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5616 if (*p != ch) {
5617 *p = ch;
5618 return 1;
5619 }
5620 else
5621 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005622 }
Tim Petersced69f82003-09-16 20:30:58 +00005623
Guido van Rossumd57fd912000-03-10 22:53:23 +00005624 e = p + PyUnicode_GET_SIZE(self);
5625 previous_is_cased = 0;
5626 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005627 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005628
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005629 if (previous_is_cased)
5630 *p = Py_UNICODE_TOLOWER(ch);
5631 else
5632 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005633
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005634 if (Py_UNICODE_ISLOWER(ch) ||
5635 Py_UNICODE_ISUPPER(ch) ||
5636 Py_UNICODE_ISTITLE(ch))
5637 previous_is_cased = 1;
5638 else
5639 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005640 }
5641 return 1;
5642}
5643
Tim Peters8ce9f162004-08-27 01:49:32 +00005644PyObject *
5645PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005646{
Tim Peters8ce9f162004-08-27 01:49:32 +00005647 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005648 const Py_UNICODE blank = ' ';
5649 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00005650 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005651 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00005652 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5653 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005654 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5655 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005656 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005657 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005658 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005659
Benjamin Peterson7a91bf82014-02-15 13:02:52 -05005660 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00005661 if (fseq == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005662 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005663 }
5664
Tim Peters91879ab2004-08-27 22:35:44 +00005665 /* Grrrr. A codec may be invoked to convert str objects to
5666 * Unicode, and so it's possible to call back into Python code
5667 * during PyUnicode_FromObject(), and so it's possible for a sick
5668 * codec to change the size of fseq (if seq is a list). Therefore
5669 * we have to keep refetching the size -- can't assume seqlen
5670 * is invariant.
5671 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005672 seqlen = PySequence_Fast_GET_SIZE(fseq);
5673 /* If empty sequence, return u"". */
5674 if (seqlen == 0) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005675 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5676 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00005677 }
5678 /* If singleton sequence with an exact Unicode, return that. */
5679 if (seqlen == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005680 item = PySequence_Fast_GET_ITEM(fseq, 0);
5681 if (PyUnicode_CheckExact(item)) {
5682 Py_INCREF(item);
5683 res = (PyUnicodeObject *)item;
5684 goto Done;
5685 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005686 }
5687
Tim Peters05eba1f2004-08-27 21:32:02 +00005688 /* At least two items to join, or one that isn't exact Unicode. */
5689 if (seqlen > 1) {
5690 /* Set up sep and seplen -- they're needed. */
Benjamin Peterson857ce152009-01-31 16:29:18 +00005691 if (separator == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005692 sep = &blank;
5693 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005694 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005695 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005696 internal_separator = PyUnicode_FromObject(separator);
5697 if (internal_separator == NULL)
5698 goto onError;
5699 sep = PyUnicode_AS_UNICODE(internal_separator);
5700 seplen = PyUnicode_GET_SIZE(internal_separator);
5701 /* In case PyUnicode_FromObject() mutated seq. */
5702 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005703 }
5704 }
5705
5706 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005707 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005708 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005709 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005710 res_p = PyUnicode_AS_UNICODE(res);
5711 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005712
Tim Peters05eba1f2004-08-27 21:32:02 +00005713 for (i = 0; i < seqlen; ++i) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005714 Py_ssize_t itemlen;
5715 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005716
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005717 item = PySequence_Fast_GET_ITEM(fseq, i);
5718 /* Convert item to Unicode. */
5719 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5720 PyErr_Format(PyExc_TypeError,
5721 "sequence item %zd: expected string or Unicode,"
5722 " %.80s found",
5723 i, Py_TYPE(item)->tp_name);
5724 goto onError;
5725 }
5726 item = PyUnicode_FromObject(item);
5727 if (item == NULL)
5728 goto onError;
5729 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005730
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005731 /* In case PyUnicode_FromObject() mutated seq. */
5732 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters91879ab2004-08-27 22:35:44 +00005733
Tim Peters8ce9f162004-08-27 01:49:32 +00005734 /* Make sure we have enough space for the separator and the item. */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005735 itemlen = PyUnicode_GET_SIZE(item);
5736 new_res_used = res_used + itemlen;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005737 if (new_res_used < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005738 goto Overflow;
5739 if (i < seqlen - 1) {
5740 new_res_used += seplen;
5741 if (new_res_used < 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00005742 goto Overflow;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005743 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005744 if (new_res_used > res_alloc) {
5745 /* double allocated size until it's big enough */
5746 do {
5747 res_alloc += res_alloc;
5748 if (res_alloc <= 0)
5749 goto Overflow;
5750 } while (new_res_used > res_alloc);
5751 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5752 Py_DECREF(item);
5753 goto onError;
5754 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005755 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005756 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005757
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005758 /* Copy item, and maybe the separator. */
5759 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5760 res_p += itemlen;
5761 if (i < seqlen - 1) {
5762 Py_UNICODE_COPY(res_p, sep, seplen);
5763 res_p += seplen;
5764 }
5765 Py_DECREF(item);
5766 res_used = new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005767 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005768
Tim Peters05eba1f2004-08-27 21:32:02 +00005769 /* Shrink res to match the used area; this probably can't fail,
5770 * but it's cheap to check.
5771 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005772 if (_PyUnicode_Resize(&res, res_used) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005773 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005774
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005775 Done:
Tim Peters8ce9f162004-08-27 01:49:32 +00005776 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005777 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005778 return (PyObject *)res;
5779
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005780 Overflow:
Tim Peters8ce9f162004-08-27 01:49:32 +00005781 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00005782 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005783 Py_DECREF(item);
5784 /* fall through */
5785
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005786 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005787 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005788 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005789 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005790 return NULL;
5791}
5792
Tim Petersced69f82003-09-16 20:30:58 +00005793static
5794PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005795 Py_ssize_t left,
5796 Py_ssize_t right,
5797 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005798{
5799 PyUnicodeObject *u;
5800
5801 if (left < 0)
5802 left = 0;
5803 if (right < 0)
5804 right = 0;
5805
Tim Peters7a29bd52001-09-12 03:03:31 +00005806 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005807 Py_INCREF(self);
5808 return self;
5809 }
5810
Neal Norwitze7d8be82008-07-31 17:17:14 +00005811 if (left > PY_SSIZE_T_MAX - self->length ||
5812 right > PY_SSIZE_T_MAX - (left + self->length)) {
5813 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5814 return NULL;
5815 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005816 u = _PyUnicode_New(left + self->length + right);
5817 if (u) {
5818 if (left)
5819 Py_UNICODE_FILL(u->str, fill, left);
5820 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5821 if (right)
5822 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5823 }
5824
5825 return u;
5826}
5827
Antoine Pitrou64672132010-01-13 07:55:48 +00005828PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005829{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005830 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005831
5832 string = PyUnicode_FromObject(string);
5833 if (string == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005834 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005835
Antoine Pitrou64672132010-01-13 07:55:48 +00005836 list = stringlib_splitlines(
5837 (PyObject*) string, PyUnicode_AS_UNICODE(string),
5838 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005839
5840 Py_DECREF(string);
5841 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005842}
5843
Tim Petersced69f82003-09-16 20:30:58 +00005844static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005845PyObject *split(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005846 PyUnicodeObject *substring,
5847 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005848{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005849 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005850 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005851
Guido van Rossumd57fd912000-03-10 22:53:23 +00005852 if (substring == NULL)
Antoine Pitrou64672132010-01-13 07:55:48 +00005853 return stringlib_split_whitespace(
5854 (PyObject*) self, self->str, self->length, maxcount
5855 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005856
Antoine Pitrou64672132010-01-13 07:55:48 +00005857 return stringlib_split(
5858 (PyObject*) self, self->str, self->length,
5859 substring->str, substring->length,
5860 maxcount
5861 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005862}
5863
Tim Petersced69f82003-09-16 20:30:58 +00005864static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005865PyObject *rsplit(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005866 PyUnicodeObject *substring,
5867 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005868{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005869 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005870 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005871
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005872 if (substring == NULL)
Antoine Pitrou64672132010-01-13 07:55:48 +00005873 return stringlib_rsplit_whitespace(
5874 (PyObject*) self, self->str, self->length, maxcount
5875 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005876
Antoine Pitrou64672132010-01-13 07:55:48 +00005877 return stringlib_rsplit(
5878 (PyObject*) self, self->str, self->length,
5879 substring->str, substring->length,
5880 maxcount
5881 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005882}
5883
5884static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005885PyObject *replace(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005886 PyUnicodeObject *str1,
5887 PyUnicodeObject *str2,
5888 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005889{
5890 PyUnicodeObject *u;
5891
5892 if (maxcount < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005893 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrou64672132010-01-13 07:55:48 +00005894 else if (maxcount == 0 || self->length == 0)
5895 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005896
Fredrik Lundh347ee272006-05-24 16:35:18 +00005897 if (str1->length == str2->length) {
Antoine Pitrou5c767c22010-01-13 08:55:20 +00005898 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005899 /* same length */
Antoine Pitrou64672132010-01-13 07:55:48 +00005900 if (str1->length == 0)
5901 goto nothing;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005902 if (str1->length == 1) {
5903 /* replace characters */
5904 Py_UNICODE u1, u2;
5905 if (!findchar(self->str, self->length, str1->str[0]))
5906 goto nothing;
5907 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5908 if (!u)
5909 return NULL;
5910 Py_UNICODE_COPY(u->str, self->str, self->length);
5911 u1 = str1->str[0];
5912 u2 = str2->str[0];
5913 for (i = 0; i < u->length; i++)
5914 if (u->str[i] == u1) {
5915 if (--maxcount < 0)
5916 break;
5917 u->str[i] = u2;
5918 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005919 } else {
Antoine Pitrou64672132010-01-13 07:55:48 +00005920 i = stringlib_find(
5921 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00005922 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005923 if (i < 0)
5924 goto nothing;
5925 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5926 if (!u)
5927 return NULL;
5928 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrou64672132010-01-13 07:55:48 +00005929
5930 /* change everything in-place, starting with this one */
5931 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5932 i += str1->length;
5933
5934 while ( --maxcount > 0) {
5935 i = stringlib_find(self->str+i, self->length-i,
5936 str1->str, str1->length,
5937 i);
5938 if (i == -1)
5939 break;
5940 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5941 i += str1->length;
5942 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005943 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005944 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005945
Brett Cannona7f13ee2010-05-04 01:16:51 +00005946 Py_ssize_t n, i, j;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005947 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005948 Py_UNICODE *p;
5949
5950 /* replace strings */
Antoine Pitrou64672132010-01-13 07:55:48 +00005951 n = stringlib_count(self->str, self->length, str1->str, str1->length,
5952 maxcount);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005953 if (n == 0)
5954 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005955 /* new_size = self->length + n * (str2->length - str1->length)); */
5956 delta = (str2->length - str1->length);
5957 if (delta == 0) {
5958 new_size = self->length;
5959 } else {
5960 product = n * (str2->length - str1->length);
5961 if ((product / (str2->length - str1->length)) != n) {
5962 PyErr_SetString(PyExc_OverflowError,
5963 "replace string is too long");
5964 return NULL;
5965 }
5966 new_size = self->length + product;
5967 if (new_size < 0) {
5968 PyErr_SetString(PyExc_OverflowError,
5969 "replace string is too long");
5970 return NULL;
5971 }
5972 }
5973 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005974 if (!u)
5975 return NULL;
5976 i = 0;
5977 p = u->str;
5978 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005979 while (n-- > 0) {
5980 /* look for next match */
Antoine Pitrou64672132010-01-13 07:55:48 +00005981 j = stringlib_find(self->str+i, self->length-i,
5982 str1->str, str1->length,
5983 i);
5984 if (j == -1)
5985 break;
5986 else if (j > i) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005987 /* copy unchanged part [i:j] */
5988 Py_UNICODE_COPY(p, self->str+i, j-i);
5989 p += j - i;
5990 }
5991 /* copy substitution string */
5992 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005993 Py_UNICODE_COPY(p, str2->str, str2->length);
5994 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005995 }
5996 i = j + str1->length;
5997 }
5998 if (i < self->length)
5999 /* copy tail [i:] */
6000 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00006001 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006002 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00006003 while (n > 0) {
6004 Py_UNICODE_COPY(p, str2->str, str2->length);
6005 p += str2->length;
6006 if (--n <= 0)
6007 break;
6008 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006009 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00006010 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006011 }
6012 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006013 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00006014
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006015 nothing:
Fredrik Lundh347ee272006-05-24 16:35:18 +00006016 /* nothing to replace; return original string (when possible) */
6017 if (PyUnicode_CheckExact(self)) {
6018 Py_INCREF(self);
6019 return (PyObject *) self;
6020 }
6021 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006022}
6023
6024/* --- Unicode Object Methods --------------------------------------------- */
6025
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006026PyDoc_STRVAR(title__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006027 "S.title() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028\n\
6029Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006030characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006031
6032static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006033unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006034{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006035 return fixup(self, fixtitle);
6036}
6037
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006038PyDoc_STRVAR(capitalize__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006039 "S.capitalize() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006040\n\
6041Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumaran5261b102010-07-05 12:04:07 +00006042have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006043
6044static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006045unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006046{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006047 return fixup(self, fixcapitalize);
6048}
6049
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out): split on whitespace, capitalize every word in
   the resulting list, then join the words back together. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entry in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized = fixup(
            (PyUnicodeObject *)PyList_GET_ITEM(words, idx),
            fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
6087
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006088/* Argument converter. Coerces to a single unicode character */
6089
6090static int
6091convert_uc(PyObject *obj, void *addr)
6092{
Benjamin Peterson857ce152009-01-31 16:29:18 +00006093 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6094 PyObject *uniobj;
6095 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006096
Benjamin Peterson857ce152009-01-31 16:29:18 +00006097 uniobj = PyUnicode_FromObject(obj);
6098 if (uniobj == NULL) {
6099 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006100 "The fill character cannot be converted to Unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00006101 return 0;
6102 }
6103 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6104 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006105 "The fill character must be exactly one character long");
Benjamin Peterson857ce152009-01-31 16:29:18 +00006106 Py_DECREF(uniobj);
6107 return 0;
6108 }
6109 unistr = PyUnicode_AS_UNICODE(uniobj);
6110 *fillcharloc = unistr[0];
6111 Py_DECREF(uniobj);
6112 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006113}
6114
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006115PyDoc_STRVAR(center__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006116 "S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006117\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006118Return S centered in a Unicode string of length width. Padding is\n\
6119done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006120
6121static PyObject *
6122unicode_center(PyUnicodeObject *self, PyObject *args)
6123{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006124 Py_ssize_t marg, left;
6125 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006126 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006127
Thomas Woutersde017742006-02-16 19:34:37 +00006128 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006129 return NULL;
6130
Tim Peters7a29bd52001-09-12 03:03:31 +00006131 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006132 Py_INCREF(self);
6133 return (PyObject*) self;
6134 }
6135
6136 marg = width - self->length;
6137 left = marg / 2 + (marg & width & 1);
6138
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006139 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006140}
6141
Marc-André Lemburge5034372000-08-08 08:04:29 +00006142#if 0
6143
6144/* This code should go into some future Unicode collation support
6145 module. The basic comparison should compare ordinals on a naive
Georg Brandl18187e22009-06-06 18:21:58 +00006146 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006147
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006148/* speedy UTF-16 code point order comparison */
6149/* gleaned from: */
6150/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6151
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006152static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006153{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006154 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006155 0, 0, 0, 0, 0, 0, 0, 0,
6156 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006157 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006158};
6159
Guido van Rossumd57fd912000-03-10 22:53:23 +00006160static int
6161unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6162{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006163 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006164
Guido van Rossumd57fd912000-03-10 22:53:23 +00006165 Py_UNICODE *s1 = str1->str;
6166 Py_UNICODE *s2 = str2->str;
6167
6168 len1 = str1->length;
6169 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006170
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006172 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006173
6174 c1 = *s1++;
6175 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006176
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006177 if (c1 > (1<<11) * 26)
6178 c1 += utf16Fixup[c1>>11];
6179 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006180 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006181 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006182
6183 if (c1 != c2)
6184 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006185
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006186 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006187 }
6188
6189 return (len1 < len2) ? -1 : (len1 != len2);
6190}
6191
Marc-André Lemburge5034372000-08-08 08:04:29 +00006192#else
6193
6194static int
6195unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6196{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006197 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006198
6199 Py_UNICODE *s1 = str1->str;
6200 Py_UNICODE *s2 = str2->str;
6201
6202 len1 = str1->length;
6203 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006204
Marc-André Lemburge5034372000-08-08 08:04:29 +00006205 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006206 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006207
Fredrik Lundh45714e92001-06-26 16:39:36 +00006208 c1 = *s1++;
6209 c2 = *s2++;
6210
6211 if (c1 != c2)
6212 return (c1 < c2) ? -1 : 1;
6213
Marc-André Lemburge5034372000-08-08 08:04:29 +00006214 len1--; len2--;
6215 }
6216
6217 return (len1 < len2) ? -1 : (len1 != len2);
6218}
6219
6220#endif
6221
Guido van Rossumd57fd912000-03-10 22:53:23 +00006222int PyUnicode_Compare(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006223 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006224{
6225 PyUnicodeObject *u = NULL, *v = NULL;
6226 int result;
6227
6228 /* Coerce the two arguments */
6229 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6230 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006231 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006232 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6233 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006234 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006235
Thomas Wouters7e474022000-07-16 12:04:32 +00006236 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006237 if (v == u) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006238 Py_DECREF(u);
6239 Py_DECREF(v);
6240 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006241 }
6242
6243 result = unicode_compare(u, v);
6244
6245 Py_DECREF(u);
6246 Py_DECREF(v);
6247 return result;
6248
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006249 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006250 Py_XDECREF(u);
6251 Py_XDECREF(v);
6252 return -1;
6253}
6254
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006255PyObject *PyUnicode_RichCompare(PyObject *left,
6256 PyObject *right,
6257 int op)
6258{
6259 int result;
6260
6261 result = PyUnicode_Compare(left, right);
6262 if (result == -1 && PyErr_Occurred())
6263 goto onError;
6264
6265 /* Convert the return value to a Boolean */
6266 switch (op) {
6267 case Py_EQ:
6268 result = (result == 0);
6269 break;
6270 case Py_NE:
6271 result = (result != 0);
6272 break;
6273 case Py_LE:
6274 result = (result <= 0);
6275 break;
6276 case Py_GE:
6277 result = (result >= 0);
6278 break;
6279 case Py_LT:
6280 result = (result == -1);
6281 break;
6282 case Py_GT:
6283 result = (result == 1);
6284 break;
6285 }
6286 return PyBool_FromLong(result);
6287
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006288 onError:
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006289
6290 /* Standard case
6291
6292 Type errors mean that PyUnicode_FromObject() could not convert
6293 one of the arguments (usually the right hand side) to Unicode,
6294 ie. we can't handle the comparison request. However, it is
6295 possible that the other object knows a comparison method, which
6296 is why we return Py_NotImplemented to give the other object a
6297 chance.
6298
6299 */
6300 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6301 PyErr_Clear();
6302 Py_INCREF(Py_NotImplemented);
6303 return Py_NotImplemented;
6304 }
6305 if (op != Py_EQ && op != Py_NE)
6306 return NULL;
6307
6308 /* Equality comparison.
6309
6310 This is a special case: we silence any PyExc_UnicodeDecodeError
6311 and instead turn it into a PyErr_UnicodeWarning.
6312
6313 */
6314 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6315 return NULL;
6316 PyErr_Clear();
Benjamin Peterson857ce152009-01-31 16:29:18 +00006317 if (PyErr_Warn(PyExc_UnicodeWarning,
6318 (op == Py_EQ) ?
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006319 "Unicode equal comparison "
6320 "failed to convert both arguments to Unicode - "
6321 "interpreting them as being unequal" :
6322 "Unicode unequal comparison "
6323 "failed to convert both arguments to Unicode - "
6324 "interpreting them as being unequal"
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006325 ) < 0)
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006326 return NULL;
6327 result = (op == Py_NE);
6328 return PyBool_FromLong(result);
6329}
6330
Guido van Rossum403d68b2000-03-13 15:55:09 +00006331int PyUnicode_Contains(PyObject *container,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006332 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006333{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006334 PyObject *str, *sub;
6335 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006336
6337 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006338 sub = PyUnicode_FromObject(element);
6339 if (!sub) {
Fredrik Lundh833bf942006-05-23 10:12:21 +00006340 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006341 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00006342
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006343 str = PyUnicode_FromObject(container);
6344 if (!str) {
6345 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00006346 return -1;
6347 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00006348
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006349 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00006350
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006351 Py_DECREF(str);
6352 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00006353
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006354 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006355}
6356
Guido van Rossumd57fd912000-03-10 22:53:23 +00006357/* Concat to string or Unicode object giving a new Unicode object. */
6358
6359PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006360 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006361{
6362 PyUnicodeObject *u = NULL, *v = NULL, *w;
6363
6364 /* Coerce the two arguments */
6365 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6366 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006367 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006368 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6369 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006370 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006371
6372 /* Shortcuts */
6373 if (v == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006374 Py_DECREF(v);
6375 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006376 }
6377 if (u == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006378 Py_DECREF(u);
6379 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006380 }
6381
6382 /* Concat the two Unicode strings */
6383 w = _PyUnicode_New(u->length + v->length);
6384 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006385 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006386 Py_UNICODE_COPY(w->str, u->str, u->length);
6387 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6388
6389 Py_DECREF(u);
6390 Py_DECREF(v);
6391 return (PyObject *)w;
6392
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006393 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006394 Py_XDECREF(u);
6395 Py_XDECREF(v);
6396 return NULL;
6397}
6398
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006399PyDoc_STRVAR(count__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006400 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006401\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00006402Return the number of non-overlapping occurrences of substring sub in\n\
6403Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006404interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006405
6406static PyObject *
6407unicode_count(PyUnicodeObject *self, PyObject *args)
6408{
6409 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006410 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006411 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006412 PyObject *result;
6413
Jesus Cea44e81682011-04-20 16:39:15 +02006414 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
6415 &start, &end))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006416 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006417
Antoine Pitrou64672132010-01-13 07:55:48 +00006418 ADJUST_INDICES(start, end, self->length);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006419 result = PyInt_FromSsize_t(
6420 stringlib_count(self->str + start, end - start,
Antoine Pitrou64672132010-01-13 07:55:48 +00006421 substring->str, substring->length,
6422 PY_SSIZE_T_MAX)
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006423 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006424
6425 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006426
Guido van Rossumd57fd912000-03-10 22:53:23 +00006427 return result;
6428}
6429
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006430PyDoc_STRVAR(encode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006431 "S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006432\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006433Encodes S using the codec registered for encoding. encoding defaults\n\
6434to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006435handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006436a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6437'xmlcharrefreplace' as well as any other name registered with\n\
6438codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006439
6440static PyObject *
Benjamin Peterson332d7212009-09-18 21:14:55 +00006441unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006442{
Benjamin Peterson332d7212009-09-18 21:14:55 +00006443 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00006444 char *encoding = NULL;
6445 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006446 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006447
Benjamin Peterson332d7212009-09-18 21:14:55 +00006448 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
6449 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006450 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006451 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006452 if (v == NULL)
6453 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006454 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006455 PyErr_Format(PyExc_TypeError,
6456 "encoder did not return a string/unicode object "
6457 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006458 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006459 Py_DECREF(v);
6460 return NULL;
6461 }
6462 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006463
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006464 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006465 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006466}
6467
6468PyDoc_STRVAR(decode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006469 "S.decode([encoding[,errors]]) -> string or unicode\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006470\n\
6471Decodes S using the codec registered for encoding. encoding defaults\n\
6472to the default encoding. errors may be given to set a different error\n\
6473handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6474a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
Éric Araujoa4c81b02012-02-20 02:07:31 +01006475as well as any other name registered with codecs.register_error that is\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006476able to handle UnicodeDecodeErrors.");
6477
6478static PyObject *
Benjamin Peterson332d7212009-09-18 21:14:55 +00006479unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006480{
Benjamin Peterson332d7212009-09-18 21:14:55 +00006481 static char *kwlist[] = {"encoding", "errors", 0};
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006482 char *encoding = NULL;
6483 char *errors = NULL;
6484 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006485
Benjamin Peterson332d7212009-09-18 21:14:55 +00006486 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",
6487 kwlist, &encoding, &errors))
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006488 return NULL;
6489 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006490 if (v == NULL)
6491 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006492 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006493 PyErr_Format(PyExc_TypeError,
6494 "decoder did not return a string/unicode object "
6495 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006496 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006497 Py_DECREF(v);
6498 return NULL;
6499 }
6500 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006501
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006502 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006503 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006504}
6505
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006506PyDoc_STRVAR(expandtabs__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006507 "S.expandtabs([tabsize]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006508\n\
6509Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006510If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006511
6512static PyObject*
6513unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6514{
6515 Py_UNICODE *e;
6516 Py_UNICODE *p;
6517 Py_UNICODE *q;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006518 Py_UNICODE *qe;
6519 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006520 PyUnicodeObject *u;
6521 int tabsize = 8;
6522
6523 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006524 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006525
Thomas Wouters7e474022000-07-16 12:04:32 +00006526 /* First pass: determine size of output string */
Guido van Rossum5bdff602008-03-11 21:18:06 +00006527 i = 0; /* chars up to and including most recent \n or \r */
6528 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6529 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006530 for (p = self->str; p < e; p++)
6531 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006532 if (tabsize > 0) {
6533 incr = tabsize - (j % tabsize); /* cannot overflow */
6534 if (j > PY_SSIZE_T_MAX - incr)
6535 goto overflow1;
6536 j += incr;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006537 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006538 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006539 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006540 if (j > PY_SSIZE_T_MAX - 1)
6541 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006542 j++;
6543 if (*p == '\n' || *p == '\r') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006544 if (i > PY_SSIZE_T_MAX - j)
6545 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006546 i += j;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006547 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006548 }
6549 }
6550
Guido van Rossum5bdff602008-03-11 21:18:06 +00006551 if (i > PY_SSIZE_T_MAX - j)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006552 goto overflow1;
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006553
Guido van Rossumd57fd912000-03-10 22:53:23 +00006554 /* Second pass: create output string and fill it */
6555 u = _PyUnicode_New(i + j);
6556 if (!u)
6557 return NULL;
6558
Guido van Rossum5bdff602008-03-11 21:18:06 +00006559 j = 0; /* same as in first pass */
6560 q = u->str; /* next output char */
6561 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006562
6563 for (p = self->str; p < e; p++)
6564 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006565 if (tabsize > 0) {
6566 i = tabsize - (j % tabsize);
6567 j += i;
6568 while (i--) {
6569 if (q >= qe)
6570 goto overflow2;
6571 *q++ = ' ';
Guido van Rossum5bdff602008-03-11 21:18:06 +00006572 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006573 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00006574 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006575 else {
6576 if (q >= qe)
6577 goto overflow2;
6578 *q++ = *p;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006579 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006580 if (*p == '\n' || *p == '\r')
6581 j = 0;
6582 }
6583
6584 return (PyObject*) u;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006585
6586 overflow2:
6587 Py_DECREF(u);
6588 overflow1:
6589 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6590 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006591}
6592
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006593PyDoc_STRVAR(find__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006594 "S.find(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006595\n\
6596Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran5e3a19d2011-07-27 23:36:51 +08006597such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006598arguments start and end are interpreted as in slice notation.\n\
6599\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006600Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006601
6602static PyObject *
6603unicode_find(PyUnicodeObject *self, PyObject *args)
6604{
Jesus Cea44e81682011-04-20 16:39:15 +02006605 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006606 Py_ssize_t start;
6607 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006608 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006609
Jesus Cea44e81682011-04-20 16:39:15 +02006610 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
6611 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006613
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006614 result = stringlib_find_slice(
6615 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6616 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6617 start, end
6618 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006619
6620 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006621
6622 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006623}
6624
6625static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006626unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006627{
6628 if (index < 0 || index >= self->length) {
6629 PyErr_SetString(PyExc_IndexError, "string index out of range");
6630 return NULL;
6631 }
6632
6633 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6634}
6635
6636static long
6637unicode_hash(PyUnicodeObject *self)
6638{
Fredrik Lundhdde61642000-07-10 18:27:47 +00006639 /* Since Unicode objects compare equal to their ASCII string
6640 counterparts, they should use the individual character values
6641 as basis for their hash value. This is needed to assure that
6642 strings and Unicode objects behave in the same way as
6643 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006644
Martin v. Löwis18e16552006-02-15 17:27:45 +00006645 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006646 register Py_UNICODE *p;
6647 register long x;
6648
Benjamin Petersonf51c3842012-04-09 14:53:07 -04006649#ifdef Py_DEBUG
Benjamin Peterson26da9202012-02-21 11:08:50 -05006650 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf51c3842012-04-09 14:53:07 -04006651#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006652 if (self->hash != -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006653 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006654 len = PyUnicode_GET_SIZE(self);
Barry Warsaw1e13eb02012-02-20 20:42:21 -05006655 /*
6656 We make the hash of the empty string be 0, rather than using
6657 (prefix ^ suffix), since this slightly obfuscates the hash secret
6658 */
6659 if (len == 0) {
6660 self->hash = 0;
6661 return 0;
6662 }
Fredrik Lundhdde61642000-07-10 18:27:47 +00006663 p = PyUnicode_AS_UNICODE(self);
Barry Warsaw1e13eb02012-02-20 20:42:21 -05006664 x = _Py_HashSecret.prefix;
6665 x ^= *p << 7;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006666 while (--len >= 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006667 x = (1000003*x) ^ *p++;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006668 x ^= PyUnicode_GET_SIZE(self);
Barry Warsaw1e13eb02012-02-20 20:42:21 -05006669 x ^= _Py_HashSecret.suffix;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006670 if (x == -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006671 x = -2;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006672 self->hash = x;
6673 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006674}
6675
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006676PyDoc_STRVAR(index__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006677 "S.index(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006678\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006679Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006680
6681static PyObject *
6682unicode_index(PyUnicodeObject *self, PyObject *args)
6683{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006684 Py_ssize_t result;
Jesus Cea44e81682011-04-20 16:39:15 +02006685 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006686 Py_ssize_t start;
6687 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006688
Jesus Cea44e81682011-04-20 16:39:15 +02006689 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
6690 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006691 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006692
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006693 result = stringlib_find_slice(
6694 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6695 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6696 start, end
6697 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006698
6699 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006700
Guido van Rossumd57fd912000-03-10 22:53:23 +00006701 if (result < 0) {
6702 PyErr_SetString(PyExc_ValueError, "substring not found");
6703 return NULL;
6704 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006705
Martin v. Löwis18e16552006-02-15 17:27:45 +00006706 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006707}
6708
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006709PyDoc_STRVAR(islower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006710 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006711\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006712Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006713at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006714
6715static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006716unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006717{
6718 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6719 register const Py_UNICODE *e;
6720 int cased;
6721
Guido van Rossumd57fd912000-03-10 22:53:23 +00006722 /* Shortcut for single character strings */
6723 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006724 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006725
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006726 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006727 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006728 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006729
Guido van Rossumd57fd912000-03-10 22:53:23 +00006730 e = p + PyUnicode_GET_SIZE(self);
6731 cased = 0;
6732 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006733 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006734
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006735 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6736 return PyBool_FromLong(0);
6737 else if (!cased && Py_UNICODE_ISLOWER(ch))
6738 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006739 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006740 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006741}
6742
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006743PyDoc_STRVAR(isupper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006744 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006745\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006746Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006747at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006748
6749static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006750unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006751{
6752 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6753 register const Py_UNICODE *e;
6754 int cased;
6755
Guido van Rossumd57fd912000-03-10 22:53:23 +00006756 /* Shortcut for single character strings */
6757 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006758 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006759
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006760 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006761 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006762 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006763
Guido van Rossumd57fd912000-03-10 22:53:23 +00006764 e = p + PyUnicode_GET_SIZE(self);
6765 cased = 0;
6766 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006767 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006768
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006769 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6770 return PyBool_FromLong(0);
6771 else if (!cased && Py_UNICODE_ISUPPER(ch))
6772 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006773 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006774 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006775}
6776
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006777PyDoc_STRVAR(istitle__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006778 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006779\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006780Return True if S is a titlecased string and there is at least one\n\
6781character in S, i.e. upper- and titlecase characters may only\n\
6782follow uncased characters and lowercase characters only cased ones.\n\
6783Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006784
6785static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006786unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006787{
6788 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6789 register const Py_UNICODE *e;
6790 int cased, previous_is_cased;
6791
Guido van Rossumd57fd912000-03-10 22:53:23 +00006792 /* Shortcut for single character strings */
6793 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006794 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6795 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006796
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006797 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006798 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006799 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006800
Guido van Rossumd57fd912000-03-10 22:53:23 +00006801 e = p + PyUnicode_GET_SIZE(self);
6802 cased = 0;
6803 previous_is_cased = 0;
6804 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006805 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006806
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006807 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6808 if (previous_is_cased)
6809 return PyBool_FromLong(0);
6810 previous_is_cased = 1;
6811 cased = 1;
6812 }
6813 else if (Py_UNICODE_ISLOWER(ch)) {
6814 if (!previous_is_cased)
6815 return PyBool_FromLong(0);
6816 previous_is_cased = 1;
6817 cased = 1;
6818 }
6819 else
6820 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006821 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006822 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006823}
6824
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006825PyDoc_STRVAR(isspace__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006826 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006827\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006828Return True if all characters in S are whitespace\n\
6829and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006830
6831static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006832unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006833{
6834 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6835 register const Py_UNICODE *e;
6836
Guido van Rossumd57fd912000-03-10 22:53:23 +00006837 /* Shortcut for single character strings */
6838 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006839 Py_UNICODE_ISSPACE(*p))
6840 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006841
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006842 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006843 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006844 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006845
Guido van Rossumd57fd912000-03-10 22:53:23 +00006846 e = p + PyUnicode_GET_SIZE(self);
6847 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006848 if (!Py_UNICODE_ISSPACE(*p))
6849 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006850 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006851 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006852}
6853
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006854PyDoc_STRVAR(isalpha__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006855 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006856\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006857Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006858and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006859
6860static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006861unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006862{
6863 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6864 register const Py_UNICODE *e;
6865
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006866 /* Shortcut for single character strings */
6867 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006868 Py_UNICODE_ISALPHA(*p))
6869 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006870
6871 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006872 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006873 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006874
6875 e = p + PyUnicode_GET_SIZE(self);
6876 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006877 if (!Py_UNICODE_ISALPHA(*p))
6878 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006879 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006880 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006881}
6882
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006883PyDoc_STRVAR(isalnum__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006884 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006885\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006886Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006887and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006888
6889static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006890unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006891{
6892 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6893 register const Py_UNICODE *e;
6894
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006895 /* Shortcut for single character strings */
6896 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006897 Py_UNICODE_ISALNUM(*p))
6898 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006899
6900 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006901 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006902 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006903
6904 e = p + PyUnicode_GET_SIZE(self);
6905 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006906 if (!Py_UNICODE_ISALNUM(*p))
6907 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006908 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006909 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006910}
6911
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006912PyDoc_STRVAR(isdecimal__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006913 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006914\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006915Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006916False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006917
6918static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006919unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006920{
6921 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6922 register const Py_UNICODE *e;
6923
Guido van Rossumd57fd912000-03-10 22:53:23 +00006924 /* Shortcut for single character strings */
6925 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006926 Py_UNICODE_ISDECIMAL(*p))
6927 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006928
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006929 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006930 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006931 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006932
Guido van Rossumd57fd912000-03-10 22:53:23 +00006933 e = p + PyUnicode_GET_SIZE(self);
6934 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006935 if (!Py_UNICODE_ISDECIMAL(*p))
6936 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006937 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006938 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006939}
6940
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006941PyDoc_STRVAR(isdigit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006942 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006943\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006944Return True if all characters in S are digits\n\
6945and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006946
6947static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006948unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006949{
6950 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6951 register const Py_UNICODE *e;
6952
Guido van Rossumd57fd912000-03-10 22:53:23 +00006953 /* Shortcut for single character strings */
6954 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006955 Py_UNICODE_ISDIGIT(*p))
6956 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006957
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006958 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006959 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006960 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006961
Guido van Rossumd57fd912000-03-10 22:53:23 +00006962 e = p + PyUnicode_GET_SIZE(self);
6963 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006964 if (!Py_UNICODE_ISDIGIT(*p))
6965 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006966 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006967 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006968}
6969
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006970PyDoc_STRVAR(isnumeric__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006971 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006972\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006973Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006974False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006975
6976static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006977unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006978{
6979 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6980 register const Py_UNICODE *e;
6981
Guido van Rossumd57fd912000-03-10 22:53:23 +00006982 /* Shortcut for single character strings */
6983 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006984 Py_UNICODE_ISNUMERIC(*p))
6985 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006986
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006987 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006988 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006989 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006990
Guido van Rossumd57fd912000-03-10 22:53:23 +00006991 e = p + PyUnicode_GET_SIZE(self);
6992 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006993 if (!Py_UNICODE_ISNUMERIC(*p))
6994 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006995 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006996 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006997}
6998
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006999PyDoc_STRVAR(join__doc__,
Georg Brandl9b4e5822009-10-14 18:48:32 +00007000 "S.join(iterable) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007001\n\
7002Return a string which is the concatenation of the strings in the\n\
Georg Brandl9b4e5822009-10-14 18:48:32 +00007003iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007004
7005static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007006unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007007{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007008 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007009}
7010
Martin v. Löwis18e16552006-02-15 17:27:45 +00007011static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007012unicode_length(PyUnicodeObject *self)
7013{
7014 return self->length;
7015}
7016
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007017PyDoc_STRVAR(ljust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007018 "S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007019\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007020Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007021done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007022
7023static PyObject *
7024unicode_ljust(PyUnicodeObject *self, PyObject *args)
7025{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007026 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007027 Py_UNICODE fillchar = ' ';
7028
Martin v. Löwis412fb672006-04-13 06:34:32 +00007029 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007030 return NULL;
7031
Tim Peters7a29bd52001-09-12 03:03:31 +00007032 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007033 Py_INCREF(self);
7034 return (PyObject*) self;
7035 }
7036
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007037 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007038}
7039
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007040PyDoc_STRVAR(lower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007041 "S.lower() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007042\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007043Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007044
7045static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007046unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007047{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007048 return fixup(self, fixlower);
7049}
7050
/* Strip-direction selectors.  Each value doubles as an index into
   stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Format strings indexed by the selectors above; each one is "|O:"
   followed by the name of the corresponding method. */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for selector i: the format string with its three-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
7059
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007060/* externally visible for str.strip(unicode) */
7061PyObject *
7062_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7063{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007064 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7065 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7066 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7067 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7068 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007069
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007070 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007071
Benjamin Peterson857ce152009-01-31 16:29:18 +00007072 i = 0;
7073 if (striptype != RIGHTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007074 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7075 i++;
7076 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00007077 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007078
Benjamin Peterson857ce152009-01-31 16:29:18 +00007079 j = len;
7080 if (striptype != LEFTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007081 do {
7082 j--;
7083 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7084 j++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007085 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007086
Benjamin Peterson857ce152009-01-31 16:29:18 +00007087 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007088 Py_INCREF(self);
7089 return (PyObject*)self;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007090 }
7091 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007092 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007093}
7094
Guido van Rossumd57fd912000-03-10 22:53:23 +00007095
7096static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007097do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007098{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007099 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7100 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007101
Benjamin Peterson857ce152009-01-31 16:29:18 +00007102 i = 0;
7103 if (striptype != RIGHTSTRIP) {
7104 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7105 i++;
7106 }
7107 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007108
Benjamin Peterson857ce152009-01-31 16:29:18 +00007109 j = len;
7110 if (striptype != LEFTSTRIP) {
7111 do {
7112 j--;
7113 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7114 j++;
7115 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007116
Benjamin Peterson857ce152009-01-31 16:29:18 +00007117 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7118 Py_INCREF(self);
7119 return (PyObject*)self;
7120 }
7121 else
7122 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007123}
7124
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007125
7126static PyObject *
7127do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7128{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007129 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007130
Benjamin Peterson857ce152009-01-31 16:29:18 +00007131 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7132 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007133
Benjamin Peterson857ce152009-01-31 16:29:18 +00007134 if (sep != NULL && sep != Py_None) {
7135 if (PyUnicode_Check(sep))
7136 return _PyUnicode_XStrip(self, striptype, sep);
7137 else if (PyString_Check(sep)) {
7138 PyObject *res;
7139 sep = PyUnicode_FromObject(sep);
7140 if (sep==NULL)
7141 return NULL;
7142 res = _PyUnicode_XStrip(self, striptype, sep);
7143 Py_DECREF(sep);
7144 return res;
7145 }
7146 else {
7147 PyErr_Format(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007148 "%s arg must be None, unicode or str",
7149 STRIPNAME(striptype));
Benjamin Peterson857ce152009-01-31 16:29:18 +00007150 return NULL;
7151 }
7152 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007153
Benjamin Peterson857ce152009-01-31 16:29:18 +00007154 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007155}
7156
7157
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007158PyDoc_STRVAR(strip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007159 "S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007160\n\
7161Return a copy of the string S with leading and trailing\n\
7162whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007163If chars is given and not None, remove characters in chars instead.\n\
7164If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007165
7166static PyObject *
7167unicode_strip(PyUnicodeObject *self, PyObject *args)
7168{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007169 if (PyTuple_GET_SIZE(args) == 0)
7170 return do_strip(self, BOTHSTRIP); /* Common case */
7171 else
7172 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007173}
7174
7175
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007176PyDoc_STRVAR(lstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007177 "S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007178\n\
7179Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007180If chars is given and not None, remove characters in chars instead.\n\
7181If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007182
7183static PyObject *
7184unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7185{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007186 if (PyTuple_GET_SIZE(args) == 0)
7187 return do_strip(self, LEFTSTRIP); /* Common case */
7188 else
7189 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007190}
7191
7192
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007193PyDoc_STRVAR(rstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007194 "S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007195\n\
7196Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007197If chars is given and not None, remove characters in chars instead.\n\
7198If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007199
7200static PyObject *
7201unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7202{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007203 if (PyTuple_GET_SIZE(args) == 0)
7204 return do_strip(self, RIGHTSTRIP); /* Common case */
7205 else
7206 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007207}
7208
7209
Guido van Rossumd57fd912000-03-10 22:53:23 +00007210static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007211unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007212{
7213 PyUnicodeObject *u;
7214 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007215 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007216 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007217
7218 if (len < 0)
7219 len = 0;
7220
Tim Peters7a29bd52001-09-12 03:03:31 +00007221 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007222 /* no repeat, return original string */
7223 Py_INCREF(str);
7224 return (PyObject*) str;
7225 }
Tim Peters8f422462000-09-09 06:13:41 +00007226
7227 /* ensure # of chars needed doesn't overflow int and # of bytes
7228 * needed doesn't overflow size_t
7229 */
7230 nchars = len * str->length;
7231 if (len && nchars / len != str->length) {
7232 PyErr_SetString(PyExc_OverflowError,
7233 "repeated string is too long");
7234 return NULL;
7235 }
7236 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7237 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7238 PyErr_SetString(PyExc_OverflowError,
7239 "repeated string is too long");
7240 return NULL;
7241 }
7242 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007243 if (!u)
7244 return NULL;
7245
7246 p = u->str;
7247
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007248 if (str->length == 1 && len > 0) {
7249 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007250 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007251 Py_ssize_t done = 0; /* number of characters copied this far */
7252 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007253 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007254 done = str->length;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007255 }
7256 while (done < nchars) {
Neal Norwitz4677fbf72008-03-25 04:18:18 +00007257 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007258 Py_UNICODE_COPY(p+done, p, n);
7259 done += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007260 }
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007261 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007262
7263 return (PyObject*) u;
7264}
7265
7266PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007267 PyObject *subobj,
7268 PyObject *replobj,
7269 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007270{
7271 PyObject *self;
7272 PyObject *str1;
7273 PyObject *str2;
7274 PyObject *result;
7275
7276 self = PyUnicode_FromObject(obj);
7277 if (self == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007278 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007279 str1 = PyUnicode_FromObject(subobj);
7280 if (str1 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007281 Py_DECREF(self);
7282 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007283 }
7284 str2 = PyUnicode_FromObject(replobj);
7285 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007286 Py_DECREF(self);
7287 Py_DECREF(str1);
7288 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007289 }
Tim Petersced69f82003-09-16 20:30:58 +00007290 result = replace((PyUnicodeObject *)self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007291 (PyUnicodeObject *)str1,
7292 (PyUnicodeObject *)str2,
7293 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007294 Py_DECREF(self);
7295 Py_DECREF(str1);
7296 Py_DECREF(str2);
7297 return result;
7298}
7299
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007300PyDoc_STRVAR(replace__doc__,
Ezio Melotti2f06b782010-06-26 18:44:42 +00007301 "S.replace(old, new[, count]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007302\n\
7303Return a copy of S with all occurrences of substring\n\
Georg Brandl30fadc12008-05-30 07:54:16 +00007304old replaced by new. If the optional argument count is\n\
7305given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007306
7307static PyObject*
7308unicode_replace(PyUnicodeObject *self, PyObject *args)
7309{
7310 PyUnicodeObject *str1;
7311 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007312 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007313 PyObject *result;
7314
Martin v. Löwis18e16552006-02-15 17:27:45 +00007315 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007316 return NULL;
7317 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7318 if (str1 == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007319 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007320 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007321 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007322 Py_DECREF(str1);
7323 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007324 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007325
7326 result = replace(self, str1, str2, maxcount);
7327
7328 Py_DECREF(str1);
7329 Py_DECREF(str2);
7330 return result;
7331}
7332
7333static
7334PyObject *unicode_repr(PyObject *unicode)
7335{
7336 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007337 PyUnicode_GET_SIZE(unicode),
7338 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007339}
7340
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007341PyDoc_STRVAR(rfind__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007342 "S.rfind(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007343\n\
7344Return the highest index in S where substring sub is found,\n\
Senthil Kumaran5e3a19d2011-07-27 23:36:51 +08007345such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007346arguments start and end are interpreted as in slice notation.\n\
7347\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007348Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007349
7350static PyObject *
7351unicode_rfind(PyUnicodeObject *self, PyObject *args)
7352{
Jesus Cea44e81682011-04-20 16:39:15 +02007353 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007354 Py_ssize_t start;
7355 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007356 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007357
Jesus Cea44e81682011-04-20 16:39:15 +02007358 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
7359 &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007360 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007361
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007362 result = stringlib_rfind_slice(
7363 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7364 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7365 start, end
7366 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007367
7368 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007369
7370 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007371}
7372
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007373PyDoc_STRVAR(rindex__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007374 "S.rindex(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007375\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007376Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007377
7378static PyObject *
7379unicode_rindex(PyUnicodeObject *self, PyObject *args)
7380{
Jesus Cea44e81682011-04-20 16:39:15 +02007381 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007382 Py_ssize_t start;
7383 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007384 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007385
Jesus Cea44e81682011-04-20 16:39:15 +02007386 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
7387 &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007388 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007389
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007390 result = stringlib_rfind_slice(
7391 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7392 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7393 start, end
7394 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007395
7396 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007397
Guido van Rossumd57fd912000-03-10 22:53:23 +00007398 if (result < 0) {
7399 PyErr_SetString(PyExc_ValueError, "substring not found");
7400 return NULL;
7401 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007402 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007403}
7404
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007405PyDoc_STRVAR(rjust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007406 "S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007407\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007408Return S right-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007409done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007410
7411static PyObject *
7412unicode_rjust(PyUnicodeObject *self, PyObject *args)
7413{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007414 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007415 Py_UNICODE fillchar = ' ';
7416
Martin v. Löwis412fb672006-04-13 06:34:32 +00007417 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007418 return NULL;
7419
Tim Peters7a29bd52001-09-12 03:03:31 +00007420 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007421 Py_INCREF(self);
7422 return (PyObject*) self;
7423 }
7424
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007425 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007426}
7427
Guido van Rossumd57fd912000-03-10 22:53:23 +00007428static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007429unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007430{
7431 /* standard clamping */
7432 if (start < 0)
7433 start = 0;
7434 if (end < 0)
7435 end = 0;
7436 if (end > self->length)
7437 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007438 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007439 /* full slice, return original string */
7440 Py_INCREF(self);
7441 return (PyObject*) self;
7442 }
7443 if (start > end)
7444 start = end;
7445 /* copy slice */
7446 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007447 end - start);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007448}
7449
7450PyObject *PyUnicode_Split(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007451 PyObject *sep,
7452 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007453{
7454 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007455
Guido van Rossumd57fd912000-03-10 22:53:23 +00007456 s = PyUnicode_FromObject(s);
7457 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007458 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007459 if (sep != NULL) {
7460 sep = PyUnicode_FromObject(sep);
7461 if (sep == NULL) {
7462 Py_DECREF(s);
7463 return NULL;
7464 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007465 }
7466
7467 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7468
7469 Py_DECREF(s);
7470 Py_XDECREF(sep);
7471 return result;
7472}
7473
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007474PyDoc_STRVAR(split__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007475 "S.split([sep [,maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007476\n\
7477Return a list of the words in S, using sep as the\n\
7478delimiter string. If maxsplit is given, at most maxsplit\n\
Georg Brandldfb77db2008-05-11 09:11:40 +00007479splits are done. If sep is not specified or is None, any\n\
Georg Brandlecbbd942008-05-11 20:53:55 +00007480whitespace string is a separator and empty strings are\n\
7481removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007482
7483static PyObject*
7484unicode_split(PyUnicodeObject *self, PyObject *args)
7485{
7486 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007487 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007488
Martin v. Löwis18e16552006-02-15 17:27:45 +00007489 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007490 return NULL;
7491
7492 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007493 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007494 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007495 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007496 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007497 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007498}
7499
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007500PyObject *
7501PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7502{
7503 PyObject* str_obj;
7504 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007505 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00007506
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007507 str_obj = PyUnicode_FromObject(str_in);
7508 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007509 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007510 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00007511 if (!sep_obj) {
7512 Py_DECREF(str_obj);
7513 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007514 }
7515
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007516 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00007517 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7518 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7519 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007520
Fredrik Lundhb9479482006-05-26 17:22:38 +00007521 Py_DECREF(sep_obj);
7522 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007523
7524 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007525}
7526
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007527
7528PyObject *
7529PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7530{
7531 PyObject* str_obj;
7532 PyObject* sep_obj;
7533 PyObject* out;
7534
7535 str_obj = PyUnicode_FromObject(str_in);
7536 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007537 return NULL;
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007538 sep_obj = PyUnicode_FromObject(sep_in);
7539 if (!sep_obj) {
7540 Py_DECREF(str_obj);
7541 return NULL;
7542 }
7543
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007544 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007545 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7546 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7547 );
7548
7549 Py_DECREF(sep_obj);
7550 Py_DECREF(str_obj);
7551
7552 return out;
7553}
7554
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007555PyDoc_STRVAR(partition__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007556 "S.partition(sep) -> (head, sep, tail)\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007557\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007558Search for the separator sep in S, and return the part before it,\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007559the separator itself, and the part after it. If the separator is not\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007560found, return S and two empty strings.");
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007561
7562static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00007563unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007564{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007565 return PyUnicode_Partition((PyObject *)self, separator);
7566}
7567
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007568PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti1fafaab2010-01-25 11:24:37 +00007569 "S.rpartition(sep) -> (head, sep, tail)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007570\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007571Search for the separator sep in S, starting at the end of S, and return\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007572the part before it, the separator itself, and the part after it. If the\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007573separator is not found, return two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007574
7575static PyObject*
7576unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7577{
7578 return PyUnicode_RPartition((PyObject *)self, separator);
7579}
7580
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007581PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007582 PyObject *sep,
7583 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007584{
7585 PyObject *result;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007586
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007587 s = PyUnicode_FromObject(s);
7588 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007589 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007590 if (sep != NULL) {
7591 sep = PyUnicode_FromObject(sep);
7592 if (sep == NULL) {
7593 Py_DECREF(s);
7594 return NULL;
7595 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007596 }
7597
7598 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7599
7600 Py_DECREF(s);
7601 Py_XDECREF(sep);
7602 return result;
7603}
7604
7605PyDoc_STRVAR(rsplit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007606 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007607\n\
7608Return a list of the words in S, using sep as the\n\
7609delimiter string, starting at the end of the string and\n\
7610working to the front. If maxsplit is given, at most maxsplit\n\
7611splits are done. If sep is not specified, any whitespace string\n\
7612is a separator.");
7613
7614static PyObject*
7615unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7616{
7617 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007618 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007619
Martin v. Löwis18e16552006-02-15 17:27:45 +00007620 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007621 return NULL;
7622
7623 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007624 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007625 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007626 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007627 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007628 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007629}
7630
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007631PyDoc_STRVAR(splitlines__doc__,
Raymond Hettingeraad5b022012-06-02 01:42:58 -04007632 "S.splitlines(keepends=False) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007633\n\
7634Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007635Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007636is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007637
7638static PyObject*
7639unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7640{
Guido van Rossum86662912000-04-11 15:38:46 +00007641 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007642
Guido van Rossum86662912000-04-11 15:38:46 +00007643 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007644 return NULL;
7645
Guido van Rossum86662912000-04-11 15:38:46 +00007646 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007647}
7648
7649static
7650PyObject *unicode_str(PyUnicodeObject *self)
7651{
Fred Drakee4315f52000-05-09 19:53:39 +00007652 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007653}
7654
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007655PyDoc_STRVAR(swapcase__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007656 "S.swapcase() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007657\n\
7658Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007659and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007660
7661static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007662unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007663{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007664 return fixup(self, fixswapcase);
7665}
7666
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007667PyDoc_STRVAR(translate__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007668 "S.translate(table) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007669\n\
7670Return a copy of the string S, where all characters have been mapped\n\
7671through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007672Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7673Unmapped characters are left untouched. Characters mapped to None\n\
7674are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007675
7676static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007677unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007678{
Tim Petersced69f82003-09-16 20:30:58 +00007679 return PyUnicode_TranslateCharmap(self->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007680 self->length,
7681 table,
7682 "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007683}
7684
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007685PyDoc_STRVAR(upper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007686 "S.upper() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007687\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007688Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007689
7690static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007691unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007692{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007693 return fixup(self, fixupper);
7694}
7695
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007696PyDoc_STRVAR(zfill__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007697 "S.zfill(width) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007698\n\
Georg Brandl98064072008-09-09 19:26:00 +00007699Pad a numeric string S with zeros on the left, to fill a field\n\
7700of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007701
7702static PyObject *
7703unicode_zfill(PyUnicodeObject *self, PyObject *args)
7704{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007705 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007706 PyUnicodeObject *u;
7707
Martin v. Löwis18e16552006-02-15 17:27:45 +00007708 Py_ssize_t width;
7709 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007710 return NULL;
7711
7712 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007713 if (PyUnicode_CheckExact(self)) {
7714 Py_INCREF(self);
7715 return (PyObject*) self;
7716 }
7717 else
7718 return PyUnicode_FromUnicode(
7719 PyUnicode_AS_UNICODE(self),
7720 PyUnicode_GET_SIZE(self)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007721 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007722 }
7723
7724 fill = width - self->length;
7725
7726 u = pad(self, fill, 0, '0');
7727
Walter Dörwald068325e2002-04-15 13:36:47 +00007728 if (u == NULL)
7729 return NULL;
7730
Guido van Rossumd57fd912000-03-10 22:53:23 +00007731 if (u->str[fill] == '+' || u->str[fill] == '-') {
7732 /* move sign to beginning of string */
7733 u->str[0] = u->str[fill];
7734 u->str[fill] = '0';
7735 }
7736
7737 return (PyObject*) u;
7738}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007739
#if 0
/* Debugging aid (compiled out): reports the value of numfree as a
   Python int. */
static PyObject *
free_listsize(PyUnicodeObject *self)
{
    PyObject *count = PyInt_FromLong(numfree);

    return count;
}
#endif
7747
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007748PyDoc_STRVAR(startswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007749 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007750\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007751Return True if S starts with the specified prefix, False otherwise.\n\
7752With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007753With optional end, stop comparing S at that position.\n\
7754prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007755
7756static PyObject *
7757unicode_startswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007758 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007759{
Georg Brandl24250812006-06-09 18:45:48 +00007760 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007761 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007762 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007763 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007764 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007765
Jesus Cea44e81682011-04-20 16:39:15 +02007766 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007767 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007768 if (PyTuple_Check(subobj)) {
7769 Py_ssize_t i;
7770 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7771 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007772 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007773 if (substring == NULL)
7774 return NULL;
7775 result = tailmatch(self, substring, start, end, -1);
7776 Py_DECREF(substring);
7777 if (result) {
7778 Py_RETURN_TRUE;
7779 }
7780 }
7781 /* nothing matched */
7782 Py_RETURN_FALSE;
7783 }
7784 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottie3685f62011-04-26 05:12:51 +03007785 if (substring == NULL) {
7786 if (PyErr_ExceptionMatches(PyExc_TypeError))
7787 PyErr_Format(PyExc_TypeError, "startswith first arg must be str, "
7788 "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007789 return NULL;
Ezio Melottie3685f62011-04-26 05:12:51 +03007790 }
Georg Brandl24250812006-06-09 18:45:48 +00007791 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007792 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007793 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007794}
7795
7796
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007797PyDoc_STRVAR(endswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007798 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007799\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007800Return True if S ends with the specified suffix, False otherwise.\n\
7801With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007802With optional end, stop comparing S at that position.\n\
7803suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007804
7805static PyObject *
7806unicode_endswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007807 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007808{
Georg Brandl24250812006-06-09 18:45:48 +00007809 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007810 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007811 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007812 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007813 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007814
Jesus Cea44e81682011-04-20 16:39:15 +02007815 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007816 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007817 if (PyTuple_Check(subobj)) {
7818 Py_ssize_t i;
7819 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7820 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007821 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007822 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007823 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007824 result = tailmatch(self, substring, start, end, +1);
7825 Py_DECREF(substring);
7826 if (result) {
7827 Py_RETURN_TRUE;
7828 }
7829 }
7830 Py_RETURN_FALSE;
7831 }
7832 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottie3685f62011-04-26 05:12:51 +03007833 if (substring == NULL) {
7834 if (PyErr_ExceptionMatches(PyExc_TypeError))
7835 PyErr_Format(PyExc_TypeError, "endswith first arg must be str, "
7836 "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007837 return NULL;
Ezio Melottie3685f62011-04-26 05:12:51 +03007838 }
Georg Brandl24250812006-06-09 18:45:48 +00007839 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007840 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007841 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007842}
7843
7844
Eric Smitha9f7d622008-02-17 19:46:49 +00007845/* Implements do_string_format, which is unicode because of stringlib */
7846#include "stringlib/string_format.h"
7847
7848PyDoc_STRVAR(format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007849 "S.format(*args, **kwargs) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007850\n\
Eric Smith6c840852010-11-06 19:43:44 +00007851Return a formatted version of S, using substitutions from args and kwargs.\n\
7852The substitutions are identified by braces ('{' and '}').");
Eric Smitha9f7d622008-02-17 19:46:49 +00007853
Eric Smithdc13b792008-05-30 18:10:04 +00007854static PyObject *
7855unicode__format__(PyObject *self, PyObject *args)
7856{
7857 PyObject *format_spec;
7858 PyObject *result = NULL;
7859 PyObject *tmp = NULL;
7860
7861 /* If 2.x, convert format_spec to the same type as value */
7862 /* This is to allow things like u''.format('') */
7863 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7864 goto done;
7865 if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7866 PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007867 "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
Eric Smithdc13b792008-05-30 18:10:04 +00007868 goto done;
7869 }
7870 tmp = PyObject_Unicode(format_spec);
7871 if (tmp == NULL)
7872 goto done;
7873 format_spec = tmp;
7874
7875 result = _PyUnicode_FormatAdvanced(self,
7876 PyUnicode_AS_UNICODE(format_spec),
7877 PyUnicode_GET_SIZE(format_spec));
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007878 done:
Eric Smithdc13b792008-05-30 18:10:04 +00007879 Py_XDECREF(tmp);
7880 return result;
7881}
7882
Eric Smitha9f7d622008-02-17 19:46:49 +00007883PyDoc_STRVAR(p_format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007884 "S.__format__(format_spec) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007885\n\
Eric Smith6c840852010-11-06 19:43:44 +00007886Return a formatted version of S as described by format_spec.");
Eric Smitha9f7d622008-02-17 19:46:49 +00007887
Robert Schuppenies901c9972008-06-10 10:10:31 +00007888static PyObject *
7889unicode__sizeof__(PyUnicodeObject *v)
7890{
Robert Schuppenies9be2ec12008-07-10 15:24:04 +00007891 return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7892 sizeof(Py_UNICODE) * (v->length + 1));
Robert Schuppenies901c9972008-06-10 10:10:31 +00007893}
7894
7895PyDoc_STRVAR(sizeof__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007896 "S.__sizeof__() -> size of S in memory, in bytes\n\
Robert Schuppenies901c9972008-06-10 10:10:31 +00007897\n\
7898");
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007899
7900static PyObject *
7901unicode_getnewargs(PyUnicodeObject *v)
7902{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007903 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007904}
7905
7906
Guido van Rossumd57fd912000-03-10 22:53:23 +00007907static PyMethodDef unicode_methods[] = {
Benjamin Peterson332d7212009-09-18 21:14:55 +00007908 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007909 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7910 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007911 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007912 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7913 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7914 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7915 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7916 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7917 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7918 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00007919 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007920 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7921 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7922 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007923 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Benjamin Peterson332d7212009-09-18 21:14:55 +00007924 {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007925/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7926 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7927 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7928 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007929 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007930 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007931 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007932 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007933 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7934 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7935 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7936 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7937 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7938 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7939 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7940 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7941 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7942 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7943 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7944 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7945 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7946 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007947 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smitha9f7d622008-02-17 19:46:49 +00007948 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
7949 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
7950 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
7951 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Robert Schuppenies901c9972008-06-10 10:10:31 +00007952 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007953#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007954 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007955#endif
7956
7957#if 0
7958 /* This one is just used for debugging the implementation. */
Christian Heimes5b970ad2008-02-06 13:33:44 +00007959 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007960#endif
7961
Benjamin Peterson857ce152009-01-31 16:29:18 +00007962 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007963 {NULL, NULL}
7964};
7965
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007966static PyObject *
7967unicode_mod(PyObject *v, PyObject *w)
7968{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007969 if (!PyUnicode_Check(v)) {
7970 Py_INCREF(Py_NotImplemented);
7971 return Py_NotImplemented;
7972 }
7973 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007974}
7975
7976static PyNumberMethods unicode_as_number = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007977 0, /*nb_add*/
7978 0, /*nb_subtract*/
7979 0, /*nb_multiply*/
7980 0, /*nb_divide*/
7981 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007982};
7983
Guido van Rossumd57fd912000-03-10 22:53:23 +00007984static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007985 (lenfunc) unicode_length, /* sq_length */
7986 PyUnicode_Concat, /* sq_concat */
7987 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7988 (ssizeargfunc) unicode_getitem, /* sq_item */
7989 (ssizessizeargfunc) unicode_slice, /* sq_slice */
7990 0, /* sq_ass_item */
7991 0, /* sq_ass_slice */
7992 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007993};
7994
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007995static PyObject*
7996unicode_subscript(PyUnicodeObject* self, PyObject* item)
7997{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00007998 if (PyIndex_Check(item)) {
7999 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008000 if (i == -1 && PyErr_Occurred())
8001 return NULL;
8002 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008003 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008004 return unicode_getitem(self, i);
8005 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008006 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008007 Py_UNICODE* source_buf;
8008 Py_UNICODE* result_buf;
8009 PyObject* result;
8010
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008011 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008012 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008013 return NULL;
8014 }
8015
8016 if (slicelength <= 0) {
8017 return PyUnicode_FromUnicode(NULL, 0);
Thomas Wouters3ccec682007-08-28 15:28:19 +00008018 } else if (start == 0 && step == 1 && slicelength == self->length &&
8019 PyUnicode_CheckExact(self)) {
8020 Py_INCREF(self);
8021 return (PyObject *)self;
8022 } else if (step == 1) {
8023 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008024 } else {
8025 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Neal Norwitz419fd492008-03-17 20:22:43 +00008026 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8027 sizeof(Py_UNICODE));
Benjamin Peterson857ce152009-01-31 16:29:18 +00008028
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008029 if (result_buf == NULL)
8030 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008031
8032 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8033 result_buf[i] = source_buf[cur];
8034 }
Tim Petersced69f82003-09-16 20:30:58 +00008035
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008036 result = PyUnicode_FromUnicode(result_buf, slicelength);
Neal Norwitz419fd492008-03-17 20:22:43 +00008037 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008038 return result;
8039 }
8040 } else {
8041 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8042 return NULL;
8043 }
8044}
8045
8046static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00008047 (lenfunc)unicode_length, /* mp_length */
8048 (binaryfunc)unicode_subscript, /* mp_subscript */
8049 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008050};
8051
Martin v. Löwis18e16552006-02-15 17:27:45 +00008052static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008053unicode_buffer_getreadbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008054 Py_ssize_t index,
8055 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008056{
8057 if (index != 0) {
8058 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008059 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008060 return -1;
8061 }
8062 *ptr = (void *) self->str;
8063 return PyUnicode_GET_DATA_SIZE(self);
8064}
8065
Martin v. Löwis18e16552006-02-15 17:27:45 +00008066static Py_ssize_t
8067unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008068 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008069{
8070 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008071 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008072 return -1;
8073}
8074
8075static int
8076unicode_buffer_getsegcount(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008077 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008078{
8079 if (lenp)
8080 *lenp = PyUnicode_GET_DATA_SIZE(self);
8081 return 1;
8082}
8083
Martin v. Löwiseb079f12006-02-16 14:32:27 +00008084static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008085unicode_buffer_getcharbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008086 Py_ssize_t index,
8087 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008088{
8089 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00008090
Guido van Rossumd57fd912000-03-10 22:53:23 +00008091 if (index != 0) {
8092 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008093 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008094 return -1;
8095 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008096 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008097 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008098 return -1;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008099 *ptr = (void *) PyString_AS_STRING(str);
8100 return PyString_GET_SIZE(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008101}
8102
8103/* Helpers for PyUnicode_Format() */
8104
8105static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008106getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008107{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008108 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008109 if (argidx < arglen) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008110 (*p_argidx)++;
8111 if (arglen < 0)
8112 return args;
8113 else
8114 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008115 }
8116 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008117 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008118 return NULL;
8119}
8120
/* Bit flags for the "flags" argument of the format helpers; each macro
   is a distinct single bit so they can be OR-ed together. */
#define F_LJUST 0x01
#define F_SIGN  0x02
#define F_BLANK 0x04
#define F_ALT   0x08
#define F_ZERO  0x10
Guido van Rossumd57fd912000-03-10 22:53:23 +00008126
Martin v. Löwis18e16552006-02-15 17:27:45 +00008127static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008128strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008129{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008130 register Py_ssize_t i;
8131 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008132 for (i = len - 1; i >= 0; i--)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008133 buffer[i] = (Py_UNICODE) charbuffer[i];
Guido van Rossumd57fd912000-03-10 22:53:23 +00008134
Guido van Rossumd57fd912000-03-10 22:53:23 +00008135 return len;
8136}
8137
Neal Norwitzfc76d632006-01-10 06:03:13 +00008138static int
Neal Norwitzfc76d632006-01-10 06:03:13 +00008139longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8140{
Tim Peters15231542006-02-16 01:08:01 +00008141 Py_ssize_t result;
8142
Neal Norwitzfc76d632006-01-10 06:03:13 +00008143 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008144 result = strtounicode(buffer, (char *)buffer);
8145 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008146}
8147
Guido van Rossum078151d2002-08-11 04:24:12 +00008148/* XXX To save some code duplication, formatfloat/long/int could have been
8149 shared with stringobject.c, converting from 8-bit to Unicode after the
8150 formatting is done. */
8151
Mark Dickinson18cfada2009-11-23 18:46:41 +00008152/* Returns a new reference to a PyUnicode object, or NULL on failure. */
8153
8154static PyObject *
8155formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008156{
Mark Dickinson18cfada2009-11-23 18:46:41 +00008157 char *p;
8158 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008159 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008160
Guido van Rossumd57fd912000-03-10 22:53:23 +00008161 x = PyFloat_AsDouble(v);
8162 if (x == -1.0 && PyErr_Occurred())
Mark Dickinson18cfada2009-11-23 18:46:41 +00008163 return NULL;
8164
Guido van Rossumd57fd912000-03-10 22:53:23 +00008165 if (prec < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008166 prec = 6;
Mark Dickinsond4814bf2009-03-29 16:24:29 +00008167
Mark Dickinson18cfada2009-11-23 18:46:41 +00008168 p = PyOS_double_to_string(x, type, prec,
8169 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
8170 if (p == NULL)
8171 return NULL;
8172 result = PyUnicode_FromStringAndSize(p, strlen(p));
8173 PyMem_Free(p);
8174 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008175}
8176
Tim Peters38fd5b62000-09-21 05:43:11 +00008177static PyObject*
8178formatlong(PyObject *val, int flags, int prec, int type)
8179{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008180 char *buf;
8181 int i, len;
8182 PyObject *str; /* temporary string object. */
8183 PyUnicodeObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008184
Benjamin Peterson857ce152009-01-31 16:29:18 +00008185 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8186 if (!str)
8187 return NULL;
8188 result = _PyUnicode_New(len);
8189 if (!result) {
8190 Py_DECREF(str);
8191 return NULL;
8192 }
8193 for (i = 0; i < len; i++)
8194 result->str[i] = buf[i];
8195 result->str[len] = 0;
8196 Py_DECREF(str);
8197 return (PyObject*)result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008198}
8199
Guido van Rossumd57fd912000-03-10 22:53:23 +00008200static int
8201formatint(Py_UNICODE *buf,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008202 size_t buflen,
8203 int flags,
8204 int prec,
8205 int type,
8206 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008207{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008208 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008209 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8210 * + 1 + 1
8211 * = 24
8212 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008213 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008214 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008215 long x;
8216
8217 x = PyInt_AsLong(v);
8218 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008219 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008220 if (x < 0 && type == 'u') {
8221 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008222 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008223 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8224 sign = "-";
8225 else
8226 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008227 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008228 prec = 1;
8229
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008230 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8231 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008232 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008233 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008234 PyErr_SetString(PyExc_OverflowError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008235 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008236 return -1;
8237 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008238
8239 if ((flags & F_ALT) &&
8240 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00008241 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008242 * of issues that cause pain:
8243 * - when 0 is being converted, the C standard leaves off
8244 * the '0x' or '0X', which is inconsistent with other
8245 * %#x/%#X conversions and inconsistent with Python's
8246 * hex() function
8247 * - there are platforms that violate the standard and
8248 * convert 0 with the '0x' or '0X'
8249 * (Metrowerks, Compaq Tru64)
8250 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008251 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008252 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008253 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008254 * We can achieve the desired consistency by inserting our
8255 * own '0x' or '0X' prefix, and substituting %x/%X in place
8256 * of %#x/%#X.
8257 *
8258 * Note that this is the same approach as used in
8259 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008260 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008261 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8262 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008263 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008264 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008265 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8266 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008267 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008268 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008269 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008270 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008271 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008272 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008273}
8274
8275static int
8276formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008277 size_t buflen,
8278 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008279{
Ezio Melotti32125152010-02-25 17:36:04 +00008280 PyObject *unistr;
8281 char *str;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008282 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008283 if (PyUnicode_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008284 if (PyUnicode_GET_SIZE(v) != 1)
8285 goto onError;
8286 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008287 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008288
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008289 else if (PyString_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008290 if (PyString_GET_SIZE(v) != 1)
8291 goto onError;
Ezio Melotti32125152010-02-25 17:36:04 +00008292 /* #7649: "u'%c' % char" should behave like "u'%s' % char" and fail
8293 with a UnicodeDecodeError if 'char' is not decodable with the
8294 default encoding (usually ASCII, but it might be something else) */
8295 str = PyString_AS_STRING(v);
8296 if ((unsigned char)str[0] > 0x7F) {
8297 /* the char is not ASCII; try to decode the string using the
8298 default encoding and return -1 to let the UnicodeDecodeError
8299 be raised if the string can't be decoded */
8300 unistr = PyUnicode_Decode(str, 1, NULL, "strict");
8301 if (unistr == NULL)
8302 return -1;
8303 buf[0] = PyUnicode_AS_UNICODE(unistr)[0];
8304 Py_DECREF(unistr);
8305 }
8306 else
8307 buf[0] = (Py_UNICODE)str[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008308 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008309
8310 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008311 /* Integer input truncated to a character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008312 long x;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008313 x = PyInt_AsLong(v);
8314 if (x == -1 && PyErr_Occurred())
8315 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008316#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008317 if (x < 0 || x > 0x10ffff) {
8318 PyErr_SetString(PyExc_OverflowError,
8319 "%c arg not in range(0x110000) "
8320 "(wide Python build)");
8321 return -1;
8322 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008323#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008324 if (x < 0 || x > 0xffff) {
8325 PyErr_SetString(PyExc_OverflowError,
8326 "%c arg not in range(0x10000) "
8327 "(narrow Python build)");
8328 return -1;
8329 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008330#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008331 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008332 }
8333 buf[1] = '\0';
8334 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008335
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008336 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008337 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008338 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008339 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008340}
8341
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the number of elements in the scratch buffer in which
   ints and chars are formatted.  XXX This is a magic number.  Each
   formatting routine does bounds checking to ensure no overflow, but a
   better solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the macro behaves as a
   single primary expression wherever it is used (e.g. "sizeof FORMATBUFLEN").
*/
#define FORMATBUFLEN ((size_t)120)
8351
Guido van Rossumd57fd912000-03-10 22:53:23 +00008352PyObject *PyUnicode_Format(PyObject *format,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008353 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008354{
8355 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008356 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008357 int args_owned = 0;
8358 PyUnicodeObject *result = NULL;
8359 PyObject *dict = NULL;
8360 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008361
Guido van Rossumd57fd912000-03-10 22:53:23 +00008362 if (format == NULL || args == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008363 PyErr_BadInternalCall();
8364 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008365 }
8366 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008367 if (uformat == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008368 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008369 fmt = PyUnicode_AS_UNICODE(uformat);
8370 fmtcnt = PyUnicode_GET_SIZE(uformat);
8371
8372 reslen = rescnt = fmtcnt + 100;
8373 result = _PyUnicode_New(reslen);
8374 if (result == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008375 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008376 res = PyUnicode_AS_UNICODE(result);
8377
8378 if (PyTuple_Check(args)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008379 arglen = PyTuple_Size(args);
8380 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008381 }
8382 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008383 arglen = -1;
8384 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008385 }
Benjamin Petersonda2c7eb2013-03-23 22:32:00 -05008386 if (Py_TYPE(args)->tp_as_mapping && Py_TYPE(args)->tp_as_mapping->mp_subscript &&
8387 !PyTuple_Check(args) && !PyObject_TypeCheck(args, &PyBaseString_Type))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008388 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008389
8390 while (--fmtcnt >= 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008391 if (*fmt != '%') {
8392 if (--rescnt < 0) {
8393 rescnt = fmtcnt + 100;
8394 reslen += rescnt;
8395 if (_PyUnicode_Resize(&result, reslen) < 0)
8396 goto onError;
8397 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8398 --rescnt;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008399 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008400 *res++ = *fmt++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008401 }
8402 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008403 /* Got a format specifier */
8404 int flags = 0;
8405 Py_ssize_t width = -1;
8406 int prec = -1;
8407 Py_UNICODE c = '\0';
8408 Py_UNICODE fill;
8409 int isnumok;
8410 PyObject *v = NULL;
8411 PyObject *temp = NULL;
8412 Py_UNICODE *pbuf;
8413 Py_UNICODE sign;
8414 Py_ssize_t len;
Mark Dickinson18cfada2009-11-23 18:46:41 +00008415 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008416
8417 fmt++;
8418 if (*fmt == '(') {
8419 Py_UNICODE *keystart;
8420 Py_ssize_t keylen;
8421 PyObject *key;
8422 int pcount = 1;
8423
8424 if (dict == NULL) {
8425 PyErr_SetString(PyExc_TypeError,
8426 "format requires a mapping");
8427 goto onError;
8428 }
8429 ++fmt;
8430 --fmtcnt;
8431 keystart = fmt;
8432 /* Skip over balanced parentheses */
8433 while (pcount > 0 && --fmtcnt >= 0) {
8434 if (*fmt == ')')
8435 --pcount;
8436 else if (*fmt == '(')
8437 ++pcount;
8438 fmt++;
8439 }
8440 keylen = fmt - keystart - 1;
8441 if (fmtcnt < 0 || pcount > 0) {
8442 PyErr_SetString(PyExc_ValueError,
8443 "incomplete format key");
8444 goto onError;
8445 }
8446#if 0
8447 /* keys are converted to strings using UTF-8 and
8448 then looked up since Python uses strings to hold
8449 variables names etc. in its namespaces and we
8450 wouldn't want to break common idioms. */
8451 key = PyUnicode_EncodeUTF8(keystart,
8452 keylen,
8453 NULL);
8454#else
8455 key = PyUnicode_FromUnicode(keystart, keylen);
8456#endif
8457 if (key == NULL)
8458 goto onError;
8459 if (args_owned) {
8460 Py_DECREF(args);
8461 args_owned = 0;
8462 }
8463 args = PyObject_GetItem(dict, key);
8464 Py_DECREF(key);
8465 if (args == NULL) {
8466 goto onError;
8467 }
8468 args_owned = 1;
8469 arglen = -1;
8470 argidx = -2;
8471 }
8472 while (--fmtcnt >= 0) {
8473 switch (c = *fmt++) {
8474 case '-': flags |= F_LJUST; continue;
8475 case '+': flags |= F_SIGN; continue;
8476 case ' ': flags |= F_BLANK; continue;
8477 case '#': flags |= F_ALT; continue;
8478 case '0': flags |= F_ZERO; continue;
8479 }
8480 break;
8481 }
8482 if (c == '*') {
8483 v = getnextarg(args, arglen, &argidx);
8484 if (v == NULL)
8485 goto onError;
8486 if (!PyInt_Check(v)) {
8487 PyErr_SetString(PyExc_TypeError,
8488 "* wants int");
8489 goto onError;
8490 }
Serhiy Storchaka74f49ab2013-01-19 12:55:39 +02008491 width = PyInt_AsSsize_t(v);
8492 if (width == -1 && PyErr_Occurred())
8493 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008494 if (width < 0) {
8495 flags |= F_LJUST;
8496 width = -width;
8497 }
8498 if (--fmtcnt >= 0)
8499 c = *fmt++;
8500 }
8501 else if (c >= '0' && c <= '9') {
8502 width = c - '0';
8503 while (--fmtcnt >= 0) {
8504 c = *fmt++;
8505 if (c < '0' || c > '9')
8506 break;
Mark Dickinson75d36002012-10-28 10:00:46 +00008507 if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008508 PyErr_SetString(PyExc_ValueError,
8509 "width too big");
8510 goto onError;
8511 }
8512 width = width*10 + (c - '0');
8513 }
8514 }
8515 if (c == '.') {
8516 prec = 0;
8517 if (--fmtcnt >= 0)
8518 c = *fmt++;
8519 if (c == '*') {
8520 v = getnextarg(args, arglen, &argidx);
8521 if (v == NULL)
8522 goto onError;
8523 if (!PyInt_Check(v)) {
8524 PyErr_SetString(PyExc_TypeError,
8525 "* wants int");
8526 goto onError;
8527 }
Serhiy Storchaka74f49ab2013-01-19 12:55:39 +02008528 prec = _PyInt_AsInt(v);
8529 if (prec == -1 && PyErr_Occurred())
8530 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008531 if (prec < 0)
8532 prec = 0;
8533 if (--fmtcnt >= 0)
8534 c = *fmt++;
8535 }
8536 else if (c >= '0' && c <= '9') {
8537 prec = c - '0';
8538 while (--fmtcnt >= 0) {
Stefan Krah0b9201f2010-07-19 18:06:46 +00008539 c = *fmt++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008540 if (c < '0' || c > '9')
8541 break;
Mark Dickinson75d36002012-10-28 10:00:46 +00008542 if (prec > (INT_MAX - ((int)c - '0')) / 10) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008543 PyErr_SetString(PyExc_ValueError,
8544 "prec too big");
8545 goto onError;
8546 }
8547 prec = prec*10 + (c - '0');
8548 }
8549 }
8550 } /* prec */
8551 if (fmtcnt >= 0) {
8552 if (c == 'h' || c == 'l' || c == 'L') {
8553 if (--fmtcnt >= 0)
8554 c = *fmt++;
8555 }
8556 }
8557 if (fmtcnt < 0) {
8558 PyErr_SetString(PyExc_ValueError,
8559 "incomplete format");
8560 goto onError;
8561 }
8562 if (c != '%') {
8563 v = getnextarg(args, arglen, &argidx);
8564 if (v == NULL)
8565 goto onError;
8566 }
8567 sign = 0;
8568 fill = ' ';
8569 switch (c) {
8570
8571 case '%':
8572 pbuf = formatbuf;
8573 /* presume that buffer length is at least 1 */
8574 pbuf[0] = '%';
8575 len = 1;
8576 break;
8577
8578 case 's':
8579 case 'r':
Victor Stinner95affc42010-03-22 12:24:37 +00008580 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008581 temp = v;
8582 Py_INCREF(temp);
8583 }
8584 else {
8585 PyObject *unicode;
8586 if (c == 's')
8587 temp = PyObject_Unicode(v);
8588 else
8589 temp = PyObject_Repr(v);
8590 if (temp == NULL)
8591 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008592 if (PyUnicode_Check(temp))
8593 /* nothing to do */;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008594 else if (PyString_Check(temp)) {
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008595 /* convert to string to Unicode */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008596 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8597 PyString_GET_SIZE(temp),
8598 NULL,
8599 "strict");
8600 Py_DECREF(temp);
8601 temp = unicode;
8602 if (temp == NULL)
8603 goto onError;
8604 }
8605 else {
8606 Py_DECREF(temp);
8607 PyErr_SetString(PyExc_TypeError,
8608 "%s argument has non-string str()");
8609 goto onError;
8610 }
8611 }
8612 pbuf = PyUnicode_AS_UNICODE(temp);
8613 len = PyUnicode_GET_SIZE(temp);
8614 if (prec >= 0 && len > prec)
8615 len = prec;
8616 break;
8617
8618 case 'i':
8619 case 'd':
8620 case 'u':
8621 case 'o':
8622 case 'x':
8623 case 'X':
8624 if (c == 'i')
8625 c = 'd';
8626 isnumok = 0;
8627 if (PyNumber_Check(v)) {
8628 PyObject *iobj=NULL;
8629
8630 if (PyInt_Check(v) || (PyLong_Check(v))) {
8631 iobj = v;
8632 Py_INCREF(iobj);
8633 }
8634 else {
8635 iobj = PyNumber_Int(v);
8636 if (iobj==NULL) iobj = PyNumber_Long(v);
8637 }
8638 if (iobj!=NULL) {
8639 if (PyInt_Check(iobj)) {
8640 isnumok = 1;
8641 pbuf = formatbuf;
8642 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8643 flags, prec, c, iobj);
8644 Py_DECREF(iobj);
8645 if (len < 0)
8646 goto onError;
8647 sign = 1;
8648 }
8649 else if (PyLong_Check(iobj)) {
8650 isnumok = 1;
8651 temp = formatlong(iobj, flags, prec, c);
8652 Py_DECREF(iobj);
8653 if (!temp)
8654 goto onError;
8655 pbuf = PyUnicode_AS_UNICODE(temp);
8656 len = PyUnicode_GET_SIZE(temp);
8657 sign = 1;
8658 }
8659 else {
8660 Py_DECREF(iobj);
8661 }
8662 }
8663 }
8664 if (!isnumok) {
8665 PyErr_Format(PyExc_TypeError,
8666 "%%%c format: a number is required, "
8667 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8668 goto onError;
8669 }
8670 if (flags & F_ZERO)
8671 fill = '0';
8672 break;
8673
8674 case 'e':
8675 case 'E':
8676 case 'f':
8677 case 'F':
8678 case 'g':
8679 case 'G':
Mark Dickinson18cfada2009-11-23 18:46:41 +00008680 temp = formatfloat(v, flags, prec, c);
8681 if (temp == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008682 goto onError;
Mark Dickinson18cfada2009-11-23 18:46:41 +00008683 pbuf = PyUnicode_AS_UNICODE(temp);
8684 len = PyUnicode_GET_SIZE(temp);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008685 sign = 1;
8686 if (flags & F_ZERO)
8687 fill = '0';
8688 break;
8689
8690 case 'c':
8691 pbuf = formatbuf;
8692 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8693 if (len < 0)
8694 goto onError;
8695 break;
8696
8697 default:
8698 PyErr_Format(PyExc_ValueError,
8699 "unsupported format character '%c' (0x%x) "
8700 "at index %zd",
8701 (31<=c && c<=126) ? (char)c : '?',
8702 (int)c,
8703 (Py_ssize_t)(fmt - 1 -
8704 PyUnicode_AS_UNICODE(uformat)));
Benjamin Peterson857ce152009-01-31 16:29:18 +00008705 goto onError;
8706 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008707 if (sign) {
8708 if (*pbuf == '-' || *pbuf == '+') {
8709 sign = *pbuf++;
8710 len--;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008711 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008712 else if (flags & F_SIGN)
8713 sign = '+';
8714 else if (flags & F_BLANK)
8715 sign = ' ';
8716 else
8717 sign = 0;
8718 }
8719 if (width < len)
8720 width = len;
8721 if (rescnt - (sign != 0) < width) {
8722 reslen -= rescnt;
8723 rescnt = width + fmtcnt + 100;
8724 reslen += rescnt;
8725 if (reslen < 0) {
8726 Py_XDECREF(temp);
8727 PyErr_NoMemory();
8728 goto onError;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008729 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008730 if (_PyUnicode_Resize(&result, reslen) < 0) {
8731 Py_XDECREF(temp);
8732 goto onError;
8733 }
8734 res = PyUnicode_AS_UNICODE(result)
8735 + reslen - rescnt;
8736 }
8737 if (sign) {
8738 if (fill != ' ')
8739 *res++ = sign;
8740 rescnt--;
8741 if (width > len)
8742 width--;
8743 }
8744 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8745 assert(pbuf[0] == '0');
8746 assert(pbuf[1] == c);
8747 if (fill != ' ') {
8748 *res++ = *pbuf++;
8749 *res++ = *pbuf++;
8750 }
8751 rescnt -= 2;
8752 width -= 2;
8753 if (width < 0)
8754 width = 0;
8755 len -= 2;
8756 }
8757 if (width > len && !(flags & F_LJUST)) {
8758 do {
8759 --rescnt;
8760 *res++ = fill;
8761 } while (--width > len);
8762 }
8763 if (fill == ' ') {
8764 if (sign)
8765 *res++ = sign;
8766 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8767 assert(pbuf[0] == '0');
8768 assert(pbuf[1] == c);
8769 *res++ = *pbuf++;
8770 *res++ = *pbuf++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008771 }
8772 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008773 Py_UNICODE_COPY(res, pbuf, len);
8774 res += len;
8775 rescnt -= len;
8776 while (--width >= len) {
8777 --rescnt;
8778 *res++ = ' ';
8779 }
8780 if (dict && (argidx < arglen) && c != '%') {
8781 PyErr_SetString(PyExc_TypeError,
8782 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008783 Py_XDECREF(temp);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008784 goto onError;
8785 }
8786 Py_XDECREF(temp);
8787 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008788 } /* until end */
8789 if (argidx < arglen && !dict) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008790 PyErr_SetString(PyExc_TypeError,
8791 "not all arguments converted during string formatting");
8792 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008793 }
8794
Thomas Woutersa96affe2006-03-12 00:29:36 +00008795 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008796 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008797 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008798 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008799 }
8800 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008801 return (PyObject *)result;
8802
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008803 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008804 Py_XDECREF(result);
8805 Py_DECREF(uformat);
8806 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008807 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008808 }
8809 return NULL;
8810}
8811
8812static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008813 (readbufferproc) unicode_buffer_getreadbuf,
8814 (writebufferproc) unicode_buffer_getwritebuf,
8815 (segcountproc) unicode_buffer_getsegcount,
8816 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008817};
8818
Jeremy Hylton938ace62002-07-17 16:30:39 +00008819static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008820unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8821
Tim Peters6d6c1a32001-08-02 04:15:00 +00008822static PyObject *
8823unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8824{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008825 PyObject *x = NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008826 static char *kwlist[] = {"string", "encoding", "errors", 0};
8827 char *encoding = NULL;
8828 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00008829
Benjamin Peterson857ce152009-01-31 16:29:18 +00008830 if (type != &PyUnicode_Type)
8831 return unicode_subtype_new(type, args, kwds);
8832 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008833 kwlist, &x, &encoding, &errors))
Benjamin Peterson857ce152009-01-31 16:29:18 +00008834 return NULL;
8835 if (x == NULL)
8836 return (PyObject *)_PyUnicode_New(0);
8837 if (encoding == NULL && errors == NULL)
8838 return PyObject_Unicode(x);
8839 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008840 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008841}
8842
Guido van Rossume023fe02001-08-30 03:12:59 +00008843static PyObject *
8844unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8845{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008846 PyUnicodeObject *tmp, *pnew;
8847 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008848
Benjamin Peterson857ce152009-01-31 16:29:18 +00008849 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8850 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8851 if (tmp == NULL)
8852 return NULL;
8853 assert(PyUnicode_Check(tmp));
8854 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8855 if (pnew == NULL) {
8856 Py_DECREF(tmp);
8857 return NULL;
8858 }
8859 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8860 if (pnew->str == NULL) {
8861 _Py_ForgetReference((PyObject *)pnew);
8862 PyObject_Del(pnew);
8863 Py_DECREF(tmp);
8864 return PyErr_NoMemory();
8865 }
8866 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8867 pnew->length = n;
8868 pnew->hash = tmp->hash;
8869 Py_DECREF(tmp);
8870 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008871}
8872
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008873PyDoc_STRVAR(unicode_doc,
Chris Jerdonekad4b0002012-10-07 20:37:54 -07008874 "unicode(object='') -> unicode object\n\
8875unicode(string[, encoding[, errors]]) -> unicode object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00008876\n\
8877Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008878encoding defaults to the current default string encoding.\n\
8879errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008880
Guido van Rossumd57fd912000-03-10 22:53:23 +00008881PyTypeObject PyUnicode_Type = {
Martin v. Löwis68192102007-07-21 06:55:02 +00008882 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00008883 "unicode", /* tp_name */
8884 sizeof(PyUnicodeObject), /* tp_size */
8885 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008886 /* Slots */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008887 (destructor)unicode_dealloc, /* tp_dealloc */
8888 0, /* tp_print */
8889 0, /* tp_getattr */
8890 0, /* tp_setattr */
8891 0, /* tp_compare */
8892 unicode_repr, /* tp_repr */
8893 &unicode_as_number, /* tp_as_number */
8894 &unicode_as_sequence, /* tp_as_sequence */
8895 &unicode_as_mapping, /* tp_as_mapping */
8896 (hashfunc) unicode_hash, /* tp_hash*/
8897 0, /* tp_call*/
8898 (reprfunc) unicode_str, /* tp_str */
8899 PyObject_GenericGetAttr, /* tp_getattro */
8900 0, /* tp_setattro */
8901 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008902 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008903 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008904 unicode_doc, /* tp_doc */
8905 0, /* tp_traverse */
8906 0, /* tp_clear */
8907 PyUnicode_RichCompare, /* tp_richcompare */
8908 0, /* tp_weaklistoffset */
8909 0, /* tp_iter */
8910 0, /* tp_iternext */
8911 unicode_methods, /* tp_methods */
8912 0, /* tp_members */
8913 0, /* tp_getset */
8914 &PyBaseString_Type, /* tp_base */
8915 0, /* tp_dict */
8916 0, /* tp_descr_get */
8917 0, /* tp_descr_set */
8918 0, /* tp_dictoffset */
8919 0, /* tp_init */
8920 0, /* tp_alloc */
8921 unicode_new, /* tp_new */
8922 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008923};
8924
8925/* Initialize the Unicode implementation */
8926
Thomas Wouters78890102000-07-22 19:25:51 +00008927void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008928{
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008929 /* XXX - move this array to unicodectype.c ? */
8930 Py_UNICODE linebreak[] = {
8931 0x000A, /* LINE FEED */
8932 0x000D, /* CARRIAGE RETURN */
8933 0x001C, /* FILE SEPARATOR */
8934 0x001D, /* GROUP SEPARATOR */
8935 0x001E, /* RECORD SEPARATOR */
8936 0x0085, /* NEXT LINE */
8937 0x2028, /* LINE SEPARATOR */
8938 0x2029, /* PARAGRAPH SEPARATOR */
8939 };
8940
Fred Drakee4315f52000-05-09 19:53:39 +00008941 /* Init the implementation */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02008942 if (!unicode_empty) {
8943 unicode_empty = _PyUnicode_New(0);
8944 if (!unicode_empty)
8945 return;
8946 }
Neal Norwitze1fdb322006-07-21 05:32:28 +00008947
Guido van Rossumcacfc072002-05-24 19:01:59 +00008948 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008949 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008950
8951 /* initialize the linebreak bloom filter */
8952 bloom_linebreak = make_bloom_mask(
8953 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8954 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00008955
8956 PyType_Ready(&EncodingMapType);
Benjamin Peterson6da3ed62012-10-30 23:21:10 -04008957
8958 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
8959 Py_FatalError("Can't initialize field name iterator type");
8960
8961 if (PyType_Ready(&PyFormatterIter_Type) < 0)
8962 Py_FatalError("Can't initialize formatter iter type");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008963}
8964
8965/* Finalize the Unicode implementation */
8966
Christian Heimes3b718a72008-02-14 12:47:33 +00008967int
8968PyUnicode_ClearFreeList(void)
8969{
8970 int freelist_size = numfree;
8971 PyUnicodeObject *u;
8972
8973 for (u = free_list; u != NULL;) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008974 PyUnicodeObject *v = u;
8975 u = *(PyUnicodeObject **)u;
8976 if (v->str)
8977 PyObject_DEL(v->str);
8978 Py_XDECREF(v->defenc);
8979 PyObject_Del(v);
8980 numfree--;
Christian Heimes3b718a72008-02-14 12:47:33 +00008981 }
8982 free_list = NULL;
8983 assert(numfree == 0);
8984 return freelist_size;
8985}
8986
Guido van Rossumd57fd912000-03-10 22:53:23 +00008987void
Thomas Wouters78890102000-07-22 19:25:51 +00008988_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008989{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008990 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008991
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02008992 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008993
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02008994 for (i = 0; i < 256; i++)
8995 Py_CLEAR(unicode_latin1[i]);
8996
Christian Heimes3b718a72008-02-14 12:47:33 +00008997 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00008998}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008999
Anthony Baxterac6bd462006-04-13 02:06:09 +00009000#ifdef __cplusplus
9001}
9002#endif