blob: 4e05490431fabb1b82698ea1aa3c04ae58090aa4 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
Guido van Rossumd57fd912000-03-10 22:53:23 +000049/* Limit for the Unicode object free list */
50
Christian Heimes2202f872008-02-06 14:31:34 +000051#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000052
53/* Limit for the Unicode object free list stay alive optimization.
54
55 The implementation will keep allocated Unicode memory intact for
56 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000057 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000058
Christian Heimes2202f872008-02-06 14:31:34 +000059 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000060 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000061 malloc()-overhead) bytes of unused garbage.
62
63 Setting the limit to 0 effectively turns the feature off.
64
Guido van Rossumfd4b9572000-04-10 13:51:10 +000065 Note: This is an experimental feature ! If you get core dumps when
66 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000067
68*/
69
Guido van Rossumfd4b9572000-04-10 13:51:10 +000070#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72/* Endianness switches; defaults to little endian */
73
74#ifdef WORDS_BIGENDIAN
75# define BYTEORDER_IS_BIG_ENDIAN
76#else
77# define BYTEORDER_IS_LITTLE_ENDIAN
78#endif
79
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000080/* --- Globals ------------------------------------------------------------
81
82 The globals are initialized by the _PyUnicode_Init() API and should
83 not be used before calling that API.
84
85*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000086
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000087
88#ifdef __cplusplus
89extern "C" {
90#endif
91
/* Validity check used by the assertions in the accessor macros of this
   file: the full consistency checker in debug builds, a plain type check
   otherwise. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020097
/* --- Unchecked field accessors ------------------------------------------
   Each macro expands to the named struct member of op after casting op
   to the struct type that declares the member; no validation is done. */

#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)

/* --- Checked accessors --------------------------------------------------
   These assert _PyUnicode_CHECK(op) before reading the member. */

#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* UTF-8 pointer of a ready string.  For compact ASCII strings this is the
   memory located directly after the PyASCIIObject header; for every other
   string it is the utf8 member. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     (PyUnicode_IS_COMPACT_ASCII(op) ?                  \
          ((char*)((PyASCIIObject*)(op) + 1)) :         \
          _PyUnicode_UTF8(op)))

/* UTF-8 length of a ready string.  For compact ASCII strings this is the
   length member; for every other string it is utf8_length. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     (PyUnicode_IS_COMPACT_ASCII(op) ?                  \
          ((PyASCIIObject*)(op))->length :              \
          _PyUnicode_UTF8_LENGTH(op)))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200132
/* Bring op into the "ready" representation if it is not ready yet.
   Evaluates to 0 when op is already ready, otherwise to the result of
   _PyUnicode_Ready().  Any earlier definition of PyUnicode_READY is
   dropped so that this version, which asserts _PyUnicode_CHECK() first,
   is the one used in the rest of this file. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready((PyObject *)(op))))

/* Variant taking the address of the object pointer: evaluates to 0 when
   *p_obj is already ready, otherwise to the result of
   _PyUnicode_ReadyReplace(p_obj).  The argument is parenthesized before
   it is dereferenced so that pointer expressions such as "items + 1"
   select the intended element instead of expanding to "*items + 1". */
#define _PyUnicode_READY_REPLACE(p_obj)                 \
    (assert(_PyUnicode_CHECK(*(p_obj))),                \
     (PyUnicode_IS_READY(*(p_obj)) ?                    \
      0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj))))
144
/* True when the utf8 pointer of op is the same address as its character
   data, i.e. no separate UTF-8 buffer is referenced.  Must not be used
   on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True when the wstr pointer of op is the same address as its character
   data.  Only the macro argument is referenced, so the macro works no
   matter how the caller names its variable. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
152
/* Non-zero when op references a UTF-8 buffer of its own: op is not
   compact ASCII, the utf8 pointer is set, and it does not alias the
   character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op) != NULL                    \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* Non-zero when op references a wstr buffer of its own: the wstr pointer
   is set and it is not the case that op is ready with wstr aliasing the
   character data.  PyUnicode_DATA() is only evaluated for ready strings. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) != NULL                       \
      && !(PyUnicode_IS_READY(op)                       \
           && _PyUnicode_WSTR(op) == PyUnicode_DATA(op))))
168
/* Copy a run of characters, converting each element from one integer
   type to another.

   from_type, to_type: valid type names of the source and target elements.
   begin, end:         pointers of type "from_type *" delimiting the source
                       run; end is exclusive.
   to:                 pointer to the output buffer; it is cast to
                       "to_type *" and receives one element per source
                       element. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        const from_type *iter_ = (begin);               \
        to_type *to_ = (to_type *)(to);                 \
        while (iter_ < (end)) {                         \
            *to_++ = (to_type)*iter_++;                 \
        }                                               \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200183
/* The Unicode string has been modified: reset its stored hash to -1, the
   same value _PyUnicode_New() stores in a freshly created string. */
#define _PyUnicode_DIRTY(op)                            \
    do {                                                \
        _PyUnicode_HASH(op) = -1;                       \
    } while (0)
186
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
*/
195static PyObject *interned;
196
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000197/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200198static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199
200/* Single character Unicode strings in the Latin-1 range are being
201 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200202static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000203
/* Fast detection of the most frequent whitespace characters: entry c is 1
   when the ASCII code point c is one of the characters named below and 0
   otherwise.  The table has one entry per code point 0x00-0x7F. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
                  0x0B LINE TABULATION, 0x0C FORM FEED,
                  0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
234
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200235/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200236static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200237static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200238
Alexander Belopolsky40018472011-02-26 01:02:56 +0000239static PyObject *
240unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000241 PyObject **errorHandler,const char *encoding, const char *reason,
242 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
243 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
244
Alexander Belopolsky40018472011-02-26 01:02:56 +0000245static void
246raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300247 const char *encoding,
248 const Py_UNICODE *unicode, Py_ssize_t size,
249 Py_ssize_t startpos, Py_ssize_t endpos,
250 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000251
/* Fast detection of ASCII line breaks: entry c is 1 when the ASCII code
   point c is one of the characters named below and 0 otherwise.  The
   table has one entry per code point 0x00-0x7F. */
static unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
                  0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
279
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300280/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
281 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000282Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000283PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000284{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000285#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000286 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000287#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000288 /* This is actually an illegal character, so it should
289 not be passed to unichr. */
290 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000291#endif
292}
293
#ifdef Py_DEBUG
/* Debug-build sanity check of the internal state of a unicode object.

   Every invariant is verified with assert(), so a violation aborts the
   process; when all checks pass the function returns 1, which lets it be
   used inside an assert() itself. */
static int
_PyUnicode_CheckConsistency(void *op)
{
    PyASCIIObject *ascii;
    PyCompactUnicodeObject *compact;
    unsigned int kind;
    void *data;
    int kind_matches_wchar;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    compact = (PyCompactUnicodeObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII: only the kind and the ready flag are checked */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
        return 1;
    }

    if (ascii->state.compact == 1) {
        /* compact, non-ASCII: the characters follow the compact header */
        data = compact + 1;
        assert(kind == PyUnicode_1BYTE_KIND
               || kind == PyUnicode_2BYTE_KIND
               || kind == PyUnicode_4BYTE_KIND);
        assert(ascii->state.ascii == 0);
        assert(ascii->state.ready == 1);
        assert(compact->utf8 != data);
    }
    else {
        /* not compact: the characters are referenced through data.any */
        data = ((PyUnicodeObject *)op)->data.any;
        if (kind == PyUnicode_WCHAR_KIND) {
            /* not ready yet: only the wstr buffer exists */
            assert(ascii->state.compact == 0);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 0);
            assert(ascii->wstr != NULL);
            assert(data == NULL);
            assert(compact->utf8 == NULL);
            assert(ascii->state.interned == SSTATE_NOT_INTERNED);
        }
        else {
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.compact == 0);
            assert(ascii->state.ready == 1);
            assert(data != NULL);
            if (ascii->state.ascii) {
                /* ASCII: the UTF-8 form is the data itself */
                assert(compact->utf8 == data);
                assert(compact->utf8_length == ascii->length);
            }
            else {
                assert(compact->utf8 != data);
            }
        }
    }

    if (kind != PyUnicode_WCHAR_KIND) {
        /* wstr aliases the data exactly when the character width equals
           sizeof(wchar_t) */
#if SIZEOF_WCHAR_T == 2
        kind_matches_wchar = (kind == PyUnicode_2BYTE_KIND);
#else
        kind_matches_wchar = (kind == PyUnicode_4BYTE_KIND);
#endif
        if (kind_matches_wchar) {
            assert(ascii->wstr == data);
            assert(compact->wstr_length == ascii->length);
        }
        else {
            assert(ascii->wstr != data);
        }
    }

    /* a missing buffer must come with a zero length */
    if (compact->utf8 == NULL)
        assert(compact->utf8_length == 0);
    if (ascii->wstr == NULL)
        assert(compact->wstr_length == 0);
    return 1;
}
#else
/* Without Py_DEBUG nothing is checked; the function always returns 1. */
static int
_PyUnicode_CheckConsistency(void *op)
{
    return 1;
}
#endif
379
Thomas Wouters477c8d52006-05-27 19:21:47 +0000380/* --- Bloom Filters ----------------------------------------------------- */
381
382/* stuff to implement simple "bloom filters" for Unicode characters.
383 to keep things simple, we use a single bitmask, using the least 5
384 bits from each unicode characters as the bit index. */
385
386/* the linebreak mask is set up by Unicode_Init below */
387
Antoine Pitrouf068f942010-01-13 14:19:12 +0000388#if LONG_BIT >= 128
389#define BLOOM_WIDTH 128
390#elif LONG_BIT >= 64
391#define BLOOM_WIDTH 64
392#elif LONG_BIT >= 32
393#define BLOOM_WIDTH 32
394#else
395#error "LONG_BIT is smaller than 32"
396#endif
397
Thomas Wouters477c8d52006-05-27 19:21:47 +0000398#define BLOOM_MASK unsigned long
399
400static BLOOM_MASK bloom_linebreak;
401
Antoine Pitrouf068f942010-01-13 14:19:12 +0000402#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
403#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000404
Benjamin Peterson29060642009-01-31 22:14:21 +0000405#define BLOOM_LINEBREAK(ch) \
406 ((ch) < 128U ? ascii_linebreak[(ch)] : \
407 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000408
Alexander Belopolsky40018472011-02-26 01:02:56 +0000409Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200410make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000411{
412 /* calculate simple bloom-style bitmask for a given unicode string */
413
Antoine Pitrouf068f942010-01-13 14:19:12 +0000414 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000415 Py_ssize_t i;
416
417 mask = 0;
418 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200419 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000420
421 return mask;
422}
423
/* Non-zero when chr occurs in str.  The string is only searched with
   PyUnicode_FindChar() when the bit for chr is set in mask. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr) &&                                                \
     (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000427
Guido van Rossumd57fd912000-03-10 22:53:23 +0000428/* --- Unicode Object ----------------------------------------------------- */
429
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200430static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200431fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s));
432
433Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
434 Py_ssize_t size, Py_UCS4 ch,
435 int direction)
436{
437 /* like wcschr, but doesn't stop at NULL characters */
438 Py_ssize_t i;
439 if (direction == 1) {
440 for(i = 0; i < size; i++)
441 if (PyUnicode_READ(kind, s, i) == ch)
442 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
443 }
444 else {
445 for(i = size-1; i >= 0; i--)
446 if (PyUnicode_READ(kind, s, i) == ch)
447 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
448 }
449 return NULL;
450}
451
Victor Stinnerfe226c02011-10-03 03:52:20 +0200452static PyObject*
453resize_compact(PyObject *unicode, Py_ssize_t length)
454{
455 Py_ssize_t char_size;
456 Py_ssize_t struct_size;
457 Py_ssize_t new_size;
458 int share_wstr;
459
460 assert(PyUnicode_IS_READY(unicode));
461 char_size = PyUnicode_CHARACTER_SIZE(unicode);
462 if (PyUnicode_IS_COMPACT_ASCII(unicode))
463 struct_size = sizeof(PyASCIIObject);
464 else
465 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200466 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200467
468 _Py_DEC_REFTOTAL;
469 _Py_ForgetReference(unicode);
470
471 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
472 PyErr_NoMemory();
473 return NULL;
474 }
475 new_size = (struct_size + (length + 1) * char_size);
476
477 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
478 if (unicode == NULL) {
479 PyObject_Del(unicode);
480 PyErr_NoMemory();
481 return NULL;
482 }
483 _Py_NewReference(unicode);
484 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200485 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200486 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200487 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
488 _PyUnicode_WSTR_LENGTH(unicode) = length;
489 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200490 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
491 length, 0);
492 return unicode;
493}
494
Alexander Belopolsky40018472011-02-26 01:02:56 +0000495static int
Victor Stinner95663112011-10-04 01:03:50 +0200496resize_inplace(PyUnicodeObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000497{
Victor Stinner95663112011-10-04 01:03:50 +0200498 wchar_t *wstr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200499 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200500 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000501
Victor Stinner95663112011-10-04 01:03:50 +0200502 _PyUnicode_DIRTY(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200503
504 if (PyUnicode_IS_READY(unicode)) {
505 Py_ssize_t char_size;
506 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200507 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200508 void *data;
509
510 data = _PyUnicode_DATA_ANY(unicode);
511 assert(data != NULL);
512 char_size = PyUnicode_CHARACTER_SIZE(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200513 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
514 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinner95663112011-10-04 01:03:50 +0200515 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
516 {
517 PyObject_DEL(_PyUnicode_UTF8(unicode));
518 _PyUnicode_UTF8(unicode) = NULL;
519 _PyUnicode_UTF8_LENGTH(unicode) = 0;
520 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200521
522 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
523 PyErr_NoMemory();
524 return -1;
525 }
526 new_size = (length + 1) * char_size;
527
528 data = (PyObject *)PyObject_REALLOC(data, new_size);
529 if (data == NULL) {
530 PyErr_NoMemory();
531 return -1;
532 }
533 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200534 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200535 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200536 _PyUnicode_WSTR_LENGTH(unicode) = length;
537 }
538 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200539 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200540 _PyUnicode_UTF8_LENGTH(unicode) = length;
541 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200542 _PyUnicode_LENGTH(unicode) = length;
543 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200544 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400545 _PyUnicode_CheckConsistency(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200546 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200547 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200548 }
Victor Stinner95663112011-10-04 01:03:50 +0200549 assert(_PyUnicode_WSTR(unicode) != NULL);
550
551 /* check for integer overflow */
552 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
553 PyErr_NoMemory();
554 return -1;
555 }
556 wstr = _PyUnicode_WSTR(unicode);
557 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
558 if (!wstr) {
559 PyErr_NoMemory();
560 return -1;
561 }
562 _PyUnicode_WSTR(unicode) = wstr;
563 _PyUnicode_WSTR(unicode)[length] = 0;
564 _PyUnicode_WSTR_LENGTH(unicode) = length;
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400565 _PyUnicode_CheckConsistency(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000566 return 0;
567}
568
Victor Stinnerfe226c02011-10-03 03:52:20 +0200569static PyObject*
570resize_copy(PyObject *unicode, Py_ssize_t length)
571{
572 Py_ssize_t copy_length;
573 if (PyUnicode_IS_COMPACT(unicode)) {
574 PyObject *copy;
575 assert(PyUnicode_IS_READY(unicode));
576
577 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
578 if (copy == NULL)
579 return NULL;
580
581 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
582 if (PyUnicode_CopyCharacters(copy, 0,
583 unicode, 0,
584 copy_length) < 0)
585 {
586 Py_DECREF(copy);
587 return NULL;
588 }
589 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200590 }
591 else {
Victor Stinner2fd82272011-10-03 04:06:05 +0200592 PyUnicodeObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200593 assert(_PyUnicode_WSTR(unicode) != NULL);
594 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner2fd82272011-10-03 04:06:05 +0200595 w = _PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200596 if (w == NULL)
597 return NULL;
598 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
599 copy_length = Py_MIN(copy_length, length);
600 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
601 copy_length);
602 return (PyObject*)w;
603 }
604}
605
/* We allocate one more character (not byte) than requested to make sure
   the string is U+0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could be further enhanced by ensuring that the
   free list never reduces its size below 1.

*/
614
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200615#ifdef Py_DEBUG
616int unicode_old_new_calls = 0;
617#endif
618
Alexander Belopolsky40018472011-02-26 01:02:56 +0000619static PyUnicodeObject *
620_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000621{
622 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200623 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000624
Thomas Wouters477c8d52006-05-27 19:21:47 +0000625 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000626 if (length == 0 && unicode_empty != NULL) {
627 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200628 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000629 }
630
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000631 /* Ensure we won't overflow the size. */
632 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
633 return (PyUnicodeObject *)PyErr_NoMemory();
634 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200635 if (length < 0) {
636 PyErr_SetString(PyExc_SystemError,
637 "Negative size passed to _PyUnicode_New");
638 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000639 }
640
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200641#ifdef Py_DEBUG
642 ++unicode_old_new_calls;
643#endif
644
645 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
646 if (unicode == NULL)
647 return NULL;
648 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
649 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
650 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000651 PyErr_NoMemory();
652 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000653 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200654
Jeremy Hyltond8082792003-09-16 19:41:39 +0000655 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000656 * the caller fails before initializing str -- unicode_resize()
657 * reads str[0], and the Keep-Alive optimization can keep memory
658 * allocated for str alive across a call to unicode_dealloc(unicode).
659 * We don't want unicode_resize to read uninitialized memory in
660 * that case.
661 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200662 _PyUnicode_WSTR(unicode)[0] = 0;
663 _PyUnicode_WSTR(unicode)[length] = 0;
664 _PyUnicode_WSTR_LENGTH(unicode) = length;
665 _PyUnicode_HASH(unicode) = -1;
666 _PyUnicode_STATE(unicode).interned = 0;
667 _PyUnicode_STATE(unicode).kind = 0;
668 _PyUnicode_STATE(unicode).compact = 0;
669 _PyUnicode_STATE(unicode).ready = 0;
670 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200671 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200672 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200673 _PyUnicode_UTF8(unicode) = NULL;
674 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000675 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000676
Benjamin Peterson29060642009-01-31 22:14:21 +0000677 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000678 /* XXX UNREF/NEWREF interface should be more symmetrical */
679 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000680 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000681 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000682 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000683}
684
Victor Stinnerf42dc442011-10-02 23:33:16 +0200685static const char*
686unicode_kind_name(PyObject *unicode)
687{
Victor Stinner42dfd712011-10-03 14:41:45 +0200688 /* don't check consistency: unicode_kind_name() is called from
689 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200690 if (!PyUnicode_IS_COMPACT(unicode))
691 {
692 if (!PyUnicode_IS_READY(unicode))
693 return "wstr";
694 switch(PyUnicode_KIND(unicode))
695 {
696 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200697 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200698 return "legacy ascii";
699 else
700 return "legacy latin1";
701 case PyUnicode_2BYTE_KIND:
702 return "legacy UCS2";
703 case PyUnicode_4BYTE_KIND:
704 return "legacy UCS4";
705 default:
706 return "<legacy invalid kind>";
707 }
708 }
709 assert(PyUnicode_IS_READY(unicode));
710 switch(PyUnicode_KIND(unicode))
711 {
712 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200713 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200714 return "ascii";
715 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200716 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200717 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200718 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200719 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200720 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200721 default:
722 return "<invalid compact kind>";
723 }
724}
725
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200726#ifdef Py_DEBUG
727int unicode_new_new_calls = 0;
728
729/* Functions wrapping macros for use in debugger */
/* Debugger helper: function form of the PyUnicode_UTF8() macro. */
char *_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
733
/* Debugger helper: function form of the _PyUnicode_COMPACT_DATA() macro. */
void *_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
737void *_PyUnicode_data(void *unicode){
738 printf("obj %p\n", unicode);
739 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
740 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
741 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
742 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
743 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
744 return PyUnicode_DATA(unicode);
745}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200746
747void
748_PyUnicode_Dump(PyObject *op)
749{
750 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200751 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
752 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
753 void *data;
754 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
755 if (ascii->state.compact)
756 data = (compact + 1);
757 else
758 data = unicode->data.any;
759 if (ascii->wstr == data)
760 printf("shared ");
761 printf("wstr=%p", ascii->wstr);
Victor Stinnera3b334d2011-10-03 13:53:37 +0200762 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200763 printf(" (%zu), ", compact->wstr_length);
764 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
765 printf("shared ");
766 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200767 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200768 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200769}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200770#endif
771
772PyObject *
773PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
774{
775 PyObject *obj;
776 PyCompactUnicodeObject *unicode;
777 void *data;
778 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200779 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200780 Py_ssize_t char_size;
781 Py_ssize_t struct_size;
782
783 /* Optimization for empty strings */
784 if (size == 0 && unicode_empty != NULL) {
785 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200786 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200787 }
788
789#ifdef Py_DEBUG
790 ++unicode_new_new_calls;
791#endif
792
Victor Stinner9e9d6892011-10-04 01:02:02 +0200793 is_ascii = 0;
794 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200795 struct_size = sizeof(PyCompactUnicodeObject);
796 if (maxchar < 128) {
797 kind_state = PyUnicode_1BYTE_KIND;
798 char_size = 1;
799 is_ascii = 1;
800 struct_size = sizeof(PyASCIIObject);
801 }
802 else if (maxchar < 256) {
803 kind_state = PyUnicode_1BYTE_KIND;
804 char_size = 1;
805 }
806 else if (maxchar < 65536) {
807 kind_state = PyUnicode_2BYTE_KIND;
808 char_size = 2;
809 if (sizeof(wchar_t) == 2)
810 is_sharing = 1;
811 }
812 else {
813 kind_state = PyUnicode_4BYTE_KIND;
814 char_size = 4;
815 if (sizeof(wchar_t) == 4)
816 is_sharing = 1;
817 }
818
819 /* Ensure we won't overflow the size. */
820 if (size < 0) {
821 PyErr_SetString(PyExc_SystemError,
822 "Negative size passed to PyUnicode_New");
823 return NULL;
824 }
825 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
826 return PyErr_NoMemory();
827
828 /* Duplicated allocation code from _PyObject_New() instead of a call to
829 * PyObject_New() so we are able to allocate space for the object and
830 * it's data buffer.
831 */
832 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
833 if (obj == NULL)
834 return PyErr_NoMemory();
835 obj = PyObject_INIT(obj, &PyUnicode_Type);
836 if (obj == NULL)
837 return NULL;
838
839 unicode = (PyCompactUnicodeObject *)obj;
840 if (is_ascii)
841 data = ((PyASCIIObject*)obj) + 1;
842 else
843 data = unicode + 1;
844 _PyUnicode_LENGTH(unicode) = size;
845 _PyUnicode_HASH(unicode) = -1;
846 _PyUnicode_STATE(unicode).interned = 0;
847 _PyUnicode_STATE(unicode).kind = kind_state;
848 _PyUnicode_STATE(unicode).compact = 1;
849 _PyUnicode_STATE(unicode).ready = 1;
850 _PyUnicode_STATE(unicode).ascii = is_ascii;
851 if (is_ascii) {
852 ((char*)data)[size] = 0;
853 _PyUnicode_WSTR(unicode) = NULL;
854 }
855 else if (kind_state == PyUnicode_1BYTE_KIND) {
856 ((char*)data)[size] = 0;
857 _PyUnicode_WSTR(unicode) = NULL;
858 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200859 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200860 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200861 }
862 else {
863 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200864 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200865 if (kind_state == PyUnicode_2BYTE_KIND)
866 ((Py_UCS2*)data)[size] = 0;
867 else /* kind_state == PyUnicode_4BYTE_KIND */
868 ((Py_UCS4*)data)[size] = 0;
869 if (is_sharing) {
870 _PyUnicode_WSTR_LENGTH(unicode) = size;
871 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
872 }
873 else {
874 _PyUnicode_WSTR_LENGTH(unicode) = 0;
875 _PyUnicode_WSTR(unicode) = NULL;
876 }
877 }
878 return obj;
879}
880
881#if SIZEOF_WCHAR_T == 2
882/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
883 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +0200884 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200885
886 This function assumes that unicode can hold one more code point than wstr
887 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +0200888static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200889unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
890 PyUnicodeObject *unicode)
891{
892 const wchar_t *iter;
893 Py_UCS4 *ucs4_out;
894
Victor Stinner910337b2011-10-03 03:20:16 +0200895 assert(unicode != NULL);
896 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200897 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
898 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
899
900 for (iter = begin; iter < end; ) {
901 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
902 _PyUnicode_GET_LENGTH(unicode)));
903 if (*iter >= 0xD800 && *iter <= 0xDBFF
904 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
905 {
906 *ucs4_out++ = (((iter[0] & 0x3FF)<<10) | (iter[1] & 0x3FF)) + 0x10000;
907 iter += 2;
908 }
909 else {
910 *ucs4_out++ = *iter;
911 iter++;
912 }
913 }
914 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
915 _PyUnicode_GET_LENGTH(unicode)));
916
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200917}
918#endif
919
Victor Stinnercd9950f2011-10-02 00:34:53 +0200920static int
921_PyUnicode_Dirty(PyObject *unicode)
922{
Victor Stinner910337b2011-10-03 03:20:16 +0200923 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +0200924 if (Py_REFCNT(unicode) != 1) {
Victor Stinner01698042011-10-04 00:04:26 +0200925 PyErr_SetString(PyExc_SystemError,
Victor Stinnercd9950f2011-10-02 00:34:53 +0200926 "Cannot modify a string having more than 1 reference");
927 return -1;
928 }
929 _PyUnicode_DIRTY(unicode);
930 return 0;
931}
932
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200933Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200934PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
935 PyObject *from, Py_ssize_t from_start,
936 Py_ssize_t how_many)
937{
Victor Stinnera0702ab2011-09-29 14:14:38 +0200938 unsigned int from_kind, to_kind;
939 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200940
Victor Stinnerb1536152011-09-30 02:26:10 +0200941 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
942 PyErr_BadInternalCall();
943 return -1;
944 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200945
946 if (PyUnicode_READY(from))
947 return -1;
948 if (PyUnicode_READY(to))
949 return -1;
950
Victor Stinnerff9e50f2011-09-28 22:17:19 +0200951 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200952 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
Victor Stinner01698042011-10-04 00:04:26 +0200953 PyErr_Format(PyExc_SystemError,
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200954 "Cannot write %zi characters at %zi "
955 "in a string of %zi characters",
956 how_many, to_start, PyUnicode_GET_LENGTH(to));
957 return -1;
958 }
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200959 if (how_many == 0)
960 return 0;
961
Victor Stinnercd9950f2011-10-02 00:34:53 +0200962 if (_PyUnicode_Dirty(to))
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200963 return -1;
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200964
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200965 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200966 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200967 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200968 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200969
Victor Stinnerf42dc442011-10-02 23:33:16 +0200970 if (from_kind == to_kind
971 /* deny latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +0200972 && !(!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200973 {
Victor Stinnera0702ab2011-09-29 14:14:38 +0200974 Py_MEMCPY((char*)to_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200975 + PyUnicode_KIND_SIZE(to_kind, to_start),
Victor Stinnera0702ab2011-09-29 14:14:38 +0200976 (char*)from_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200977 + PyUnicode_KIND_SIZE(from_kind, from_start),
978 PyUnicode_KIND_SIZE(to_kind, how_many));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200979 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200980 else if (from_kind == PyUnicode_1BYTE_KIND
981 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200982 {
983 _PyUnicode_CONVERT_BYTES(
984 Py_UCS1, Py_UCS2,
985 PyUnicode_1BYTE_DATA(from) + from_start,
986 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
987 PyUnicode_2BYTE_DATA(to) + to_start
988 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200989 }
Victor Stinner157f83f2011-09-28 21:41:31 +0200990 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200991 && to_kind == PyUnicode_4BYTE_KIND)
992 {
993 _PyUnicode_CONVERT_BYTES(
994 Py_UCS1, Py_UCS4,
995 PyUnicode_1BYTE_DATA(from) + from_start,
996 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
997 PyUnicode_4BYTE_DATA(to) + to_start
998 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200999 }
1000 else if (from_kind == PyUnicode_2BYTE_KIND
1001 && to_kind == PyUnicode_4BYTE_KIND)
1002 {
1003 _PyUnicode_CONVERT_BYTES(
1004 Py_UCS2, Py_UCS4,
1005 PyUnicode_2BYTE_DATA(from) + from_start,
1006 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1007 PyUnicode_4BYTE_DATA(to) + to_start
1008 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001009 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001010 else {
1011 int invalid_kinds;
Victor Stinnerf42dc442011-10-02 23:33:16 +02001012
1013 /* check if max_char(from substring) <= max_char(to) */
1014 if (from_kind > to_kind
1015 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001016 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001017 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001018 /* slow path to check for character overflow */
1019 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1020 Py_UCS4 ch, maxchar;
1021 Py_ssize_t i;
1022
1023 maxchar = 0;
1024 invalid_kinds = 0;
1025 for (i=0; i < how_many; i++) {
1026 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1027 if (ch > maxchar) {
1028 maxchar = ch;
1029 if (maxchar > to_maxchar) {
1030 invalid_kinds = 1;
1031 break;
1032 }
1033 }
1034 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1035 }
1036 }
1037 else
1038 invalid_kinds = 1;
1039 if (invalid_kinds) {
Victor Stinner01698042011-10-04 00:04:26 +02001040 PyErr_Format(PyExc_SystemError,
Victor Stinnerf42dc442011-10-02 23:33:16 +02001041 "Cannot copy %s characters "
1042 "into a string of %s characters",
1043 unicode_kind_name(from),
1044 unicode_kind_name(to));
Victor Stinnera0702ab2011-09-29 14:14:38 +02001045 return -1;
1046 }
1047 }
1048 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001049}
1050
Victor Stinner17222162011-09-28 22:15:37 +02001051/* Find the maximum code point and count the number of surrogate pairs so a
1052 correct string length can be computed before converting a string to UCS4.
1053 This function counts single surrogates as a character and not as a pair.
1054
1055 Return 0 on success, or -1 on error. */
1056static int
1057find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1058 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001059{
1060 const wchar_t *iter;
1061
Victor Stinnerc53be962011-10-02 21:33:54 +02001062 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001063 *num_surrogates = 0;
1064 *maxchar = 0;
1065
1066 for (iter = begin; iter < end; ) {
Victor Stinnerae864852011-10-05 14:02:44 +02001067 if (*iter > *maxchar) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001068 *maxchar = *iter;
Victor Stinnerae864852011-10-05 14:02:44 +02001069#if SIZEOF_WCHAR_T != 2
1070 if (*maxchar >= 0x10000)
1071 return 0;
1072#endif
1073 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001074#if SIZEOF_WCHAR_T == 2
1075 if (*iter >= 0xD800 && *iter <= 0xDBFF
1076 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1077 {
1078 Py_UCS4 surrogate_val;
1079 surrogate_val = (((iter[0] & 0x3FF)<<10)
1080 | (iter[1] & 0x3FF)) + 0x10000;
1081 ++(*num_surrogates);
1082 if (surrogate_val > *maxchar)
1083 *maxchar = surrogate_val;
1084 iter += 2;
1085 }
1086 else
1087 iter++;
1088#else
1089 iter++;
1090#endif
1091 }
1092 return 0;
1093}
1094
1095#ifdef Py_DEBUG
1096int unicode_ready_calls = 0;
1097#endif
1098
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001099static int
1100unicode_ready(PyObject **p_obj, int replace)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001101{
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001102 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001103 wchar_t *end;
1104 Py_UCS4 maxchar = 0;
1105 Py_ssize_t num_surrogates;
1106#if SIZEOF_WCHAR_T == 2
1107 Py_ssize_t length_wo_surrogates;
1108#endif
1109
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001110 assert(p_obj != NULL);
1111 unicode = (PyUnicodeObject *)*p_obj;
1112
Georg Brandl7597add2011-10-05 16:36:47 +02001113 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001114 strings were created using _PyObject_New() and where no canonical
1115 representation (the str field) has been set yet aka strings
1116 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001117 assert(_PyUnicode_CHECK(unicode));
1118 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001119 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001120 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001121 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001122 /* Actually, it should neither be interned nor be anything else: */
1123 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001124
1125#ifdef Py_DEBUG
1126 ++unicode_ready_calls;
1127#endif
1128
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001129#ifdef Py_DEBUG
1130 assert(!replace || Py_REFCNT(unicode) == 1);
1131#else
1132 if (replace && Py_REFCNT(unicode) != 1)
1133 replace = 0;
1134#endif
1135 if (replace) {
1136 Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode);
1137 wchar_t *wstr = _PyUnicode_WSTR(unicode);
1138 /* Optimization for empty strings */
1139 if (len == 0) {
1140 Py_INCREF(unicode_empty);
1141 Py_DECREF(*p_obj);
1142 *p_obj = unicode_empty;
1143 return 0;
1144 }
1145 if (len == 1 && wstr[0] < 256) {
1146 PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]);
1147 if (latin1_char == NULL)
1148 return -1;
1149 Py_DECREF(*p_obj);
1150 *p_obj = latin1_char;
1151 return 0;
1152 }
1153 }
1154
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001155 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001156 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001157 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001158 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001159
1160 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001161 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1162 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001163 PyErr_NoMemory();
1164 return -1;
1165 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001166 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001167 _PyUnicode_WSTR(unicode), end,
1168 PyUnicode_1BYTE_DATA(unicode));
1169 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1170 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1171 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1172 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001173 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001174 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001175 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001176 }
1177 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001178 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001179 _PyUnicode_UTF8(unicode) = NULL;
1180 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001181 }
1182 PyObject_FREE(_PyUnicode_WSTR(unicode));
1183 _PyUnicode_WSTR(unicode) = NULL;
1184 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1185 }
1186 /* In this case we might have to convert down from 4-byte native
1187 wchar_t to 2-byte unicode. */
1188 else if (maxchar < 65536) {
1189 assert(num_surrogates == 0 &&
1190 "FindMaxCharAndNumSurrogatePairs() messed up");
1191
Victor Stinner506f5922011-09-28 22:34:18 +02001192#if SIZEOF_WCHAR_T == 2
1193 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001194 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001195 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1196 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1197 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001198 _PyUnicode_UTF8(unicode) = NULL;
1199 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001200#else
1201 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001202 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001203 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001204 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001205 PyErr_NoMemory();
1206 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001207 }
Victor Stinner506f5922011-09-28 22:34:18 +02001208 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1209 _PyUnicode_WSTR(unicode), end,
1210 PyUnicode_2BYTE_DATA(unicode));
1211 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1212 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1213 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001214 _PyUnicode_UTF8(unicode) = NULL;
1215 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001216 PyObject_FREE(_PyUnicode_WSTR(unicode));
1217 _PyUnicode_WSTR(unicode) = NULL;
1218 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1219#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001220 }
1221 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1222 else {
1223#if SIZEOF_WCHAR_T == 2
1224 /* in case the native representation is 2-bytes, we need to allocate a
1225 new normalized 4-byte version. */
1226 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001227 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1228 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001229 PyErr_NoMemory();
1230 return -1;
1231 }
1232 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1233 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001234 _PyUnicode_UTF8(unicode) = NULL;
1235 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001236 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1237 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001238 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001239 PyObject_FREE(_PyUnicode_WSTR(unicode));
1240 _PyUnicode_WSTR(unicode) = NULL;
1241 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1242#else
1243 assert(num_surrogates == 0);
1244
Victor Stinnerc3c74152011-10-02 20:39:55 +02001245 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001246 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001247 _PyUnicode_UTF8(unicode) = NULL;
1248 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001249 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1250#endif
1251 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1252 }
1253 _PyUnicode_STATE(unicode).ready = 1;
1254 return 0;
1255}
1256
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001257int
1258_PyUnicode_ReadyReplace(PyObject **op)
1259{
1260 return unicode_ready(op, 1);
1261}
1262
1263int
1264_PyUnicode_Ready(PyObject *op)
1265{
1266 return unicode_ready(&op, 0);
1267}
1268
Alexander Belopolsky40018472011-02-26 01:02:56 +00001269static void
1270unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001271{
Walter Dörwald16807132007-05-25 13:52:07 +00001272 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001273 case SSTATE_NOT_INTERNED:
1274 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001275
Benjamin Peterson29060642009-01-31 22:14:21 +00001276 case SSTATE_INTERNED_MORTAL:
1277 /* revive dead object temporarily for DelItem */
1278 Py_REFCNT(unicode) = 3;
1279 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1280 Py_FatalError(
1281 "deletion of interned string failed");
1282 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001283
Benjamin Peterson29060642009-01-31 22:14:21 +00001284 case SSTATE_INTERNED_IMMORTAL:
1285 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001286
Benjamin Peterson29060642009-01-31 22:14:21 +00001287 default:
1288 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001289 }
1290
Victor Stinner03490912011-10-03 23:45:12 +02001291 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001292 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001293 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001294 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001295
1296 if (PyUnicode_IS_COMPACT(unicode)) {
1297 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001298 }
1299 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001300 if (_PyUnicode_DATA_ANY(unicode))
1301 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Benjamin Peterson29060642009-01-31 22:14:21 +00001302 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001303 }
1304}
1305
Alexander Belopolsky40018472011-02-26 01:02:56 +00001306static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001307unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001308{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001309 if (Py_REFCNT(unicode) != 1)
1310 return 0;
1311 if (PyUnicode_CHECK_INTERNED(unicode))
1312 return 0;
Benjamin Peterson7f3140e2011-10-03 19:37:29 -04001313 assert(unicode != unicode_empty);
Victor Stinner77bb47b2011-10-03 20:06:05 +02001314#ifdef Py_DEBUG
1315 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND
1316 && PyUnicode_GET_LENGTH(unicode) == 1)
1317 {
1318 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001319 if (ch < 256 && unicode_latin1[ch] == unicode)
1320 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001321 }
Victor Stinner77bb47b2011-10-03 20:06:05 +02001322#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001323 return 1;
1324}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001325
Victor Stinnerfe226c02011-10-03 03:52:20 +02001326static int
1327unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1328{
1329 PyObject *unicode;
1330 Py_ssize_t old_length;
1331
1332 assert(p_unicode != NULL);
1333 unicode = *p_unicode;
1334
1335 assert(unicode != NULL);
1336 assert(PyUnicode_Check(unicode));
1337 assert(0 <= length);
1338
Victor Stinner910337b2011-10-03 03:20:16 +02001339 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001340 old_length = PyUnicode_WSTR_LENGTH(unicode);
1341 else
1342 old_length = PyUnicode_GET_LENGTH(unicode);
1343 if (old_length == length)
1344 return 0;
1345
Victor Stinnerfe226c02011-10-03 03:52:20 +02001346 if (!unicode_resizable(unicode)) {
1347 PyObject *copy = resize_copy(unicode, length);
1348 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001349 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001350 Py_DECREF(*p_unicode);
1351 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001352 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001353 }
1354
Victor Stinnerfe226c02011-10-03 03:52:20 +02001355 if (PyUnicode_IS_COMPACT(unicode)) {
1356 *p_unicode = resize_compact(unicode, length);
1357 if (*p_unicode == NULL)
1358 return -1;
Benjamin Petersonccc51c12011-10-03 19:34:12 -04001359 _PyUnicode_CheckConsistency(*p_unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001360 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001361 }
1362 return resize_inplace((PyUnicodeObject*)unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001363}
1364
Alexander Belopolsky40018472011-02-26 01:02:56 +00001365int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001366PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001367{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001368 PyObject *unicode;
1369 if (p_unicode == NULL) {
1370 PyErr_BadInternalCall();
1371 return -1;
1372 }
1373 unicode = *p_unicode;
1374 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1375 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1376 {
1377 PyErr_BadInternalCall();
1378 return -1;
1379 }
1380 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001381}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001382
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001383static PyObject*
1384get_latin1_char(unsigned char ch)
1385{
Victor Stinnera464fc12011-10-02 20:39:30 +02001386 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001387 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001388 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001389 if (!unicode)
1390 return NULL;
1391 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1392 unicode_latin1[ch] = unicode;
1393 }
1394 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001395 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001396}
1397
Alexander Belopolsky40018472011-02-26 01:02:56 +00001398PyObject *
1399PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001400{
1401 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001402 Py_UCS4 maxchar = 0;
1403 Py_ssize_t num_surrogates;
1404
1405 if (u == NULL)
1406 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001407
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001408 /* If the Unicode data is known at construction time, we can apply
1409 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001410
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001411 /* Optimization for empty strings */
1412 if (size == 0 && unicode_empty != NULL) {
1413 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001414 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001415 }
Tim Petersced69f82003-09-16 20:30:58 +00001416
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001417 /* Single character Unicode objects in the Latin-1 range are
1418 shared when using this constructor */
1419 if (size == 1 && *u < 256)
1420 return get_latin1_char((unsigned char)*u);
1421
1422 /* If not empty and not single character, copy the Unicode data
1423 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001424 if (find_maxchar_surrogates(u, u + size,
1425 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001426 return NULL;
1427
1428 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1429 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001430 if (!unicode)
1431 return NULL;
1432
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001433 switch (PyUnicode_KIND(unicode)) {
1434 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001435 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001436 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1437 break;
1438 case PyUnicode_2BYTE_KIND:
1439#if Py_UNICODE_SIZE == 2
1440 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1441#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001442 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001443 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1444#endif
1445 break;
1446 case PyUnicode_4BYTE_KIND:
1447#if SIZEOF_WCHAR_T == 2
1448 /* This is the only case which has to process surrogates, thus
1449 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001450 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001451#else
1452 assert(num_surrogates == 0);
1453 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1454#endif
1455 break;
1456 default:
1457 assert(0 && "Impossible state");
1458 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001459
1460 return (PyObject *)unicode;
1461}
1462
Alexander Belopolsky40018472011-02-26 01:02:56 +00001463PyObject *
1464PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001465{
1466 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001467
Benjamin Peterson14339b62009-01-31 16:36:08 +00001468 if (size < 0) {
1469 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001470 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001471 return NULL;
1472 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001473
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001474 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001475 some optimizations which share commonly used objects.
1476 Also, this means the input must be UTF-8, so fall back to the
1477 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001478 if (u != NULL) {
1479
Benjamin Peterson29060642009-01-31 22:14:21 +00001480 /* Optimization for empty strings */
1481 if (size == 0 && unicode_empty != NULL) {
1482 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001483 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001484 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001485
1486 /* Single characters are shared when using this constructor.
1487 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001488 if (size == 1 && Py_CHARMASK(*u) < 128)
1489 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001490
1491 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001492 }
1493
Walter Dörwald55507312007-05-18 13:12:10 +00001494 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001495 if (!unicode)
1496 return NULL;
1497
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001498 return (PyObject *)unicode;
1499}
1500
Alexander Belopolsky40018472011-02-26 01:02:56 +00001501PyObject *
1502PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001503{
1504 size_t size = strlen(u);
1505 if (size > PY_SSIZE_T_MAX) {
1506 PyErr_SetString(PyExc_OverflowError, "input too long");
1507 return NULL;
1508 }
1509
1510 return PyUnicode_FromStringAndSize(u, size);
1511}
1512
Victor Stinnere57b1c02011-09-28 22:20:48 +02001513static PyObject*
Victor Stinner702c7342011-10-05 13:50:52 +02001514unicode_fromascii(const unsigned char* u, Py_ssize_t size)
1515{
1516 PyObject *res = PyUnicode_New(size, 127);
1517 if (!res)
1518 return NULL;
1519 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1520 return res;
1521}
1522
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001523static Py_UCS4
1524kind_maxchar_limit(unsigned int kind)
1525{
1526 switch(kind) {
1527 case PyUnicode_1BYTE_KIND:
1528 return 0x80;
1529 case PyUnicode_2BYTE_KIND:
1530 return 0x100;
1531 case PyUnicode_4BYTE_KIND:
1532 return 0x10000;
1533 default:
1534 assert(0 && "invalid kind");
1535 return 0x10ffff;
1536 }
1537}
1538
Victor Stinner702c7342011-10-05 13:50:52 +02001539static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001540_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001541{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001542 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001543 unsigned char max_char = 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001544 Py_ssize_t i;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001545
1546 assert(size >= 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001547 for (i = 0; i < size; i++) {
1548 if (u[i] & 0x80) {
Victor Stinnerb9275c12011-10-05 14:01:42 +02001549 max_char = 255;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001550 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001551 }
1552 }
Victor Stinnerb9275c12011-10-05 14:01:42 +02001553 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001554 if (!res)
1555 return NULL;
1556 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1557 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001558}
1559
Victor Stinnere57b1c02011-09-28 22:20:48 +02001560static PyObject*
1561_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001562{
1563 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001564 Py_UCS2 max_char = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001565 Py_ssize_t i;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001566
1567 assert(size >= 0);
1568 for (i = 0; i < size; i++) {
1569 if (u[i] > max_char) {
1570 max_char = u[i];
1571 if (max_char >= 256)
1572 break;
1573 }
1574 }
1575 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001576 if (!res)
1577 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001578 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001579 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1580 else
1581 for (i = 0; i < size; i++)
1582 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1583 return res;
1584}
1585
Victor Stinnere57b1c02011-09-28 22:20:48 +02001586static PyObject*
1587_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001588{
1589 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001590 Py_UCS4 max_char = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001591 Py_ssize_t i;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001592
1593 assert(size >= 0);
1594 for (i = 0; i < size; i++) {
1595 if (u[i] > max_char) {
1596 max_char = u[i];
1597 if (max_char >= 0x10000)
1598 break;
1599 }
1600 }
1601 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001602 if (!res)
1603 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001604 if (max_char >= 0x10000)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001605 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1606 else {
1607 int kind = PyUnicode_KIND(res);
1608 void *data = PyUnicode_DATA(res);
1609 for (i = 0; i < size; i++)
1610 PyUnicode_WRITE(kind, data, i, u[i]);
1611 }
1612 return res;
1613}
1614
1615PyObject*
1616PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1617{
1618 switch(kind) {
1619 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001620 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001621 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001622 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001623 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001624 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001625 default:
1626 assert(0 && "invalid kind");
1627 PyErr_SetString(PyExc_SystemError, "invalid kind");
1628 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001629 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001630}
1631
Victor Stinner034f6cf2011-09-30 02:26:44 +02001632PyObject*
1633PyUnicode_Copy(PyObject *unicode)
1634{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001635 Py_ssize_t size;
1636 PyObject *copy;
1637 void *data;
1638
Victor Stinner034f6cf2011-09-30 02:26:44 +02001639 if (!PyUnicode_Check(unicode)) {
1640 PyErr_BadInternalCall();
1641 return NULL;
1642 }
1643 if (PyUnicode_READY(unicode))
1644 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001645
1646 size = PyUnicode_GET_LENGTH(unicode);
1647 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1648 if (!copy)
1649 return NULL;
1650 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1651
1652 data = PyUnicode_DATA(unicode);
1653 switch (PyUnicode_KIND(unicode))
1654 {
1655 case PyUnicode_1BYTE_KIND:
1656 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1657 break;
1658 case PyUnicode_2BYTE_KIND:
1659 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1660 break;
1661 case PyUnicode_4BYTE_KIND:
1662 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1663 break;
1664 default:
1665 assert(0);
1666 break;
1667 }
1668 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001669}
1670
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001671
Victor Stinnerbc603d12011-10-02 01:00:40 +02001672/* Widen Unicode objects to larger buffers. Don't write terminating null
1673 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001674
1675void*
1676_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1677{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001678 Py_ssize_t len;
1679 void *result;
1680 unsigned int skind;
1681
1682 if (PyUnicode_READY(s))
1683 return NULL;
1684
1685 len = PyUnicode_GET_LENGTH(s);
1686 skind = PyUnicode_KIND(s);
1687 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02001688 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001689 return NULL;
1690 }
1691 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001692 case PyUnicode_2BYTE_KIND:
1693 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1694 if (!result)
1695 return PyErr_NoMemory();
1696 assert(skind == PyUnicode_1BYTE_KIND);
1697 _PyUnicode_CONVERT_BYTES(
1698 Py_UCS1, Py_UCS2,
1699 PyUnicode_1BYTE_DATA(s),
1700 PyUnicode_1BYTE_DATA(s) + len,
1701 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001702 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001703 case PyUnicode_4BYTE_KIND:
1704 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1705 if (!result)
1706 return PyErr_NoMemory();
1707 if (skind == PyUnicode_2BYTE_KIND) {
1708 _PyUnicode_CONVERT_BYTES(
1709 Py_UCS2, Py_UCS4,
1710 PyUnicode_2BYTE_DATA(s),
1711 PyUnicode_2BYTE_DATA(s) + len,
1712 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001713 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001714 else {
1715 assert(skind == PyUnicode_1BYTE_KIND);
1716 _PyUnicode_CONVERT_BYTES(
1717 Py_UCS1, Py_UCS4,
1718 PyUnicode_1BYTE_DATA(s),
1719 PyUnicode_1BYTE_DATA(s) + len,
1720 result);
1721 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001722 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001723 default:
1724 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001725 }
Victor Stinner01698042011-10-04 00:04:26 +02001726 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001727 return NULL;
1728}
1729
1730static Py_UCS4*
1731as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1732 int copy_null)
1733{
1734 int kind;
1735 void *data;
1736 Py_ssize_t len, targetlen;
1737 if (PyUnicode_READY(string) == -1)
1738 return NULL;
1739 kind = PyUnicode_KIND(string);
1740 data = PyUnicode_DATA(string);
1741 len = PyUnicode_GET_LENGTH(string);
1742 targetlen = len;
1743 if (copy_null)
1744 targetlen++;
1745 if (!target) {
1746 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1747 PyErr_NoMemory();
1748 return NULL;
1749 }
1750 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1751 if (!target) {
1752 PyErr_NoMemory();
1753 return NULL;
1754 }
1755 }
1756 else {
1757 if (targetsize < targetlen) {
1758 PyErr_Format(PyExc_SystemError,
1759 "string is longer than the buffer");
1760 if (copy_null && 0 < targetsize)
1761 target[0] = 0;
1762 return NULL;
1763 }
1764 }
1765 if (kind != PyUnicode_4BYTE_KIND) {
1766 Py_ssize_t i;
1767 for (i = 0; i < len; i++)
1768 target[i] = PyUnicode_READ(kind, data, i);
1769 }
1770 else
1771 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1772 if (copy_null)
1773 target[len] = 0;
1774 return target;
1775}
1776
1777Py_UCS4*
1778PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1779 int copy_null)
1780{
1781 if (target == NULL || targetsize < 1) {
1782 PyErr_BadInternalCall();
1783 return NULL;
1784 }
1785 return as_ucs4(string, target, targetsize, copy_null);
1786}
1787
1788Py_UCS4*
1789PyUnicode_AsUCS4Copy(PyObject *string)
1790{
1791 return as_ucs4(string, NULL, 0, 1);
1792}
1793
1794#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00001795
Alexander Belopolsky40018472011-02-26 01:02:56 +00001796PyObject *
1797PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001798{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001799 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +00001800 if (size == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001801 return PyUnicode_New(0, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00001802 PyErr_BadInternalCall();
1803 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001804 }
1805
Martin v. Löwis790465f2008-04-05 20:41:37 +00001806 if (size == -1) {
1807 size = wcslen(w);
1808 }
1809
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001810 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001811}
1812
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001813#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001814
Walter Dörwald346737f2007-05-31 10:44:43 +00001815static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001816makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1817 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001818{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001819 *fmt++ = '%';
1820 if (width) {
1821 if (zeropad)
1822 *fmt++ = '0';
1823 fmt += sprintf(fmt, "%d", width);
1824 }
1825 if (precision)
1826 fmt += sprintf(fmt, ".%d", precision);
1827 if (longflag)
1828 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001829 else if (longlongflag) {
1830 /* longlongflag should only ever be nonzero on machines with
1831 HAVE_LONG_LONG defined */
1832#ifdef HAVE_LONG_LONG
1833 char *f = PY_FORMAT_LONG_LONG;
1834 while (*f)
1835 *fmt++ = *f++;
1836#else
1837 /* we shouldn't ever get here */
1838 assert(0);
1839 *fmt++ = 'l';
1840#endif
1841 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001842 else if (size_tflag) {
1843 char *f = PY_FORMAT_SIZE_T;
1844 while (*f)
1845 *fmt++ = *f++;
1846 }
1847 *fmt++ = c;
1848 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001849}
1850
/* Helper for PyUnicode_FromFormatV(): parse the part of a conversion
   specification that follows the '%'.

   `f` must point at the '%'.  The decimal width, the optional
   ".precision" and the length modifiers "l", "ll" (only with
   HAVE_LONG_LONG) and "z" are parsed; a modifier is consumed only when
   it is directly followed by 'd', 'u' or 'i'.  Each result is stored
   through the matching pointer unless that pointer is NULL.

   Returns the position reached after parsing.  The position is moved
   back by one character when the precision is directly followed by '%'
   and when the end of the string has been reached. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    f++;
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        for (f++; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;

    switch (*f) {
    case 'l':
        /* Handle %ld, %lu, %li and, when available, %lld, %llu, %lli. */
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            f++;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        /* handle the size_t flag. */
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            size_tflag = 1;
            f++;
        }
        break;
    default:
        break;
    }

    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
1915
/* Upper bound on the number of characters produced by %ld: 21 is
   enough for a 64-bit integer written in decimal together with an
   optional sign. */
#define MAX_LONG_CHARS 21
/* Upper bound on the number of characters produced by %lld: at most
   ceil(log10(256)*SIZEOF_LONG_LONG) digits are needed, plus 1 for the
   sign.  53/22 is used as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
1923
Walter Dörwaldd2034312007-05-18 16:29:38 +00001924PyObject *
1925PyUnicode_FromFormatV(const char *format, va_list vargs)
1926{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001927 va_list count;
1928 Py_ssize_t callcount = 0;
1929 PyObject **callresults = NULL;
1930 PyObject **callresult = NULL;
1931 Py_ssize_t n = 0;
1932 int width = 0;
1933 int precision = 0;
1934 int zeropad;
1935 const char* f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001936 PyUnicodeObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001937 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001938 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001939 Py_UCS4 maxchar = 127; /* result is ASCII by default */
1940 Py_UCS4 argmaxchar;
1941 Py_ssize_t numbersize = 0;
1942 char *numberresults = NULL;
1943 char *numberresult = NULL;
1944 Py_ssize_t i;
1945 int kind;
1946 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001947
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001948 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001949 /* step 1: count the number of %S/%R/%A/%s format specifications
1950 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
1951 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001952 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02001953 * also estimate a upper bound for all the number formats in the string,
1954 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001955 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00001956 for (f = format; *f; f++) {
1957 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001958 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001959 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1960 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1961 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1962 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001963
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001964 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001965#ifdef HAVE_LONG_LONG
1966 if (longlongflag) {
1967 if (width < MAX_LONG_LONG_CHARS)
1968 width = MAX_LONG_LONG_CHARS;
1969 }
1970 else
1971#endif
1972 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
1973 including sign. Decimal takes the most space. This
1974 isn't enough for octal. If a width is specified we
1975 need more (which we allocate later). */
1976 if (width < MAX_LONG_CHARS)
1977 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001978
1979 /* account for the size + '\0' to separate numbers
1980 inside of the numberresults buffer */
1981 numbersize += (width + 1);
1982 }
1983 }
1984 else if ((unsigned char)*f > 127) {
1985 PyErr_Format(PyExc_ValueError,
1986 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1987 "string, got a non-ASCII byte: 0x%02x",
1988 (unsigned char)*f);
1989 return NULL;
1990 }
1991 }
1992 /* step 2: allocate memory for the results of
1993 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
1994 if (callcount) {
1995 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
1996 if (!callresults) {
1997 PyErr_NoMemory();
1998 return NULL;
1999 }
2000 callresult = callresults;
2001 }
2002 /* step 2.5: allocate memory for the results of formating numbers */
2003 if (numbersize) {
2004 numberresults = PyObject_Malloc(numbersize);
2005 if (!numberresults) {
2006 PyErr_NoMemory();
2007 goto fail;
2008 }
2009 numberresult = numberresults;
2010 }
2011
2012 /* step 3: format numbers and figure out how large a buffer we need */
2013 for (f = format; *f; f++) {
2014 if (*f == '%') {
2015 const char* p;
2016 int longflag;
2017 int longlongflag;
2018 int size_tflag;
2019 int numprinted;
2020
2021 p = f;
2022 zeropad = (f[1] == '0');
2023 f = parse_format_flags(f, &width, &precision,
2024 &longflag, &longlongflag, &size_tflag);
2025 switch (*f) {
2026 case 'c':
2027 {
2028 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002029 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002030 n++;
2031 break;
2032 }
2033 case '%':
2034 n++;
2035 break;
2036 case 'i':
2037 case 'd':
2038 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2039 width, precision, *f);
2040 if (longflag)
2041 numprinted = sprintf(numberresult, fmt,
2042 va_arg(count, long));
2043#ifdef HAVE_LONG_LONG
2044 else if (longlongflag)
2045 numprinted = sprintf(numberresult, fmt,
2046 va_arg(count, PY_LONG_LONG));
2047#endif
2048 else if (size_tflag)
2049 numprinted = sprintf(numberresult, fmt,
2050 va_arg(count, Py_ssize_t));
2051 else
2052 numprinted = sprintf(numberresult, fmt,
2053 va_arg(count, int));
2054 n += numprinted;
2055 /* advance by +1 to skip over the '\0' */
2056 numberresult += (numprinted + 1);
2057 assert(*(numberresult - 1) == '\0');
2058 assert(*(numberresult - 2) != '\0');
2059 assert(numprinted >= 0);
2060 assert(numberresult <= numberresults + numbersize);
2061 break;
2062 case 'u':
2063 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2064 width, precision, 'u');
2065 if (longflag)
2066 numprinted = sprintf(numberresult, fmt,
2067 va_arg(count, unsigned long));
2068#ifdef HAVE_LONG_LONG
2069 else if (longlongflag)
2070 numprinted = sprintf(numberresult, fmt,
2071 va_arg(count, unsigned PY_LONG_LONG));
2072#endif
2073 else if (size_tflag)
2074 numprinted = sprintf(numberresult, fmt,
2075 va_arg(count, size_t));
2076 else
2077 numprinted = sprintf(numberresult, fmt,
2078 va_arg(count, unsigned int));
2079 n += numprinted;
2080 numberresult += (numprinted + 1);
2081 assert(*(numberresult - 1) == '\0');
2082 assert(*(numberresult - 2) != '\0');
2083 assert(numprinted >= 0);
2084 assert(numberresult <= numberresults + numbersize);
2085 break;
2086 case 'x':
2087 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2088 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2089 n += numprinted;
2090 numberresult += (numprinted + 1);
2091 assert(*(numberresult - 1) == '\0');
2092 assert(*(numberresult - 2) != '\0');
2093 assert(numprinted >= 0);
2094 assert(numberresult <= numberresults + numbersize);
2095 break;
2096 case 'p':
2097 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2098 /* %p is ill-defined: ensure leading 0x. */
2099 if (numberresult[1] == 'X')
2100 numberresult[1] = 'x';
2101 else if (numberresult[1] != 'x') {
2102 memmove(numberresult + 2, numberresult,
2103 strlen(numberresult) + 1);
2104 numberresult[0] = '0';
2105 numberresult[1] = 'x';
2106 numprinted += 2;
2107 }
2108 n += numprinted;
2109 numberresult += (numprinted + 1);
2110 assert(*(numberresult - 1) == '\0');
2111 assert(*(numberresult - 2) != '\0');
2112 assert(numprinted >= 0);
2113 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002114 break;
2115 case 's':
2116 {
2117 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002118 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002119 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2120 if (!str)
2121 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002122 /* since PyUnicode_DecodeUTF8 returns already flexible
2123 unicode objects, there is no need to call ready on them */
2124 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002125 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002126 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002127 /* Remember the str and switch to the next slot */
2128 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002129 break;
2130 }
2131 case 'U':
2132 {
2133 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002134 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002135 if (PyUnicode_READY(obj) == -1)
2136 goto fail;
2137 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002138 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002139 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002140 break;
2141 }
2142 case 'V':
2143 {
2144 PyObject *obj = va_arg(count, PyObject *);
2145 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002146 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002147 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002148 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002149 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002150 if (PyUnicode_READY(obj) == -1)
2151 goto fail;
2152 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002153 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002154 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002155 *callresult++ = NULL;
2156 }
2157 else {
2158 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2159 if (!str_obj)
2160 goto fail;
Victor Stinnere1335c72011-10-04 20:53:03 +02002161 if (PyUnicode_READY(str_obj)) {
2162 Py_DECREF(str_obj);
2163 goto fail;
2164 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002165 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002166 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002167 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002168 *callresult++ = str_obj;
2169 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002170 break;
2171 }
2172 case 'S':
2173 {
2174 PyObject *obj = va_arg(count, PyObject *);
2175 PyObject *str;
2176 assert(obj);
2177 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002178 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002179 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002180 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002181 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002182 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002183 /* Remember the str and switch to the next slot */
2184 *callresult++ = str;
2185 break;
2186 }
2187 case 'R':
2188 {
2189 PyObject *obj = va_arg(count, PyObject *);
2190 PyObject *repr;
2191 assert(obj);
2192 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002193 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002194 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002195 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002196 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002197 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002198 /* Remember the repr and switch to the next slot */
2199 *callresult++ = repr;
2200 break;
2201 }
2202 case 'A':
2203 {
2204 PyObject *obj = va_arg(count, PyObject *);
2205 PyObject *ascii;
2206 assert(obj);
2207 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002208 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002209 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002210 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002211 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002212 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002213 /* Remember the repr and switch to the next slot */
2214 *callresult++ = ascii;
2215 break;
2216 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002217 default:
2218 /* if we stumble upon an unknown
2219 formatting code, copy the rest of
2220 the format string to the output
2221 string. (we cannot just skip the
2222 code, since there's no way to know
2223 what's in the argument list) */
2224 n += strlen(p);
2225 goto expand;
2226 }
2227 } else
2228 n++;
2229 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002230 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002231 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002232 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002233 we don't have to resize the string.
2234 There can be no errors beyond this point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002235 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002236 if (!string)
2237 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002238 kind = PyUnicode_KIND(string);
2239 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002240 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002241 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002242
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002243 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002244 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002245 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002246
2247 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002248 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2249 /* checking for == because the last argument could be a empty
2250 string, which causes i to point to end, the assert at the end of
2251 the loop */
2252 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002253
Benjamin Peterson14339b62009-01-31 16:36:08 +00002254 switch (*f) {
2255 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002256 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002257 const int ordinal = va_arg(vargs, int);
2258 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002259 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002260 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002261 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002262 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002263 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002264 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002265 case 'p':
2266 /* unused, since we already have the result */
2267 if (*f == 'p')
2268 (void) va_arg(vargs, void *);
2269 else
2270 (void) va_arg(vargs, int);
2271 /* extract the result from numberresults and append. */
2272 for (; *numberresult; ++i, ++numberresult)
2273 PyUnicode_WRITE(kind, data, i, *numberresult);
2274 /* skip over the separating '\0' */
2275 assert(*numberresult == '\0');
2276 numberresult++;
2277 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002278 break;
2279 case 's':
2280 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002281 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002282 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002283 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002284 size = PyUnicode_GET_LENGTH(*callresult);
2285 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002286 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2287 *callresult, 0,
2288 size) < 0)
2289 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002290 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002291 /* We're done with the unicode()/repr() => forget it */
2292 Py_DECREF(*callresult);
2293 /* switch to next unicode()/repr() result */
2294 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002295 break;
2296 }
2297 case 'U':
2298 {
2299 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002300 Py_ssize_t size;
2301 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2302 size = PyUnicode_GET_LENGTH(obj);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002303 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2304 obj, 0,
2305 size) < 0)
2306 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002307 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002308 break;
2309 }
2310 case 'V':
2311 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002312 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002313 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002314 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002315 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002316 size = PyUnicode_GET_LENGTH(obj);
2317 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002318 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2319 obj, 0,
2320 size) < 0)
2321 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002322 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002323 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002324 size = PyUnicode_GET_LENGTH(*callresult);
2325 assert(PyUnicode_KIND(*callresult) <=
2326 PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002327 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2328 *callresult,
2329 0, size) < 0)
2330 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002331 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002332 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002333 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002334 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002335 break;
2336 }
2337 case 'S':
2338 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002339 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002340 {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002341 /* unused, since we already have the result */
2342 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002343 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002344 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2345 *callresult, 0,
2346 PyUnicode_GET_LENGTH(*callresult)) < 0)
2347 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002348 i += PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002349 /* We're done with the unicode()/repr() => forget it */
2350 Py_DECREF(*callresult);
2351 /* switch to next unicode()/repr() result */
2352 ++callresult;
2353 break;
2354 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002355 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002356 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002357 break;
2358 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002359 for (; *p; ++p, ++i)
2360 PyUnicode_WRITE(kind, data, i, *p);
2361 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002362 goto end;
2363 }
Victor Stinner1205f272010-09-11 00:54:47 +00002364 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002365 else {
2366 assert(i < PyUnicode_GET_LENGTH(string));
2367 PyUnicode_WRITE(kind, data, i++, *f);
2368 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002369 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002370 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002371
Benjamin Peterson29060642009-01-31 22:14:21 +00002372 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002373 if (callresults)
2374 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002375 if (numberresults)
2376 PyObject_Free(numberresults);
2377 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002378 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002379 if (callresults) {
2380 PyObject **callresult2 = callresults;
2381 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002382 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002383 ++callresult2;
2384 }
2385 PyObject_Free(callresults);
2386 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002387 if (numberresults)
2388 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002389 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002390}
2391
Walter Dörwaldd2034312007-05-18 16:29:38 +00002392PyObject *
2393PyUnicode_FromFormat(const char *format, ...)
2394{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002395 PyObject* ret;
2396 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002397
2398#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002399 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002400#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002401 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002402#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002403 ret = PyUnicode_FromFormatV(format, vargs);
2404 va_end(vargs);
2405 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002406}
2407
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002408#ifdef HAVE_WCHAR_H
2409
Victor Stinner5593d8a2010-10-02 11:11:27 +00002410/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2411 convert a Unicode object to a wide character string.
2412
Victor Stinnerd88d9832011-09-06 02:00:05 +02002413 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002414 character) required to convert the unicode object. Ignore size argument.
2415
Victor Stinnerd88d9832011-09-06 02:00:05 +02002416 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002417 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002418 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002419static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00002420unicode_aswidechar(PyUnicodeObject *unicode,
2421 wchar_t *w,
2422 Py_ssize_t size)
2423{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002424 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002425 const wchar_t *wstr;
2426
2427 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2428 if (wstr == NULL)
2429 return -1;
2430
Victor Stinner5593d8a2010-10-02 11:11:27 +00002431 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002432 if (size > res)
2433 size = res + 1;
2434 else
2435 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002436 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002437 return res;
2438 }
2439 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002440 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002441}
2442
2443Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002444PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002445 wchar_t *w,
2446 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002447{
2448 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002449 PyErr_BadInternalCall();
2450 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002451 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002452 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002453}
2454
Victor Stinner137c34c2010-09-29 10:25:54 +00002455wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002456PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002457 Py_ssize_t *size)
2458{
2459 wchar_t* buffer;
2460 Py_ssize_t buflen;
2461
2462 if (unicode == NULL) {
2463 PyErr_BadInternalCall();
2464 return NULL;
2465 }
2466
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002467 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002468 if (buflen == -1)
2469 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002470 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002471 PyErr_NoMemory();
2472 return NULL;
2473 }
2474
Victor Stinner137c34c2010-09-29 10:25:54 +00002475 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2476 if (buffer == NULL) {
2477 PyErr_NoMemory();
2478 return NULL;
2479 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002480 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002481 if (buflen == -1)
2482 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002483 if (size != NULL)
2484 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002485 return buffer;
2486}
2487
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002488#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002489
Alexander Belopolsky40018472011-02-26 01:02:56 +00002490PyObject *
2491PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002492{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002493 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002494 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002495 PyErr_SetString(PyExc_ValueError,
2496 "chr() arg not in range(0x110000)");
2497 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002498 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002499
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002500 if (ordinal < 256)
2501 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002502
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002503 v = PyUnicode_New(1, ordinal);
2504 if (v == NULL)
2505 return NULL;
2506 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2507 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002508}
2509
Alexander Belopolsky40018472011-02-26 01:02:56 +00002510PyObject *
2511PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002512{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002513 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002514 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002515 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002516 if (PyUnicode_READY(obj))
2517 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002518 Py_INCREF(obj);
2519 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002520 }
2521 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002522 /* For a Unicode subtype that's not a Unicode object,
2523 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002524 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002525 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002526 PyErr_Format(PyExc_TypeError,
2527 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002528 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002529 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002530}
2531
Alexander Belopolsky40018472011-02-26 01:02:56 +00002532PyObject *
2533PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002534 const char *encoding,
2535 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002536{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002537 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002538 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002539
Guido van Rossumd57fd912000-03-10 22:53:23 +00002540 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002541 PyErr_BadInternalCall();
2542 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002543 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002544
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002545 /* Decoding bytes objects is the most common case and should be fast */
2546 if (PyBytes_Check(obj)) {
2547 if (PyBytes_GET_SIZE(obj) == 0) {
2548 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002549 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002550 }
2551 else {
2552 v = PyUnicode_Decode(
2553 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2554 encoding, errors);
2555 }
2556 return v;
2557 }
2558
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002559 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002560 PyErr_SetString(PyExc_TypeError,
2561 "decoding str is not supported");
2562 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002563 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002564
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002565 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2566 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2567 PyErr_Format(PyExc_TypeError,
2568 "coercing to str: need bytes, bytearray "
2569 "or buffer-like object, %.80s found",
2570 Py_TYPE(obj)->tp_name);
2571 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002572 }
Tim Petersced69f82003-09-16 20:30:58 +00002573
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002574 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002575 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002576 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002577 }
Tim Petersced69f82003-09-16 20:30:58 +00002578 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002579 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002580
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002581 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002582 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002583}
2584
/* Write a normalized copy of `encoding` into `lower`: upper case letters
   become lower case and every '_' becomes '-', so that spellings such as
   UTF_8 match the shortcut names.

   `lower` has room for `lower_len` chars, terminating null included.
   Returns 1 on success, 0 when `encoding` is longer than lower_len-1
   characters (in which case `lower` is left without a terminator). */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* Last slot of the output buffer, reserved for the terminator. */
    char *const limit = lower + (lower_len - 1);

    for (src = encoding; *src != '\0'; src++) {
        char ch = *src;

        if (dst == limit)
            return 0;

        if (ch == '_')
            ch = '-';
        else if (Py_ISUPPER(ch))
            ch = Py_TOLOWER(ch);
        *dst++ = ch;
    }
    *dst = '\0';
    return 1;
}
2617
Alexander Belopolsky40018472011-02-26 01:02:56 +00002618PyObject *
2619PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002620 Py_ssize_t size,
2621 const char *encoding,
2622 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002623{
2624 PyObject *buffer = NULL, *unicode;
2625 Py_buffer info;
2626 char lower[11]; /* Enough for any encoding shortcut */
2627
2628 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002629 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002630
2631 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002632 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002633 if ((strcmp(lower, "utf-8") == 0) ||
2634 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002635 return PyUnicode_DecodeUTF8(s, size, errors);
2636 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002637 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002638 (strcmp(lower, "iso-8859-1") == 0))
2639 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002640#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002641 else if (strcmp(lower, "mbcs") == 0)
2642 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002643#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002644 else if (strcmp(lower, "ascii") == 0)
2645 return PyUnicode_DecodeASCII(s, size, errors);
2646 else if (strcmp(lower, "utf-16") == 0)
2647 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2648 else if (strcmp(lower, "utf-32") == 0)
2649 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2650 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002651
2652 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002653 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002654 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002655 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002656 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002657 if (buffer == NULL)
2658 goto onError;
2659 unicode = PyCodec_Decode(buffer, encoding, errors);
2660 if (unicode == NULL)
2661 goto onError;
2662 if (!PyUnicode_Check(unicode)) {
2663 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002664 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002665 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002666 Py_DECREF(unicode);
2667 goto onError;
2668 }
2669 Py_DECREF(buffer);
Victor Stinner17efeed2011-10-04 20:05:46 +02002670#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02002671 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002672 Py_DECREF(unicode);
2673 return NULL;
2674 }
Victor Stinner17efeed2011-10-04 20:05:46 +02002675#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002676 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002677
Benjamin Peterson29060642009-01-31 22:14:21 +00002678 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002679 Py_XDECREF(buffer);
2680 return NULL;
2681}
2682
Alexander Belopolsky40018472011-02-26 01:02:56 +00002683PyObject *
2684PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002685 const char *encoding,
2686 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002687{
2688 PyObject *v;
2689
2690 if (!PyUnicode_Check(unicode)) {
2691 PyErr_BadArgument();
2692 goto onError;
2693 }
2694
2695 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002696 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002697
2698 /* Decode via the codec registry */
2699 v = PyCodec_Decode(unicode, encoding, errors);
2700 if (v == NULL)
2701 goto onError;
2702 return v;
2703
Benjamin Peterson29060642009-01-31 22:14:21 +00002704 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002705 return NULL;
2706}
2707
Alexander Belopolsky40018472011-02-26 01:02:56 +00002708PyObject *
2709PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002710 const char *encoding,
2711 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002712{
2713 PyObject *v;
2714
2715 if (!PyUnicode_Check(unicode)) {
2716 PyErr_BadArgument();
2717 goto onError;
2718 }
2719
2720 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002721 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002722
2723 /* Decode via the codec registry */
2724 v = PyCodec_Decode(unicode, encoding, errors);
2725 if (v == NULL)
2726 goto onError;
2727 if (!PyUnicode_Check(v)) {
2728 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002729 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002730 Py_TYPE(v)->tp_name);
2731 Py_DECREF(v);
2732 goto onError;
2733 }
2734 return v;
2735
Benjamin Peterson29060642009-01-31 22:14:21 +00002736 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002737 return NULL;
2738}
2739
Alexander Belopolsky40018472011-02-26 01:02:56 +00002740PyObject *
2741PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002742 Py_ssize_t size,
2743 const char *encoding,
2744 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002745{
2746 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002747
Guido van Rossumd57fd912000-03-10 22:53:23 +00002748 unicode = PyUnicode_FromUnicode(s, size);
2749 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002750 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002751 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2752 Py_DECREF(unicode);
2753 return v;
2754}
2755
Alexander Belopolsky40018472011-02-26 01:02:56 +00002756PyObject *
2757PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002758 const char *encoding,
2759 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002760{
2761 PyObject *v;
2762
2763 if (!PyUnicode_Check(unicode)) {
2764 PyErr_BadArgument();
2765 goto onError;
2766 }
2767
2768 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002769 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002770
2771 /* Encode via the codec registry */
2772 v = PyCodec_Encode(unicode, encoding, errors);
2773 if (v == NULL)
2774 goto onError;
2775 return v;
2776
Benjamin Peterson29060642009-01-31 22:14:21 +00002777 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002778 return NULL;
2779}
2780
Victor Stinnerad158722010-10-27 00:25:46 +00002781PyObject *
2782PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002783{
Victor Stinner99b95382011-07-04 14:23:54 +02002784#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002785 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2786 PyUnicode_GET_SIZE(unicode),
2787 NULL);
2788#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002789 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002790#else
Victor Stinner793b5312011-04-27 00:24:21 +02002791 PyInterpreterState *interp = PyThreadState_GET()->interp;
2792 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2793 cannot use it to encode and decode filenames before it is loaded. Load
2794 the Python codec requires to encode at least its own filename. Use the C
2795 version of the locale codec until the codec registry is initialized and
2796 the Python codec is loaded.
2797
2798 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2799 cannot only rely on it: check also interp->fscodec_initialized for
2800 subinterpreters. */
2801 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002802 return PyUnicode_AsEncodedString(unicode,
2803 Py_FileSystemDefaultEncoding,
2804 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002805 }
2806 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002807 /* locale encoding with surrogateescape */
2808 wchar_t *wchar;
2809 char *bytes;
2810 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002811 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002812
2813 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2814 if (wchar == NULL)
2815 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002816 bytes = _Py_wchar2char(wchar, &error_pos);
2817 if (bytes == NULL) {
2818 if (error_pos != (size_t)-1) {
2819 char *errmsg = strerror(errno);
2820 PyObject *exc = NULL;
2821 if (errmsg == NULL)
2822 errmsg = "Py_wchar2char() failed";
2823 raise_encode_exception(&exc,
2824 "filesystemencoding",
2825 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2826 error_pos, error_pos+1,
2827 errmsg);
2828 Py_XDECREF(exc);
2829 }
2830 else
2831 PyErr_NoMemory();
2832 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002833 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002834 }
2835 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002836
2837 bytes_obj = PyBytes_FromString(bytes);
2838 PyMem_Free(bytes);
2839 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002840 }
Victor Stinnerad158722010-10-27 00:25:46 +00002841#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002842}
2843
Alexander Belopolsky40018472011-02-26 01:02:56 +00002844PyObject *
2845PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002846 const char *encoding,
2847 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002848{
2849 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002850 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002851
Guido van Rossumd57fd912000-03-10 22:53:23 +00002852 if (!PyUnicode_Check(unicode)) {
2853 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002854 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002855 }
Fred Drakee4315f52000-05-09 19:53:39 +00002856
Victor Stinner2f283c22011-03-02 01:21:46 +00002857 if (encoding == NULL) {
2858 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002859 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002860 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002861 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002862 }
Fred Drakee4315f52000-05-09 19:53:39 +00002863
2864 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002865 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002866 if ((strcmp(lower, "utf-8") == 0) ||
2867 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002868 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002869 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002870 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002871 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002872 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002873 }
Victor Stinner37296e82010-06-10 13:36:23 +00002874 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002875 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002876 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002877 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002878#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002879 else if (strcmp(lower, "mbcs") == 0)
2880 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2881 PyUnicode_GET_SIZE(unicode),
2882 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002883#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002884 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002885 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002886 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002887
2888 /* Encode via the codec registry */
2889 v = PyCodec_Encode(unicode, encoding, errors);
2890 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002891 return NULL;
2892
2893 /* The normal path */
2894 if (PyBytes_Check(v))
2895 return v;
2896
2897 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002898 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002899 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002900 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002901
2902 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2903 "encoder %s returned bytearray instead of bytes",
2904 encoding);
2905 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002906 Py_DECREF(v);
2907 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002908 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002909
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002910 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2911 Py_DECREF(v);
2912 return b;
2913 }
2914
2915 PyErr_Format(PyExc_TypeError,
2916 "encoder did not return a bytes object (type=%.400s)",
2917 Py_TYPE(v)->tp_name);
2918 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002919 return NULL;
2920}
2921
Alexander Belopolsky40018472011-02-26 01:02:56 +00002922PyObject *
2923PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002924 const char *encoding,
2925 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002926{
2927 PyObject *v;
2928
2929 if (!PyUnicode_Check(unicode)) {
2930 PyErr_BadArgument();
2931 goto onError;
2932 }
2933
2934 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002935 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002936
2937 /* Encode via the codec registry */
2938 v = PyCodec_Encode(unicode, encoding, errors);
2939 if (v == NULL)
2940 goto onError;
2941 if (!PyUnicode_Check(v)) {
2942 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002943 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002944 Py_TYPE(v)->tp_name);
2945 Py_DECREF(v);
2946 goto onError;
2947 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002948 return v;
Tim Petersced69f82003-09-16 20:30:58 +00002949
Benjamin Peterson29060642009-01-31 22:14:21 +00002950 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002951 return NULL;
2952}
2953
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002954PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00002955PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002956 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00002957 return PyUnicode_DecodeFSDefaultAndSize(s, size);
2958}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002959
Christian Heimes5894ba72007-11-04 11:43:14 +00002960PyObject*
2961PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2962{
Victor Stinner99b95382011-07-04 14:23:54 +02002963#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002964 return PyUnicode_DecodeMBCS(s, size, NULL);
2965#elif defined(__APPLE__)
2966 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
2967#else
Victor Stinner793b5312011-04-27 00:24:21 +02002968 PyInterpreterState *interp = PyThreadState_GET()->interp;
2969 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2970 cannot use it to encode and decode filenames before it is loaded. Load
2971 the Python codec requires to encode at least its own filename. Use the C
2972 version of the locale codec until the codec registry is initialized and
2973 the Python codec is loaded.
2974
2975 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2976 cannot only rely on it: check also interp->fscodec_initialized for
2977 subinterpreters. */
2978 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002979 return PyUnicode_Decode(s, size,
2980 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00002981 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002982 }
2983 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002984 /* locale encoding with surrogateescape */
2985 wchar_t *wchar;
2986 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00002987 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002988
2989 if (s[size] != '\0' || size != strlen(s)) {
2990 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2991 return NULL;
2992 }
2993
Victor Stinner168e1172010-10-16 23:16:16 +00002994 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002995 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00002996 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002997
Victor Stinner168e1172010-10-16 23:16:16 +00002998 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002999 PyMem_Free(wchar);
3000 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003001 }
Victor Stinnerad158722010-10-27 00:25:46 +00003002#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003003}
3004
Martin v. Löwis011e8422009-05-05 04:43:17 +00003005
3006int
3007PyUnicode_FSConverter(PyObject* arg, void* addr)
3008{
3009 PyObject *output = NULL;
3010 Py_ssize_t size;
3011 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003012 if (arg == NULL) {
3013 Py_DECREF(*(PyObject**)addr);
3014 return 1;
3015 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003016 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003017 output = arg;
3018 Py_INCREF(output);
3019 }
3020 else {
3021 arg = PyUnicode_FromObject(arg);
3022 if (!arg)
3023 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003024 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003025 Py_DECREF(arg);
3026 if (!output)
3027 return 0;
3028 if (!PyBytes_Check(output)) {
3029 Py_DECREF(output);
3030 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3031 return 0;
3032 }
3033 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003034 size = PyBytes_GET_SIZE(output);
3035 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003036 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003037 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003038 Py_DECREF(output);
3039 return 0;
3040 }
3041 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003042 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003043}
3044
3045
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003046int
3047PyUnicode_FSDecoder(PyObject* arg, void* addr)
3048{
3049 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003050 if (arg == NULL) {
3051 Py_DECREF(*(PyObject**)addr);
3052 return 1;
3053 }
3054 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003055 if (PyUnicode_READY(arg))
3056 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003057 output = arg;
3058 Py_INCREF(output);
3059 }
3060 else {
3061 arg = PyBytes_FromObject(arg);
3062 if (!arg)
3063 return 0;
3064 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3065 PyBytes_GET_SIZE(arg));
3066 Py_DECREF(arg);
3067 if (!output)
3068 return 0;
3069 if (!PyUnicode_Check(output)) {
3070 Py_DECREF(output);
3071 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3072 return 0;
3073 }
3074 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003075 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3076 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003077 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3078 Py_DECREF(output);
3079 return 0;
3080 }
3081 *(PyObject**)addr = output;
3082 return Py_CLEANUP_SUPPORTED;
3083}
3084
3085
Martin v. Löwis5b222132007-06-10 09:51:05 +00003086char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003087PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003088{
Christian Heimesf3863112007-11-22 07:46:41 +00003089 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003090 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
3091
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003092 if (!PyUnicode_Check(unicode)) {
3093 PyErr_BadArgument();
3094 return NULL;
3095 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003096 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003097 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003098
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003099 if (PyUnicode_UTF8(unicode) == NULL) {
3100 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003101 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3102 if (bytes == NULL)
3103 return NULL;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003104 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3105 if (_PyUnicode_UTF8(u) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003106 Py_DECREF(bytes);
3107 return NULL;
3108 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003109 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
3110 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003111 Py_DECREF(bytes);
3112 }
3113
3114 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003115 *psize = PyUnicode_UTF8_LENGTH(unicode);
3116 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003117}
3118
3119char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003120PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003121{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003122 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3123}
3124
3125#ifdef Py_DEBUG
3126int unicode_as_unicode_calls = 0;
3127#endif
3128
3129
3130Py_UNICODE *
3131PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3132{
3133 PyUnicodeObject *u;
3134 const unsigned char *one_byte;
3135#if SIZEOF_WCHAR_T == 4
3136 const Py_UCS2 *two_bytes;
3137#else
3138 const Py_UCS4 *four_bytes;
3139 const Py_UCS4 *ucs4_end;
3140 Py_ssize_t num_surrogates;
3141#endif
3142 wchar_t *w;
3143 wchar_t *wchar_end;
3144
3145 if (!PyUnicode_Check(unicode)) {
3146 PyErr_BadArgument();
3147 return NULL;
3148 }
3149 u = (PyUnicodeObject*)unicode;
3150 if (_PyUnicode_WSTR(u) == NULL) {
3151 /* Non-ASCII compact unicode object */
3152 assert(_PyUnicode_KIND(u) != 0);
3153 assert(PyUnicode_IS_READY(u));
3154
3155#ifdef Py_DEBUG
3156 ++unicode_as_unicode_calls;
3157#endif
3158
3159 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
3160#if SIZEOF_WCHAR_T == 2
3161 four_bytes = PyUnicode_4BYTE_DATA(u);
3162 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
3163 num_surrogates = 0;
3164
3165 for (; four_bytes < ucs4_end; ++four_bytes) {
3166 if (*four_bytes > 0xFFFF)
3167 ++num_surrogates;
3168 }
3169
3170 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
3171 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
3172 if (!_PyUnicode_WSTR(u)) {
3173 PyErr_NoMemory();
3174 return NULL;
3175 }
3176 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
3177
3178 w = _PyUnicode_WSTR(u);
3179 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
3180 four_bytes = PyUnicode_4BYTE_DATA(u);
3181 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3182 if (*four_bytes > 0xFFFF) {
3183 /* encode surrogate pair in this case */
3184 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3185 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3186 }
3187 else
3188 *w = *four_bytes;
3189
3190 if (w > wchar_end) {
3191 assert(0 && "Miscalculated string end");
3192 }
3193 }
3194 *w = 0;
3195#else
3196 /* sizeof(wchar_t) == 4 */
3197 Py_FatalError("Impossible unicode object state, wstr and str "
3198 "should share memory already.");
3199 return NULL;
3200#endif
3201 }
3202 else {
3203 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3204 (_PyUnicode_LENGTH(u) + 1));
3205 if (!_PyUnicode_WSTR(u)) {
3206 PyErr_NoMemory();
3207 return NULL;
3208 }
3209 if (!PyUnicode_IS_COMPACT_ASCII(u))
3210 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
3211 w = _PyUnicode_WSTR(u);
3212 wchar_end = w + _PyUnicode_LENGTH(u);
3213
3214 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
3215 one_byte = PyUnicode_1BYTE_DATA(u);
3216 for (; w < wchar_end; ++one_byte, ++w)
3217 *w = *one_byte;
3218 /* null-terminate the wstr */
3219 *w = 0;
3220 }
3221 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
3222#if SIZEOF_WCHAR_T == 4
3223 two_bytes = PyUnicode_2BYTE_DATA(u);
3224 for (; w < wchar_end; ++two_bytes, ++w)
3225 *w = *two_bytes;
3226 /* null-terminate the wstr */
3227 *w = 0;
3228#else
3229 /* sizeof(wchar_t) == 2 */
3230 PyObject_FREE(_PyUnicode_WSTR(u));
3231 _PyUnicode_WSTR(u) = NULL;
3232 Py_FatalError("Impossible unicode object state, wstr "
3233 "and str should share memory already.");
3234 return NULL;
3235#endif
3236 }
3237 else {
3238 assert(0 && "This should never happen.");
3239 }
3240 }
3241 }
3242 if (size != NULL)
3243 *size = PyUnicode_WSTR_LENGTH(u);
3244 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003245}
3246
Alexander Belopolsky40018472011-02-26 01:02:56 +00003247Py_UNICODE *
3248PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003249{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003250 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003251}
3252
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003253
Alexander Belopolsky40018472011-02-26 01:02:56 +00003254Py_ssize_t
3255PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003256{
3257 if (!PyUnicode_Check(unicode)) {
3258 PyErr_BadArgument();
3259 goto onError;
3260 }
3261 return PyUnicode_GET_SIZE(unicode);
3262
Benjamin Peterson29060642009-01-31 22:14:21 +00003263 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003264 return -1;
3265}
3266
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003267Py_ssize_t
3268PyUnicode_GetLength(PyObject *unicode)
3269{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003270 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003271 PyErr_BadArgument();
3272 return -1;
3273 }
3274
3275 return PyUnicode_GET_LENGTH(unicode);
3276}
3277
3278Py_UCS4
3279PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3280{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003281 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3282 PyErr_BadArgument();
3283 return (Py_UCS4)-1;
3284 }
3285 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3286 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003287 return (Py_UCS4)-1;
3288 }
3289 return PyUnicode_READ_CHAR(unicode, index);
3290}
3291
3292int
3293PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3294{
3295 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003296 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003297 return -1;
3298 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003299 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3300 PyErr_SetString(PyExc_IndexError, "string index out of range");
3301 return -1;
3302 }
3303 if (_PyUnicode_Dirty(unicode))
3304 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003305 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3306 index, ch);
3307 return 0;
3308}
3309
/* Return the name of the default encoding, which is always "utf-8".  The
   returned string has static storage and must not be freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3315
Victor Stinner554f3f02010-06-16 23:33:54 +00003316/* create or adjust a UnicodeDecodeError */
3317static void
3318make_decode_exception(PyObject **exceptionObject,
3319 const char *encoding,
3320 const char *input, Py_ssize_t length,
3321 Py_ssize_t startpos, Py_ssize_t endpos,
3322 const char *reason)
3323{
3324 if (*exceptionObject == NULL) {
3325 *exceptionObject = PyUnicodeDecodeError_Create(
3326 encoding, input, length, startpos, endpos, reason);
3327 }
3328 else {
3329 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3330 goto onError;
3331 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3332 goto onError;
3333 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3334 goto onError;
3335 }
3336 return;
3337
3338onError:
3339 Py_DECREF(*exceptionObject);
3340 *exceptionObject = NULL;
3341}
3342
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003343/* error handling callback helper:
3344 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003345 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003346 and adjust various state variables.
3347 return 0 on success, -1 on error
3348*/
3349
Alexander Belopolsky40018472011-02-26 01:02:56 +00003350static int
3351unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003352 const char *encoding, const char *reason,
3353 const char **input, const char **inend, Py_ssize_t *startinpos,
3354 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3355 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003356{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003357 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003358
3359 PyObject *restuple = NULL;
3360 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003361 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003362 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003363 Py_ssize_t requiredsize;
3364 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003365 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003366 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003367 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003368 int res = -1;
3369
3370 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003371 *errorHandler = PyCodec_LookupError(errors);
3372 if (*errorHandler == NULL)
3373 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003374 }
3375
Victor Stinner554f3f02010-06-16 23:33:54 +00003376 make_decode_exception(exceptionObject,
3377 encoding,
3378 *input, *inend - *input,
3379 *startinpos, *endinpos,
3380 reason);
3381 if (*exceptionObject == NULL)
3382 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003383
3384 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3385 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003386 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003387 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003388 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003389 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003390 }
3391 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003392 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003393
3394 /* Copy back the bytes variables, which might have been modified by the
3395 callback */
3396 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3397 if (!inputobj)
3398 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003399 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003400 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003401 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003402 *input = PyBytes_AS_STRING(inputobj);
3403 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003404 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003405 /* we can DECREF safely, as the exception has another reference,
3406 so the object won't go away. */
3407 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003408
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003409 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003410 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003411 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003412 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3413 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003414 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003415
3416 /* need more space? (at least enough for what we
3417 have+the replacement+the rest of the string (starting
3418 at the new input position), so we won't have to check space
3419 when there are no errors in the rest of the string) */
3420 repptr = PyUnicode_AS_UNICODE(repunicode);
3421 repsize = PyUnicode_GET_SIZE(repunicode);
3422 requiredsize = *outpos + repsize + insize-newpos;
3423 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003424 if (requiredsize<2*outsize)
3425 requiredsize = 2*outsize;
Victor Stinnerfe226c02011-10-03 03:52:20 +02003426 if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003427 goto onError;
3428 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003429 }
3430 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003431 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003432 Py_UNICODE_COPY(*outptr, repptr, repsize);
3433 *outptr += repsize;
3434 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003435
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003436 /* we made it! */
3437 res = 0;
3438
Benjamin Peterson29060642009-01-31 22:14:21 +00003439 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003440 Py_XDECREF(restuple);
3441 return res;
3442}
3443
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * directO and directWS are parenthesized in the expansion so that an
 * argument such as `a || b` is evaluated as a unit before being combined
 * with the category test. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003524
Alexander Belopolsky40018472011-02-26 01:02:56 +00003525PyObject *
3526PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003527 Py_ssize_t size,
3528 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003529{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003530 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3531}
3532
Antoine Pitrou244651a2009-05-04 18:56:13 +00003533/* The decoder. The only state we preserve is our read position,
3534 * i.e. how many characters we have consumed. So if we end in the
3535 * middle of a shift sequence we have to back off the read position
3536 * and the output to the beginning of the sequence, otherwise we lose
3537 * all the shift state (seen bits, number of bits seen, high
3538 * surrogate). */
3539
Alexander Belopolsky40018472011-02-26 01:02:56 +00003540PyObject *
3541PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003542 Py_ssize_t size,
3543 const char *errors,
3544 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003545{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003546 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003547 Py_ssize_t startinpos;
3548 Py_ssize_t endinpos;
3549 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003550 const char *e;
3551 PyUnicodeObject *unicode;
3552 Py_UNICODE *p;
3553 const char *errmsg = "";
3554 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003555 Py_UNICODE *shiftOutStart;
3556 unsigned int base64bits = 0;
3557 unsigned long base64buffer = 0;
3558 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003559 PyObject *errorHandler = NULL;
3560 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003561
3562 unicode = _PyUnicode_New(size);
3563 if (!unicode)
3564 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003565 if (size == 0) {
3566 if (consumed)
3567 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003568 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003569 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003570
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003571 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003572 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003573 e = s + size;
3574
3575 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003576 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003577 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003578 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003579
Antoine Pitrou244651a2009-05-04 18:56:13 +00003580 if (inShift) { /* in a base-64 section */
3581 if (IS_BASE64(ch)) { /* consume a base-64 character */
3582 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3583 base64bits += 6;
3584 s++;
3585 if (base64bits >= 16) {
3586 /* we have enough bits for a UTF-16 value */
3587 Py_UNICODE outCh = (Py_UNICODE)
3588 (base64buffer >> (base64bits-16));
3589 base64bits -= 16;
3590 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3591 if (surrogate) {
3592 /* expecting a second surrogate */
3593 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3594#ifdef Py_UNICODE_WIDE
3595 *p++ = (((surrogate & 0x3FF)<<10)
3596 | (outCh & 0x3FF)) + 0x10000;
3597#else
3598 *p++ = surrogate;
3599 *p++ = outCh;
3600#endif
3601 surrogate = 0;
3602 }
3603 else {
3604 surrogate = 0;
3605 errmsg = "second surrogate missing";
3606 goto utf7Error;
3607 }
3608 }
3609 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3610 /* first surrogate */
3611 surrogate = outCh;
3612 }
3613 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3614 errmsg = "unexpected second surrogate";
3615 goto utf7Error;
3616 }
3617 else {
3618 *p++ = outCh;
3619 }
3620 }
3621 }
3622 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003623 inShift = 0;
3624 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003625 if (surrogate) {
3626 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003627 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003628 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003629 if (base64bits > 0) { /* left-over bits */
3630 if (base64bits >= 6) {
3631 /* We've seen at least one base-64 character */
3632 errmsg = "partial character in shift sequence";
3633 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003634 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003635 else {
3636 /* Some bits remain; they should be zero */
3637 if (base64buffer != 0) {
3638 errmsg = "non-zero padding bits in shift sequence";
3639 goto utf7Error;
3640 }
3641 }
3642 }
3643 if (ch != '-') {
3644 /* '-' is absorbed; other terminating
3645 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003646 *p++ = ch;
3647 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003648 }
3649 }
3650 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003651 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003652 s++; /* consume '+' */
3653 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003654 s++;
3655 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003656 }
3657 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003658 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003659 shiftOutStart = p;
3660 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003661 }
3662 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003663 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003664 *p++ = ch;
3665 s++;
3666 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003667 else {
3668 startinpos = s-starts;
3669 s++;
3670 errmsg = "unexpected special character";
3671 goto utf7Error;
3672 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003673 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003674utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003675 outpos = p-PyUnicode_AS_UNICODE(unicode);
3676 endinpos = s-starts;
3677 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003678 errors, &errorHandler,
3679 "utf7", errmsg,
3680 &starts, &e, &startinpos, &endinpos, &exc, &s,
3681 &unicode, &outpos, &p))
3682 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003683 }
3684
Antoine Pitrou244651a2009-05-04 18:56:13 +00003685 /* end of string */
3686
3687 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3688 /* if we're in an inconsistent state, that's an error */
3689 if (surrogate ||
3690 (base64bits >= 6) ||
3691 (base64bits > 0 && base64buffer != 0)) {
3692 outpos = p-PyUnicode_AS_UNICODE(unicode);
3693 endinpos = size;
3694 if (unicode_decode_call_errorhandler(
3695 errors, &errorHandler,
3696 "utf7", "unterminated shift sequence",
3697 &starts, &e, &startinpos, &endinpos, &exc, &s,
3698 &unicode, &outpos, &p))
3699 goto onError;
3700 if (s < e)
3701 goto restart;
3702 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003703 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003704
3705 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003706 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003707 if (inShift) {
3708 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003709 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003710 }
3711 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003712 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003713 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003714 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003715
Victor Stinnerfe226c02011-10-03 03:52:20 +02003716 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003717 goto onError;
3718
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003719 Py_XDECREF(errorHandler);
3720 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02003721#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02003722 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003723 Py_DECREF(unicode);
3724 return NULL;
3725 }
Victor Stinner17efeed2011-10-04 20:05:46 +02003726#endif
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003727 return (PyObject *)unicode;
3728
Benjamin Peterson29060642009-01-31 22:14:21 +00003729 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003730 Py_XDECREF(errorHandler);
3731 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003732 Py_DECREF(unicode);
3733 return NULL;
3734}
3735
3736
Alexander Belopolsky40018472011-02-26 01:02:56 +00003737PyObject *
3738PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003739 Py_ssize_t size,
3740 int base64SetO,
3741 int base64WhiteSpace,
3742 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003743{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003744 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003745 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003746 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003747 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003748 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003749 unsigned int base64bits = 0;
3750 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003751 char * out;
3752 char * start;
3753
3754 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003755 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003756
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003757 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003758 return PyErr_NoMemory();
3759
Antoine Pitrou244651a2009-05-04 18:56:13 +00003760 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003761 if (v == NULL)
3762 return NULL;
3763
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003764 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003765 for (;i < size; ++i) {
3766 Py_UNICODE ch = s[i];
3767
Antoine Pitrou244651a2009-05-04 18:56:13 +00003768 if (inShift) {
3769 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3770 /* shifting out */
3771 if (base64bits) { /* output remaining bits */
3772 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3773 base64buffer = 0;
3774 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003775 }
3776 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003777 /* Characters not in the BASE64 set implicitly unshift the sequence
3778 so no '-' is required, except if the character is itself a '-' */
3779 if (IS_BASE64(ch) || ch == '-') {
3780 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003781 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003782 *out++ = (char) ch;
3783 }
3784 else {
3785 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003786 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003787 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003788 else { /* not in a shift sequence */
3789 if (ch == '+') {
3790 *out++ = '+';
3791 *out++ = '-';
3792 }
3793 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3794 *out++ = (char) ch;
3795 }
3796 else {
3797 *out++ = '+';
3798 inShift = 1;
3799 goto encode_char;
3800 }
3801 }
3802 continue;
3803encode_char:
3804#ifdef Py_UNICODE_WIDE
3805 if (ch >= 0x10000) {
3806 /* code first surrogate */
3807 base64bits += 16;
3808 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3809 while (base64bits >= 6) {
3810 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3811 base64bits -= 6;
3812 }
3813 /* prepare second surrogate */
3814 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3815 }
3816#endif
3817 base64bits += 16;
3818 base64buffer = (base64buffer << 16) | ch;
3819 while (base64bits >= 6) {
3820 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3821 base64bits -= 6;
3822 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003823 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003824 if (base64bits)
3825 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3826 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003827 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003828 if (_PyBytes_Resize(&v, out - start) < 0)
3829 return NULL;
3830 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003831}
3832
Antoine Pitrou244651a2009-05-04 18:56:13 +00003833#undef IS_BASE64
3834#undef FROM_BASE64
3835#undef TO_BASE64
3836#undef DECODE_DIRECT
3837#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003838
Guido van Rossumd57fd912000-03-10 22:53:23 +00003839/* --- UTF-8 Codec -------------------------------------------------------- */
3840
/* Read-only lookup table, indexed by the first byte of a UTF-8 sequence. */
static const
char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
3862
Alexander Belopolsky40018472011-02-26 01:02:56 +00003863PyObject *
3864PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003865 Py_ssize_t size,
3866 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003867{
Walter Dörwald69652032004-09-07 20:24:22 +00003868 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3869}
3870
Antoine Pitrouab868312009-01-10 15:40:25 +00003871/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3872#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3873
3874/* Mask to quickly check whether a C 'long' contains a
3875 non-ASCII, UTF8-encoded char. */
3876#if (SIZEOF_LONG == 8)
3877# define ASCII_CHAR_MASK 0x8080808080808080L
3878#elif (SIZEOF_LONG == 4)
3879# define ASCII_CHAR_MASK 0x80808080L
3880#else
3881# error C 'long' size should be either 4 or 8!
3882#endif
3883
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003884/* Scans a UTF-8 string and returns the maximum character to be expected,
3885 the size of the decoded unicode string and if any major errors were
3886 encountered.
3887
3888 This function does check basic UTF-8 sanity, it does however NOT CHECK
3889 if the string contains surrogates, and if all continuation bytes are
3890 within the correct ranges, these checks are performed in
3891 PyUnicode_DecodeUTF8Stateful.
3892
3893 If it sets has_errors to 1, it means the value of unicode_size and max_char
3894 will be bogus and you should not rely on useful information in them.
3895 */
3896static Py_UCS4
3897utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3898 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3899 int *has_errors)
3900{
3901 Py_ssize_t n;
3902 Py_ssize_t char_count = 0;
3903 Py_UCS4 max_char = 127, new_max;
3904 Py_UCS4 upper_bound;
3905 const unsigned char *p = (const unsigned char *)s;
3906 const unsigned char *end = p + string_size;
3907 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3908 int err = 0;
3909
3910 for (; p < end && !err; ++p, ++char_count) {
3911 /* Only check value if it's not a ASCII char... */
3912 if (*p < 0x80) {
3913 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3914 an explanation. */
3915 if (!((size_t) p & LONG_PTR_MASK)) {
3916 /* Help register allocation */
3917 register const unsigned char *_p = p;
3918 while (_p < aligned_end) {
3919 unsigned long value = *(unsigned long *) _p;
3920 if (value & ASCII_CHAR_MASK)
3921 break;
3922 _p += SIZEOF_LONG;
3923 char_count += SIZEOF_LONG;
3924 }
3925 p = _p;
3926 if (p == end)
3927 break;
3928 }
3929 }
3930 if (*p >= 0x80) {
3931 n = utf8_code_length[*p];
3932 new_max = max_char;
3933 switch (n) {
3934 /* invalid start byte */
3935 case 0:
3936 err = 1;
3937 break;
3938 case 2:
3939 /* Code points between 0x00FF and 0x07FF inclusive.
3940 Approximate the upper bound of the code point,
3941 if this flips over 255 we can be sure it will be more
3942 than 255 and the string will need 2 bytes per code coint,
3943 if it stays under or equal to 255, we can be sure 1 byte
3944 is enough.
3945 ((*p & 0b00011111) << 6) | 0b00111111 */
3946 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3947 if (max_char < upper_bound)
3948 new_max = upper_bound;
3949 /* Ensure we track at least that we left ASCII space. */
3950 if (new_max < 128)
3951 new_max = 128;
3952 break;
3953 case 3:
3954 /* Between 0x0FFF and 0xFFFF inclusive, so values are
3955 always > 255 and <= 65535 and will always need 2 bytes. */
3956 if (max_char < 65535)
3957 new_max = 65535;
3958 break;
3959 case 4:
3960 /* Code point will be above 0xFFFF for sure in this case. */
3961 new_max = 65537;
3962 break;
3963 /* Internal error, this should be caught by the first if */
3964 case 1:
3965 default:
3966 assert(0 && "Impossible case in utf8_max_char_and_size");
3967 err = 1;
3968 }
3969 /* Instead of number of overall bytes for this code point,
Georg Brandl7597add2011-10-05 16:36:47 +02003970 n contains the number of following bytes: */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003971 --n;
3972 /* Check if the follow up chars are all valid continuation bytes */
3973 if (n >= 1) {
3974 const unsigned char *cont;
3975 if ((p + n) >= end) {
3976 if (consumed == 0)
3977 /* incomplete data, non-incremental decoding */
3978 err = 1;
3979 break;
3980 }
3981 for (cont = p + 1; cont < (p + n); ++cont) {
3982 if ((*cont & 0xc0) != 0x80) {
3983 err = 1;
3984 break;
3985 }
3986 }
3987 p += n;
3988 }
3989 else
3990 err = 1;
3991 max_char = new_max;
3992 }
3993 }
3994
3995 if (unicode_size)
3996 *unicode_size = char_count;
3997 if (has_errors)
3998 *has_errors = err;
3999 return max_char;
4000}
4001
/* Store `value` at position `index` of the character buffer `buf`,
   casting both buffer and value to the element type selected by `kind`.
   Like PyUnicode_WRITE, but additionally handles PyUnicode_WCHAR_KIND,
   i.e. the wstr field of the legacy unicode representation.
   `kind` is evaluated once; `index` and `value` are evaluated once, in
   the single branch that is taken. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                     \
    do {                                                                    \
        const int k_ = (kind);                                              \
        switch (k_) {                                                       \
        case PyUnicode_WCHAR_KIND:                                          \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);           \
            break;                                                          \
        case PyUnicode_1BYTE_KIND:                                          \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value);     \
            break;                                                          \
        case PyUnicode_2BYTE_KIND:                                          \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value);                 \
            break;                                                          \
        default:                                                            \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);                 \
            break;                                                          \
        }                                                                   \
    } while (0)
4016
Alexander Belopolsky40018472011-02-26 01:02:56 +00004017PyObject *
4018PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004019 Py_ssize_t size,
4020 const char *errors,
4021 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004022{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004023 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004024 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004025 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004026 Py_ssize_t startinpos;
4027 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00004028 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004029 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004030 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004031 PyObject *errorHandler = NULL;
4032 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004033 Py_UCS4 maxchar = 0;
4034 Py_ssize_t unicode_size;
4035 Py_ssize_t i;
4036 int kind;
4037 void *data;
4038 int has_errors;
4039 Py_UNICODE *error_outptr;
4040#if SIZEOF_WCHAR_T == 2
4041 Py_ssize_t wchar_offset = 0;
4042#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004043
Walter Dörwald69652032004-09-07 20:24:22 +00004044 if (size == 0) {
4045 if (consumed)
4046 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004047 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00004048 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004049 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
4050 consumed, &has_errors);
4051 if (has_errors) {
4052 unicode = _PyUnicode_New(size);
4053 if (!unicode)
4054 return NULL;
4055 kind = PyUnicode_WCHAR_KIND;
4056 data = PyUnicode_AS_UNICODE(unicode);
4057 assert(data != NULL);
4058 }
4059 else {
4060 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
4061 if (!unicode)
4062 return NULL;
4063 /* When the string is ASCII only, just use memcpy and return.
4064 unicode_size may be != size if there is an incomplete UTF-8
4065 sequence at the end of the ASCII block. */
4066 if (maxchar < 128 && size == unicode_size) {
4067 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4068 return (PyObject *)unicode;
4069 }
4070 kind = PyUnicode_KIND(unicode);
4071 data = PyUnicode_DATA(unicode);
4072 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004073 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004074 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004075 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00004076 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004077
4078 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004079 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004080
4081 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004082 /* Fast path for runs of ASCII characters. Given that common UTF-8
4083 input will consist of an overwhelming majority of ASCII
4084 characters, we try to optimize for this case by checking
4085 as many characters as a C 'long' can contain.
4086 First, check if we can do an aligned read, as most CPUs have
4087 a penalty for unaligned reads.
4088 */
4089 if (!((size_t) s & LONG_PTR_MASK)) {
4090 /* Help register allocation */
4091 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004092 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004093 while (_s < aligned_end) {
4094 /* Read a whole long at a time (either 4 or 8 bytes),
4095 and do a fast unrolled copy if it only contains ASCII
4096 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004097 unsigned long value = *(unsigned long *) _s;
4098 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004099 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004100 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
4101 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
4102 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
4103 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004104#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004105 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
4106 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
4107 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
4108 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004109#endif
4110 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004111 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004112 }
4113 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004114 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004115 if (s == e)
4116 break;
4117 ch = (unsigned char)*s;
4118 }
4119 }
4120
4121 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004122 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004123 s++;
4124 continue;
4125 }
4126
4127 n = utf8_code_length[ch];
4128
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004129 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004130 if (consumed)
4131 break;
4132 else {
4133 errmsg = "unexpected end of data";
4134 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004135 endinpos = startinpos+1;
4136 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4137 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004138 goto utf8Error;
4139 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004140 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004141
4142 switch (n) {
4143
4144 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004145 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004146 startinpos = s-starts;
4147 endinpos = startinpos+1;
4148 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004149
4150 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004151 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004152 startinpos = s-starts;
4153 endinpos = startinpos+1;
4154 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004155
4156 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004157 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004158 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004159 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004160 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004161 goto utf8Error;
4162 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004163 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004164 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004165 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004166 break;
4167
4168 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004169 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4170 will result in surrogates in range d800-dfff. Surrogates are
4171 not valid UTF-8 so they are rejected.
4172 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4173 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004174 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004175 (s[2] & 0xc0) != 0x80 ||
4176 ((unsigned char)s[0] == 0xE0 &&
4177 (unsigned char)s[1] < 0xA0) ||
4178 ((unsigned char)s[0] == 0xED &&
4179 (unsigned char)s[1] > 0x9F)) {
4180 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004181 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004182 endinpos = startinpos + 1;
4183
4184 /* if s[1] first two bits are 1 and 0, then the invalid
4185 continuation byte is s[2], so increment endinpos by 1,
4186 if not, s[1] is invalid and endinpos doesn't need to
4187 be incremented. */
4188 if ((s[1] & 0xC0) == 0x80)
4189 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004190 goto utf8Error;
4191 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004192 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004193 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004194 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004195 break;
4196
4197 case 4:
4198 if ((s[1] & 0xc0) != 0x80 ||
4199 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004200 (s[3] & 0xc0) != 0x80 ||
4201 ((unsigned char)s[0] == 0xF0 &&
4202 (unsigned char)s[1] < 0x90) ||
4203 ((unsigned char)s[0] == 0xF4 &&
4204 (unsigned char)s[1] > 0x8F)) {
4205 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004206 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004207 endinpos = startinpos + 1;
4208 if ((s[1] & 0xC0) == 0x80) {
4209 endinpos++;
4210 if ((s[2] & 0xC0) == 0x80)
4211 endinpos++;
4212 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004213 goto utf8Error;
4214 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004215 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004216 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4217 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4218
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004219 /* If the string is flexible or we have native UCS-4, write
4220 directly.. */
4221 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4222 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00004223
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004224 else {
4225 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00004226
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004227 /* translate from 10000..10FFFF to 0..FFFF */
4228 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00004229
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004230 /* high surrogate = top 10 bits added to D800 */
4231 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4232 (Py_UNICODE)(0xD800 + (ch >> 10)));
4233
4234 /* low surrogate = bottom 10 bits added to DC00 */
4235 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4236 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4237 }
4238#if SIZEOF_WCHAR_T == 2
4239 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004240#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004241 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004242 }
4243 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004244 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004245
Benjamin Peterson29060642009-01-31 22:14:21 +00004246 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004247 /* If this is not yet a resizable string, make it one.. */
4248 if (kind != PyUnicode_WCHAR_KIND) {
4249 const Py_UNICODE *u;
4250 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
4251 if (!new_unicode)
4252 goto onError;
4253 u = PyUnicode_AsUnicode((PyObject *)unicode);
4254 if (!u)
4255 goto onError;
4256#if SIZEOF_WCHAR_T == 2
4257 i += wchar_offset;
4258#endif
4259 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4260 Py_DECREF(unicode);
4261 unicode = new_unicode;
4262 kind = 0;
4263 data = PyUnicode_AS_UNICODE(new_unicode);
4264 assert(data != NULL);
4265 }
4266 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00004267 if (unicode_decode_call_errorhandler(
4268 errors, &errorHandler,
4269 "utf8", errmsg,
4270 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004271 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00004272 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004273 /* Update data because unicode_decode_call_errorhandler might have
4274 re-created or resized the unicode object. */
4275 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004276 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004277 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004278 /* Ensure the unicode_size calculation above was correct: */
4279 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4280
Walter Dörwald69652032004-09-07 20:24:22 +00004281 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004282 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004283
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004284 /* Adjust length and ready string when it contained errors and
4285 is of the old resizable kind. */
4286 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004287 if (PyUnicode_Resize((PyObject**)&unicode, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004288 goto onError;
4289 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004290
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004291 Py_XDECREF(errorHandler);
4292 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02004293#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004294 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004295 Py_DECREF(unicode);
4296 return NULL;
4297 }
Victor Stinner17efeed2011-10-04 20:05:46 +02004298#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004299 return (PyObject *)unicode;
4300
Benjamin Peterson29060642009-01-31 22:14:21 +00004301 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004302 Py_XDECREF(errorHandler);
4303 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004304 Py_DECREF(unicode);
4305 return NULL;
4306}
4307
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004308#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00004309
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Decodes the `size` bytes at `s` into a freshly PyMem_Malloc'ed,
   NUL-terminated wchar_t array.  Every byte that does not start a valid
   sequence is emitted as 0xDC00 + byte and decoding resumes at the next
   byte.  Returns NULL if the buffer cannot be allocated. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const unsigned char *in = (const unsigned char *)s;
    const unsigned char *stop;
    wchar_t *buffer, *out;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    buffer = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!buffer)
        return NULL;

    /* Unpack UTF-8 encoded data */
    out = buffer;
    stop = in + size;
    while (in < stop) {
        const unsigned char lead = *in;
        Py_UCS4 ch = lead;
        int seqlen;
        int decoded = 0;   /* set once a complete valid sequence was written */

        if (ch < 0x80) {
            /* plain ASCII */
            *out++ = (wchar_t)ch;
            in++;
            continue;
        }

        seqlen = utf8_code_length[lead];
        /* Only look at the continuation bytes if the whole sequence lies
           inside the input. */
        if (in + seqlen <= stop) {
            switch (seqlen) {
            case 2:
                if ((in[1] & 0xc0) != 0x80)
                    break;
                ch = ((lead & 0x1f) << 6) + (in[1] & 0x3f);
                assert ((ch > 0x007F) && (ch <= 0x07FF));
                *out++ = (wchar_t)ch;
                decoded = 1;
                break;

            case 3:
                /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
                   will result in surrogates in range d800-dfff. Surrogates are
                   not valid UTF-8 so they are rejected.
                   See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
                   (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
                if ((in[1] & 0xc0) != 0x80 || (in[2] & 0xc0) != 0x80)
                    break;
                if (lead == 0xE0 && in[1] < 0xA0)
                    break;
                if (lead == 0xED && in[1] > 0x9F)
                    break;
                ch = ((lead & 0x0f) << 12) + ((in[1] & 0x3f) << 6)
                     + (in[2] & 0x3f);
                assert ((ch > 0x07FF) && (ch <= 0xFFFF));
                *out++ = (wchar_t)ch;
                decoded = 1;
                break;

            case 4:
                if ((in[1] & 0xc0) != 0x80 ||
                    (in[2] & 0xc0) != 0x80 ||
                    (in[3] & 0xc0) != 0x80)
                    break;
                if (lead == 0xF0 && in[1] < 0x90)
                    break;
                if (lead == 0xF4 && in[1] > 0x8F)
                    break;
                ch = ((lead & 0x7) << 18) + ((in[1] & 0x3f) << 12) +
                     ((in[2] & 0x3f) << 6) + (in[3] & 0x3f);
                assert ((ch > 0xFFFF) && (ch <= 0x10ffff));

#if SIZEOF_WCHAR_T == 4
                *out++ = (wchar_t)ch;
#else
                /* compute and append the two surrogates: */

                /* translate from 10000..10FFFF to 0..FFFF */
                ch -= 0x10000;

                /* high surrogate = top 10 bits added to D800 */
                *out++ = (wchar_t)(0xD800 + (ch >> 10));

                /* low surrogate = bottom 10 bits added to DC00 */
                *out++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
#endif
                decoded = 1;
                break;

            default:
                /* lengths 0 and 1: not a valid start byte here */
                break;
            }
        }

        if (decoded) {
            in += seqlen;
        }
        else {
            /* surrogateescape: ch still holds the offending lead byte */
            *out++ = 0xDC00 + ch;
            in++;
        }
    }
    *out = L'\0';
    return buffer;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004424
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004425/* Primary internal function which creates utf8 encoded bytes objects.
4426
4427 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004428 and allocate exactly as much space needed at the end. Else allocate the
4429 maximum possible needed (4 result bytes per Unicode character), and return
4430 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004431*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004432PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004433_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004434{
Tim Peters602f7402002-04-27 18:03:26 +00004435#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004436
Guido van Rossum98297ee2007-11-06 21:34:58 +00004437 Py_ssize_t i; /* index into s of next input byte */
4438 PyObject *result; /* result string object */
4439 char *p; /* next free byte in output buffer */
4440 Py_ssize_t nallocated; /* number of result bytes allocated */
4441 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004442 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004443 PyObject *errorHandler = NULL;
4444 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004445 int kind;
4446 void *data;
4447 Py_ssize_t size;
4448 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4449#if SIZEOF_WCHAR_T == 2
4450 Py_ssize_t wchar_offset = 0;
4451#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004452
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004453 if (!PyUnicode_Check(unicode)) {
4454 PyErr_BadArgument();
4455 return NULL;
4456 }
4457
4458 if (PyUnicode_READY(unicode) == -1)
4459 return NULL;
4460
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004461 if (PyUnicode_UTF8(unicode))
4462 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4463 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004464
4465 kind = PyUnicode_KIND(unicode);
4466 data = PyUnicode_DATA(unicode);
4467 size = PyUnicode_GET_LENGTH(unicode);
4468
Tim Peters602f7402002-04-27 18:03:26 +00004469 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004470
Tim Peters602f7402002-04-27 18:03:26 +00004471 if (size <= MAX_SHORT_UNICHARS) {
4472 /* Write into the stack buffer; nallocated can't overflow.
4473 * At the end, we'll allocate exactly as much heap space as it
4474 * turns out we need.
4475 */
4476 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004477 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004478 p = stackbuf;
4479 }
4480 else {
4481 /* Overallocate on the heap, and give the excess back at the end. */
4482 nallocated = size * 4;
4483 if (nallocated / 4 != size) /* overflow! */
4484 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004485 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004486 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004487 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004488 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004489 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004490
Tim Peters602f7402002-04-27 18:03:26 +00004491 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004492 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004493
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004494 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004495 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004496 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004497
Guido van Rossumd57fd912000-03-10 22:53:23 +00004498 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004499 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004500 *p++ = (char)(0xc0 | (ch >> 6));
4501 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004502 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004503 Py_ssize_t newpos;
4504 PyObject *rep;
4505 Py_ssize_t repsize, k, startpos;
4506 startpos = i-1;
4507#if SIZEOF_WCHAR_T == 2
4508 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004509#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004510 rep = unicode_encode_call_errorhandler(
4511 errors, &errorHandler, "utf-8", "surrogates not allowed",
4512 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4513 &exc, startpos, startpos+1, &newpos);
4514 if (!rep)
4515 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004516
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004517 if (PyBytes_Check(rep))
4518 repsize = PyBytes_GET_SIZE(rep);
4519 else
4520 repsize = PyUnicode_GET_SIZE(rep);
4521
4522 if (repsize > 4) {
4523 Py_ssize_t offset;
4524
4525 if (result == NULL)
4526 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004527 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004528 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004529
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004530 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4531 /* integer overflow */
4532 PyErr_NoMemory();
4533 goto error;
4534 }
4535 nallocated += repsize - 4;
4536 if (result != NULL) {
4537 if (_PyBytes_Resize(&result, nallocated) < 0)
4538 goto error;
4539 } else {
4540 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004541 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004542 goto error;
4543 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4544 }
4545 p = PyBytes_AS_STRING(result) + offset;
4546 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004547
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004548 if (PyBytes_Check(rep)) {
4549 char *prep = PyBytes_AS_STRING(rep);
4550 for(k = repsize; k > 0; k--)
4551 *p++ = *prep++;
4552 } else /* rep is unicode */ {
4553 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4554 Py_UNICODE c;
4555
4556 for(k=0; k<repsize; k++) {
4557 c = prep[k];
4558 if (0x80 <= c) {
4559 raise_encode_exception(&exc, "utf-8",
4560 PyUnicode_AS_UNICODE(unicode),
4561 size, i-1, i,
4562 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004563 goto error;
4564 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004565 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004566 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004567 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004568 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004569 } else if (ch < 0x10000) {
4570 *p++ = (char)(0xe0 | (ch >> 12));
4571 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4572 *p++ = (char)(0x80 | (ch & 0x3f));
4573 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004574 /* Encode UCS4 Unicode ordinals */
4575 *p++ = (char)(0xf0 | (ch >> 18));
4576 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4577 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4578 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004579#if SIZEOF_WCHAR_T == 2
4580 wchar_offset++;
4581#endif
Tim Peters602f7402002-04-27 18:03:26 +00004582 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004583 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004584
Guido van Rossum98297ee2007-11-06 21:34:58 +00004585 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004586 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004587 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004588 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004589 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004590 }
4591 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004592 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004593 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004594 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004595 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004596 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004597
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004598 Py_XDECREF(errorHandler);
4599 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004600 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004601 error:
4602 Py_XDECREF(errorHandler);
4603 Py_XDECREF(exc);
4604 Py_XDECREF(result);
4605 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004606
Tim Peters602f7402002-04-27 18:03:26 +00004607#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004608}
4609
Alexander Belopolsky40018472011-02-26 01:02:56 +00004610PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004611PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4612 Py_ssize_t size,
4613 const char *errors)
4614{
4615 PyObject *v, *unicode;
4616
4617 unicode = PyUnicode_FromUnicode(s, size);
4618 if (unicode == NULL)
4619 return NULL;
4620 v = _PyUnicode_AsUTF8String(unicode, errors);
4621 Py_DECREF(unicode);
4622 return v;
4623}
4624
4625PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004626PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004627{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004628 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004629}
4630
Walter Dörwald41980ca2007-08-16 21:55:45 +00004631/* --- UTF-32 Codec ------------------------------------------------------- */
4632
4633PyObject *
4634PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004635 Py_ssize_t size,
4636 const char *errors,
4637 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004638{
4639 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4640}
4641
4642PyObject *
4643PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004644 Py_ssize_t size,
4645 const char *errors,
4646 int *byteorder,
4647 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004648{
4649 const char *starts = s;
4650 Py_ssize_t startinpos;
4651 Py_ssize_t endinpos;
4652 Py_ssize_t outpos;
4653 PyUnicodeObject *unicode;
4654 Py_UNICODE *p;
4655#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004656 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004657 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004658#else
4659 const int pairs = 0;
4660#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004661 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004662 int bo = 0; /* assume native ordering by default */
4663 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004664 /* Offsets from q for retrieving bytes in the right order. */
4665#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4666 int iorder[] = {0, 1, 2, 3};
4667#else
4668 int iorder[] = {3, 2, 1, 0};
4669#endif
4670 PyObject *errorHandler = NULL;
4671 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004672
Walter Dörwald41980ca2007-08-16 21:55:45 +00004673 q = (unsigned char *)s;
4674 e = q + size;
4675
4676 if (byteorder)
4677 bo = *byteorder;
4678
4679 /* Check for BOM marks (U+FEFF) in the input and adjust current
4680 byte order setting accordingly. In native mode, the leading BOM
4681 mark is skipped, in all other modes, it is copied to the output
4682 stream as-is (giving a ZWNBSP character). */
4683 if (bo == 0) {
4684 if (size >= 4) {
4685 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004686 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004687#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004688 if (bom == 0x0000FEFF) {
4689 q += 4;
4690 bo = -1;
4691 }
4692 else if (bom == 0xFFFE0000) {
4693 q += 4;
4694 bo = 1;
4695 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004696#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004697 if (bom == 0x0000FEFF) {
4698 q += 4;
4699 bo = 1;
4700 }
4701 else if (bom == 0xFFFE0000) {
4702 q += 4;
4703 bo = -1;
4704 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004705#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004706 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004707 }
4708
4709 if (bo == -1) {
4710 /* force LE */
4711 iorder[0] = 0;
4712 iorder[1] = 1;
4713 iorder[2] = 2;
4714 iorder[3] = 3;
4715 }
4716 else if (bo == 1) {
4717 /* force BE */
4718 iorder[0] = 3;
4719 iorder[1] = 2;
4720 iorder[2] = 1;
4721 iorder[3] = 0;
4722 }
4723
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004724 /* On narrow builds we split characters outside the BMP into two
4725 codepoints => count how much extra space we need. */
4726#ifndef Py_UNICODE_WIDE
4727 for (qq = q; qq < e; qq += 4)
4728 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4729 pairs++;
4730#endif
4731
4732 /* This might be one to much, because of a BOM */
4733 unicode = _PyUnicode_New((size+3)/4+pairs);
4734 if (!unicode)
4735 return NULL;
4736 if (size == 0)
4737 return (PyObject *)unicode;
4738
4739 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004740 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004741
Walter Dörwald41980ca2007-08-16 21:55:45 +00004742 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004743 Py_UCS4 ch;
4744 /* remaining bytes at the end? (size should be divisible by 4) */
4745 if (e-q<4) {
4746 if (consumed)
4747 break;
4748 errmsg = "truncated data";
4749 startinpos = ((const char *)q)-starts;
4750 endinpos = ((const char *)e)-starts;
4751 goto utf32Error;
4752 /* The remaining input chars are ignored if the callback
4753 chooses to skip the input */
4754 }
4755 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4756 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004757
Benjamin Peterson29060642009-01-31 22:14:21 +00004758 if (ch >= 0x110000)
4759 {
4760 errmsg = "codepoint not in range(0x110000)";
4761 startinpos = ((const char *)q)-starts;
4762 endinpos = startinpos+4;
4763 goto utf32Error;
4764 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004765#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004766 if (ch >= 0x10000)
4767 {
4768 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4769 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4770 }
4771 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004772#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004773 *p++ = ch;
4774 q += 4;
4775 continue;
4776 utf32Error:
4777 outpos = p-PyUnicode_AS_UNICODE(unicode);
4778 if (unicode_decode_call_errorhandler(
4779 errors, &errorHandler,
4780 "utf32", errmsg,
4781 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4782 &unicode, &outpos, &p))
4783 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004784 }
4785
4786 if (byteorder)
4787 *byteorder = bo;
4788
4789 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004790 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004791
4792 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02004793 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004794 goto onError;
4795
4796 Py_XDECREF(errorHandler);
4797 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02004798#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004799 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004800 Py_DECREF(unicode);
4801 return NULL;
4802 }
Victor Stinner17efeed2011-10-04 20:05:46 +02004803#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00004804 return (PyObject *)unicode;
4805
Benjamin Peterson29060642009-01-31 22:14:21 +00004806 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004807 Py_DECREF(unicode);
4808 Py_XDECREF(errorHandler);
4809 Py_XDECREF(exc);
4810 return NULL;
4811}
4812
4813PyObject *
4814PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004815 Py_ssize_t size,
4816 const char *errors,
4817 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004818{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004819 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004820 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004821 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004822#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004823 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004824#else
4825 const int pairs = 0;
4826#endif
4827 /* Offsets from p for storing byte pairs in the right order. */
4828#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4829 int iorder[] = {0, 1, 2, 3};
4830#else
4831 int iorder[] = {3, 2, 1, 0};
4832#endif
4833
Benjamin Peterson29060642009-01-31 22:14:21 +00004834#define STORECHAR(CH) \
4835 do { \
4836 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4837 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4838 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4839 p[iorder[0]] = (CH) & 0xff; \
4840 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004841 } while(0)
4842
4843 /* In narrow builds we can output surrogate pairs as one codepoint,
4844 so we need less space. */
4845#ifndef Py_UNICODE_WIDE
4846 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004847 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4848 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4849 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004850#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004851 nsize = (size - pairs + (byteorder == 0));
4852 bytesize = nsize * 4;
4853 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004854 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004855 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004856 if (v == NULL)
4857 return NULL;
4858
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004859 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004860 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004861 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004862 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004863 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004864
4865 if (byteorder == -1) {
4866 /* force LE */
4867 iorder[0] = 0;
4868 iorder[1] = 1;
4869 iorder[2] = 2;
4870 iorder[3] = 3;
4871 }
4872 else if (byteorder == 1) {
4873 /* force BE */
4874 iorder[0] = 3;
4875 iorder[1] = 2;
4876 iorder[2] = 1;
4877 iorder[3] = 0;
4878 }
4879
4880 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004881 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004882#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004883 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4884 Py_UCS4 ch2 = *s;
4885 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4886 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4887 s++;
4888 size--;
4889 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004890 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004891#endif
4892 STORECHAR(ch);
4893 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004894
4895 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004896 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004897#undef STORECHAR
4898}
4899
Alexander Belopolsky40018472011-02-26 01:02:56 +00004900PyObject *
4901PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004902{
4903 if (!PyUnicode_Check(unicode)) {
4904 PyErr_BadArgument();
4905 return NULL;
4906 }
4907 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004908 PyUnicode_GET_SIZE(unicode),
4909 NULL,
4910 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004911}
4912
Guido van Rossumd57fd912000-03-10 22:53:23 +00004913/* --- UTF-16 Codec ------------------------------------------------------- */
4914
Tim Peters772747b2001-08-09 22:21:55 +00004915PyObject *
4916PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004917 Py_ssize_t size,
4918 const char *errors,
4919 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004920{
Walter Dörwald69652032004-09-07 20:24:22 +00004921 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4922}
4923
Antoine Pitrouab868312009-01-10 15:40:25 +00004924/* Two masks for fast checking of whether a C 'long' may contain
4925 UTF16-encoded surrogate characters. This is an efficient heuristic,
4926 assuming that non-surrogate characters with a code point >= 0x8000 are
4927 rare in most input.
4928 FAST_CHAR_MASK is used when the input is in native byte ordering,
4929 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00004930*/
Antoine Pitrouab868312009-01-10 15:40:25 +00004931#if (SIZEOF_LONG == 8)
4932# define FAST_CHAR_MASK 0x8000800080008000L
4933# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
4934#elif (SIZEOF_LONG == 4)
4935# define FAST_CHAR_MASK 0x80008000L
4936# define SWAPPED_FAST_CHAR_MASK 0x00800080L
4937#else
4938# error C 'long' size should be either 4 or 8!
4939#endif
4940
Walter Dörwald69652032004-09-07 20:24:22 +00004941PyObject *
4942PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004943 Py_ssize_t size,
4944 const char *errors,
4945 int *byteorder,
4946 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004947{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004948 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004949 Py_ssize_t startinpos;
4950 Py_ssize_t endinpos;
4951 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004952 PyUnicodeObject *unicode;
4953 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004954 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00004955 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00004956 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004957 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00004958 /* Offsets from q for retrieving byte pairs in the right order. */
4959#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4960 int ihi = 1, ilo = 0;
4961#else
4962 int ihi = 0, ilo = 1;
4963#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004964 PyObject *errorHandler = NULL;
4965 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004966
4967 /* Note: size will always be longer than the resulting Unicode
4968 character count */
4969 unicode = _PyUnicode_New(size);
4970 if (!unicode)
4971 return NULL;
4972 if (size == 0)
4973 return (PyObject *)unicode;
4974
4975 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004976 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00004977 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00004978 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004979
4980 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00004981 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004982
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004983 /* Check for BOM marks (U+FEFF) in the input and adjust current
4984 byte order setting accordingly. In native mode, the leading BOM
4985 mark is skipped, in all other modes, it is copied to the output
4986 stream as-is (giving a ZWNBSP character). */
4987 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00004988 if (size >= 2) {
4989 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004990#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004991 if (bom == 0xFEFF) {
4992 q += 2;
4993 bo = -1;
4994 }
4995 else if (bom == 0xFFFE) {
4996 q += 2;
4997 bo = 1;
4998 }
Tim Petersced69f82003-09-16 20:30:58 +00004999#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005000 if (bom == 0xFEFF) {
5001 q += 2;
5002 bo = 1;
5003 }
5004 else if (bom == 0xFFFE) {
5005 q += 2;
5006 bo = -1;
5007 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005008#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005009 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005010 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005011
Tim Peters772747b2001-08-09 22:21:55 +00005012 if (bo == -1) {
5013 /* force LE */
5014 ihi = 1;
5015 ilo = 0;
5016 }
5017 else if (bo == 1) {
5018 /* force BE */
5019 ihi = 0;
5020 ilo = 1;
5021 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005022#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5023 native_ordering = ilo < ihi;
5024#else
5025 native_ordering = ilo > ihi;
5026#endif
Tim Peters772747b2001-08-09 22:21:55 +00005027
Antoine Pitrouab868312009-01-10 15:40:25 +00005028 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005029 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005030 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005031 /* First check for possible aligned read of a C 'long'. Unaligned
5032 reads are more expensive, better to defer to another iteration. */
5033 if (!((size_t) q & LONG_PTR_MASK)) {
5034 /* Fast path for runs of non-surrogate chars. */
5035 register const unsigned char *_q = q;
5036 Py_UNICODE *_p = p;
5037 if (native_ordering) {
5038 /* Native ordering is simple: as long as the input cannot
5039 possibly contain a surrogate char, do an unrolled copy
5040 of several 16-bit code points to the target object.
5041 The non-surrogate check is done on several input bytes
5042 at a time (as many as a C 'long' can contain). */
5043 while (_q < aligned_end) {
5044 unsigned long data = * (unsigned long *) _q;
5045 if (data & FAST_CHAR_MASK)
5046 break;
5047 _p[0] = ((unsigned short *) _q)[0];
5048 _p[1] = ((unsigned short *) _q)[1];
5049#if (SIZEOF_LONG == 8)
5050 _p[2] = ((unsigned short *) _q)[2];
5051 _p[3] = ((unsigned short *) _q)[3];
5052#endif
5053 _q += SIZEOF_LONG;
5054 _p += SIZEOF_LONG / 2;
5055 }
5056 }
5057 else {
5058 /* Byteswapped ordering is similar, but we must decompose
5059 the copy bytewise, and take care of zero'ing out the
5060 upper bytes if the target object is in 32-bit units
5061 (that is, in UCS-4 builds). */
5062 while (_q < aligned_end) {
5063 unsigned long data = * (unsigned long *) _q;
5064 if (data & SWAPPED_FAST_CHAR_MASK)
5065 break;
5066 /* Zero upper bytes in UCS-4 builds */
5067#if (Py_UNICODE_SIZE > 2)
5068 _p[0] = 0;
5069 _p[1] = 0;
5070#if (SIZEOF_LONG == 8)
5071 _p[2] = 0;
5072 _p[3] = 0;
5073#endif
5074#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005075 /* Issue #4916; UCS-4 builds on big endian machines must
5076 fill the two last bytes of each 4-byte unit. */
5077#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
5078# define OFF 2
5079#else
5080# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00005081#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005082 ((unsigned char *) _p)[OFF + 1] = _q[0];
5083 ((unsigned char *) _p)[OFF + 0] = _q[1];
5084 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
5085 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
5086#if (SIZEOF_LONG == 8)
5087 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
5088 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
5089 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
5090 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
5091#endif
5092#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00005093 _q += SIZEOF_LONG;
5094 _p += SIZEOF_LONG / 2;
5095 }
5096 }
5097 p = _p;
5098 q = _q;
5099 if (q >= e)
5100 break;
5101 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005102 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005103
Benjamin Peterson14339b62009-01-31 16:36:08 +00005104 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005105
5106 if (ch < 0xD800 || ch > 0xDFFF) {
5107 *p++ = ch;
5108 continue;
5109 }
5110
5111 /* UTF-16 code pair: */
5112 if (q > e) {
5113 errmsg = "unexpected end of data";
5114 startinpos = (((const char *)q) - 2) - starts;
5115 endinpos = ((const char *)e) + 1 - starts;
5116 goto utf16Error;
5117 }
5118 if (0xD800 <= ch && ch <= 0xDBFF) {
5119 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
5120 q += 2;
5121 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00005122#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005123 *p++ = ch;
5124 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005125#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005126 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005127#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005128 continue;
5129 }
5130 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005131 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005132 startinpos = (((const char *)q)-4)-starts;
5133 endinpos = startinpos+2;
5134 goto utf16Error;
5135 }
5136
Benjamin Peterson14339b62009-01-31 16:36:08 +00005137 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005138 errmsg = "illegal encoding";
5139 startinpos = (((const char *)q)-2)-starts;
5140 endinpos = startinpos+2;
5141 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005142
Benjamin Peterson29060642009-01-31 22:14:21 +00005143 utf16Error:
5144 outpos = p - PyUnicode_AS_UNICODE(unicode);
5145 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005146 errors,
5147 &errorHandler,
5148 "utf16", errmsg,
5149 &starts,
5150 (const char **)&e,
5151 &startinpos,
5152 &endinpos,
5153 &exc,
5154 (const char **)&q,
5155 &unicode,
5156 &outpos,
5157 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00005158 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005159 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005160 /* remaining byte at the end? (size should be even) */
5161 if (e == q) {
5162 if (!consumed) {
5163 errmsg = "truncated data";
5164 startinpos = ((const char *)q) - starts;
5165 endinpos = ((const char *)e) + 1 - starts;
5166 outpos = p - PyUnicode_AS_UNICODE(unicode);
5167 if (unicode_decode_call_errorhandler(
5168 errors,
5169 &errorHandler,
5170 "utf16", errmsg,
5171 &starts,
5172 (const char **)&e,
5173 &startinpos,
5174 &endinpos,
5175 &exc,
5176 (const char **)&q,
5177 &unicode,
5178 &outpos,
5179 &p))
5180 goto onError;
5181 /* The remaining input chars are ignored if the callback
5182 chooses to skip the input */
5183 }
5184 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005185
5186 if (byteorder)
5187 *byteorder = bo;
5188
Walter Dörwald69652032004-09-07 20:24:22 +00005189 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005190 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005191
Guido van Rossumd57fd912000-03-10 22:53:23 +00005192 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005193 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005194 goto onError;
5195
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005196 Py_XDECREF(errorHandler);
5197 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005198#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005199 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005200 Py_DECREF(unicode);
5201 return NULL;
5202 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005203#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00005204 return (PyObject *)unicode;
5205
Benjamin Peterson29060642009-01-31 22:14:21 +00005206 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005207 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005208 Py_XDECREF(errorHandler);
5209 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005210 return NULL;
5211}
5212
Antoine Pitrouab868312009-01-10 15:40:25 +00005213#undef FAST_CHAR_MASK
5214#undef SWAPPED_FAST_CHAR_MASK
5215
Tim Peters772747b2001-08-09 22:21:55 +00005216PyObject *
5217PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005218 Py_ssize_t size,
5219 const char *errors,
5220 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005221{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005222 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005223 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005224 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005225#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005226 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005227#else
5228 const int pairs = 0;
5229#endif
Tim Peters772747b2001-08-09 22:21:55 +00005230 /* Offsets from p for storing byte pairs in the right order. */
5231#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5232 int ihi = 1, ilo = 0;
5233#else
5234 int ihi = 0, ilo = 1;
5235#endif
5236
Benjamin Peterson29060642009-01-31 22:14:21 +00005237#define STORECHAR(CH) \
5238 do { \
5239 p[ihi] = ((CH) >> 8) & 0xff; \
5240 p[ilo] = (CH) & 0xff; \
5241 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005242 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005243
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005244#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005245 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005246 if (s[i] >= 0x10000)
5247 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005248#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005249 /* 2 * (size + pairs + (byteorder == 0)) */
5250 if (size > PY_SSIZE_T_MAX ||
5251 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005252 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005253 nsize = size + pairs + (byteorder == 0);
5254 bytesize = nsize * 2;
5255 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005256 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005257 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005258 if (v == NULL)
5259 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005260
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005261 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005262 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005263 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005264 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005265 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005266
5267 if (byteorder == -1) {
5268 /* force LE */
5269 ihi = 1;
5270 ilo = 0;
5271 }
5272 else if (byteorder == 1) {
5273 /* force BE */
5274 ihi = 0;
5275 ilo = 1;
5276 }
5277
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005278 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005279 Py_UNICODE ch = *s++;
5280 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005281#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005282 if (ch >= 0x10000) {
5283 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5284 ch = 0xD800 | ((ch-0x10000) >> 10);
5285 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005286#endif
Tim Peters772747b2001-08-09 22:21:55 +00005287 STORECHAR(ch);
5288 if (ch2)
5289 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005290 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005291
5292 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005293 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005294#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005295}
5296
Alexander Belopolsky40018472011-02-26 01:02:56 +00005297PyObject *
5298PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005299{
5300 if (!PyUnicode_Check(unicode)) {
5301 PyErr_BadArgument();
5302 return NULL;
5303 }
5304 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005305 PyUnicode_GET_SIZE(unicode),
5306 NULL,
5307 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005308}
5309
5310/* --- Unicode Escape Codec ----------------------------------------------- */
5311
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005312/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5313 if all the escapes in the string make it still a valid ASCII string.
5314 Returns -1 if any escapes were found which cause the string to
5315 pop out of ASCII range. Otherwise returns the length of the
5316 required buffer to hold the string.
5317 */
5318Py_ssize_t
5319length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5320{
5321 const unsigned char *p = (const unsigned char *)s;
5322 const unsigned char *end = p + size;
5323 Py_ssize_t length = 0;
5324
5325 if (size < 0)
5326 return -1;
5327
5328 for (; p < end; ++p) {
5329 if (*p > 127) {
5330 /* Non-ASCII */
5331 return -1;
5332 }
5333 else if (*p != '\\') {
5334 /* Normal character */
5335 ++length;
5336 }
5337 else {
5338 /* Backslash-escape, check next char */
5339 ++p;
5340 /* Escape sequence reaches till end of string or
5341 non-ASCII follow-up. */
5342 if (p >= end || *p > 127)
5343 return -1;
5344 switch (*p) {
5345 case '\n':
5346 /* backslash + \n result in zero characters */
5347 break;
5348 case '\\': case '\'': case '\"':
5349 case 'b': case 'f': case 't':
5350 case 'n': case 'r': case 'v': case 'a':
5351 ++length;
5352 break;
5353 case '0': case '1': case '2': case '3':
5354 case '4': case '5': case '6': case '7':
5355 case 'x': case 'u': case 'U': case 'N':
5356 /* these do not guarantee ASCII characters */
5357 return -1;
5358 default:
5359 /* count the backslash + the other character */
5360 length += 2;
5361 }
5362 }
5363 }
5364 return length;
5365}
5366
/* Similar to PyUnicode_WRITE but either write into wstr field
   or treat string as ASCII: when kind is not PyUnicode_WCHAR_KIND the
   value is stored as a single unsigned char, otherwise as a Py_UNICODE.
   Each argument is evaluated exactly once. */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value)                    \
    do {                                                                \
        if ((kind) != PyUnicode_WCHAR_KIND)                             \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else                                                            \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);       \
    } while (0)

/* Store value as a Py_UNICODE at buf[index].  Asserts that a variable
   named `kind` in the calling scope equals PyUnicode_WCHAR_KIND.
   Wrapped in do { } while (0) so that it always behaves as a single
   statement, like WRITE_ASCII_OR_WSTR. */
#define WRITE_WSTR(buf, index, value)                                   \
    do {                                                                \
        assert(kind == PyUnicode_WCHAR_KIND);                           \
        ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);           \
    } while (0)
5380
5381
Fredrik Lundh06d12682001-01-24 07:59:11 +00005382static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005383
Alexander Belopolsky40018472011-02-26 01:02:56 +00005384PyObject *
5385PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005386 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005387 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005388{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005389 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005390 Py_ssize_t startinpos;
5391 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005392 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005393 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005394 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005395 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005396 char* message;
5397 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005398 PyObject *errorHandler = NULL;
5399 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005400 Py_ssize_t ascii_length;
5401 Py_ssize_t i;
5402 int kind;
5403 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005404
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005405 ascii_length = length_of_escaped_ascii_string(s, size);
5406
5407 /* After length_of_escaped_ascii_string() there are two alternatives,
5408 either the string is pure ASCII with named escapes like \n, etc.
5409 and we determined it's exact size (common case)
5410 or it contains \x, \u, ... escape sequences. then we create a
5411 legacy wchar string and resize it at the end of this function. */
5412 if (ascii_length >= 0) {
5413 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5414 if (!v)
5415 goto onError;
5416 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5417 kind = PyUnicode_1BYTE_KIND;
5418 data = PyUnicode_DATA(v);
5419 }
5420 else {
5421 /* Escaped strings will always be longer than the resulting
5422 Unicode string, so we start with size here and then reduce the
5423 length after conversion to the true value.
5424 (but if the error callback returns a long replacement string
5425 we'll have to allocate more space) */
5426 v = _PyUnicode_New(size);
5427 if (!v)
5428 goto onError;
5429 kind = PyUnicode_WCHAR_KIND;
5430 data = PyUnicode_AS_UNICODE(v);
5431 }
5432
Guido van Rossumd57fd912000-03-10 22:53:23 +00005433 if (size == 0)
5434 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005435 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005436 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005437
Guido van Rossumd57fd912000-03-10 22:53:23 +00005438 while (s < end) {
5439 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005440 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005441 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005442
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005443 if (kind == PyUnicode_WCHAR_KIND) {
5444 assert(i < _PyUnicode_WSTR_LENGTH(v));
5445 }
5446 else {
5447 /* The only case in which i == ascii_length is a backslash
5448 followed by a newline. */
5449 assert(i <= ascii_length);
5450 }
5451
Guido van Rossumd57fd912000-03-10 22:53:23 +00005452 /* Non-escape characters are interpreted as Unicode ordinals */
5453 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005454 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005455 continue;
5456 }
5457
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005458 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005459 /* \ - Escapes */
5460 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005461 c = *s++;
5462 if (s > end)
5463 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005464
5465 if (kind == PyUnicode_WCHAR_KIND) {
5466 assert(i < _PyUnicode_WSTR_LENGTH(v));
5467 }
5468 else {
5469 /* The only case in which i == ascii_length is a backslash
5470 followed by a newline. */
5471 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5472 }
5473
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005474 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005475
Benjamin Peterson29060642009-01-31 22:14:21 +00005476 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005477 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005478 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5479 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5480 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5481 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5482 /* FF */
5483 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5484 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5485 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5486 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5487 /* VT */
5488 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5489 /* BEL, not classic C */
5490 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005491
Benjamin Peterson29060642009-01-31 22:14:21 +00005492 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005493 case '0': case '1': case '2': case '3':
5494 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005495 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005496 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005497 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005498 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005499 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005500 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005501 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005502 break;
5503
Benjamin Peterson29060642009-01-31 22:14:21 +00005504 /* hex escapes */
5505 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005506 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005507 digits = 2;
5508 message = "truncated \\xXX escape";
5509 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005510
Benjamin Peterson29060642009-01-31 22:14:21 +00005511 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005512 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005513 digits = 4;
5514 message = "truncated \\uXXXX escape";
5515 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005516
Benjamin Peterson29060642009-01-31 22:14:21 +00005517 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005518 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005519 digits = 8;
5520 message = "truncated \\UXXXXXXXX escape";
5521 hexescape:
5522 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005523 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005524 if (s+digits>end) {
5525 endinpos = size;
5526 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005527 errors, &errorHandler,
5528 "unicodeescape", "end of string in escape sequence",
5529 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005530 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005531 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005532 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005533 goto nextByte;
5534 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005535 for (j = 0; j < digits; ++j) {
5536 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005537 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005538 endinpos = (s+j+1)-starts;
5539 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005540 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005541 errors, &errorHandler,
5542 "unicodeescape", message,
5543 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005544 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005545 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005546 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005547 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005548 }
5549 chr = (chr<<4) & ~0xF;
5550 if (c >= '0' && c <= '9')
5551 chr += c - '0';
5552 else if (c >= 'a' && c <= 'f')
5553 chr += 10 + c - 'a';
5554 else
5555 chr += 10 + c - 'A';
5556 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005557 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005558 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005559 /* _decoding_error will have already written into the
5560 target buffer. */
5561 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005562 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005563 /* when we get here, chr is a 32-bit unicode character */
5564 if (chr <= 0xffff)
5565 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005566 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005567 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005568 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005569 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005570#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005571 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005572#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005573 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005574 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5575 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005576#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005577 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005578 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005579 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005580 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005581 errors, &errorHandler,
5582 "unicodeescape", "illegal Unicode character",
5583 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005584 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005585 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005586 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005587 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005588 break;
5589
Benjamin Peterson29060642009-01-31 22:14:21 +00005590 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005591 case 'N':
5592 message = "malformed \\N character escape";
5593 if (ucnhash_CAPI == NULL) {
5594 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005595 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5596 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005597 if (ucnhash_CAPI == NULL)
5598 goto ucnhashError;
5599 }
5600 if (*s == '{') {
5601 const char *start = s+1;
5602 /* look for the closing brace */
5603 while (*s != '}' && s < end)
5604 s++;
5605 if (s > start && s < end && *s == '}') {
5606 /* found a name. look it up in the unicode database */
5607 message = "unknown Unicode character name";
5608 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005609 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5610 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005611 goto store;
5612 }
5613 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005614 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005615 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005616 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005617 errors, &errorHandler,
5618 "unicodeescape", message,
5619 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005620 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005621 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005622 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005623 break;
5624
5625 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005626 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005627 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005628 message = "\\ at end of string";
5629 s--;
5630 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005631 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005632 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005633 errors, &errorHandler,
5634 "unicodeescape", message,
5635 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005636 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005637 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005638 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005639 }
5640 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005641 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5642 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005643 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005644 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005645 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005646 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005647 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005648 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005649 /* Ensure the length prediction worked in case of ASCII strings */
5650 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5651
Victor Stinnerfe226c02011-10-03 03:52:20 +02005652 if (kind == PyUnicode_WCHAR_KIND)
5653 {
5654 if (PyUnicode_Resize((PyObject**)&v, i) < 0)
5655 goto onError;
Victor Stinnerfe226c02011-10-03 03:52:20 +02005656 }
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005657 Py_XDECREF(errorHandler);
5658 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005659#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005660 if (_PyUnicode_READY_REPLACE(&v)) {
5661 Py_DECREF(v);
5662 return NULL;
5663 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005664#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00005665 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005666
Benjamin Peterson29060642009-01-31 22:14:21 +00005667 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005668 PyErr_SetString(
5669 PyExc_UnicodeError,
5670 "\\N escapes not supported (can't load unicodedata module)"
5671 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005672 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005673 Py_XDECREF(errorHandler);
5674 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005675 return NULL;
5676
Benjamin Peterson29060642009-01-31 22:14:21 +00005677 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005678 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005679 Py_XDECREF(errorHandler);
5680 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005681 return NULL;
5682}
5683
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005684#undef WRITE_ASCII_OR_WSTR
5685#undef WRITE_WSTR
5686
Guido van Rossumd57fd912000-03-10 22:53:23 +00005687/* Return a Unicode-Escape string version of the Unicode object.
5688
5689 If quotes is true, the string is enclosed in u"" or u'' quotes as
5690 appropriate.
5691
5692*/
5693
Walter Dörwald79e913e2007-05-12 11:08:06 +00005694static const char *hexdigits = "0123456789abcdef";
5695
Alexander Belopolsky40018472011-02-26 01:02:56 +00005696PyObject *
5697PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005698 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005700 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005701 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005702
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005703#ifdef Py_UNICODE_WIDE
5704 const Py_ssize_t expandsize = 10;
5705#else
5706 const Py_ssize_t expandsize = 6;
5707#endif
5708
Thomas Wouters89f507f2006-12-13 04:49:30 +00005709 /* XXX(nnorwitz): rather than over-allocating, it would be
5710 better to choose a different scheme. Perhaps scan the
5711 first N-chars of the string and allocate based on that size.
5712 */
5713 /* Initial allocation is based on the longest-possible unichr
5714 escape.
5715
5716 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5717 unichr, so in this case it's the longest unichr escape. In
5718 narrow (UTF-16) builds this is five chars per source unichr
5719 since there are two unichrs in the surrogate pair, so in narrow
5720 (UTF-16) builds it's not the longest unichr escape.
5721
5722 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5723 so in the narrow (UTF-16) build case it's the longest unichr
5724 escape.
5725 */
5726
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005727 if (size == 0)
5728 return PyBytes_FromStringAndSize(NULL, 0);
5729
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005730 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005731 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005732
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005733 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005734 2
5735 + expandsize*size
5736 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005737 if (repr == NULL)
5738 return NULL;
5739
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005740 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005741
Guido van Rossumd57fd912000-03-10 22:53:23 +00005742 while (size-- > 0) {
5743 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005744
Walter Dörwald79e913e2007-05-12 11:08:06 +00005745 /* Escape backslashes */
5746 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005747 *p++ = '\\';
5748 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005749 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005750 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005751
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005752#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005753 /* Map 21-bit characters to '\U00xxxxxx' */
5754 else if (ch >= 0x10000) {
5755 *p++ = '\\';
5756 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005757 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5758 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5759 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5760 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5761 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5762 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5763 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5764 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005765 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005766 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005767#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005768 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5769 else if (ch >= 0xD800 && ch < 0xDC00) {
5770 Py_UNICODE ch2;
5771 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005772
Benjamin Peterson29060642009-01-31 22:14:21 +00005773 ch2 = *s++;
5774 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005775 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005776 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5777 *p++ = '\\';
5778 *p++ = 'U';
5779 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5780 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5781 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5782 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5783 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5784 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5785 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5786 *p++ = hexdigits[ucs & 0x0000000F];
5787 continue;
5788 }
5789 /* Fall through: isolated surrogates are copied as-is */
5790 s--;
5791 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005792 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005793#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005794
Guido van Rossumd57fd912000-03-10 22:53:23 +00005795 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005796 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005797 *p++ = '\\';
5798 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005799 *p++ = hexdigits[(ch >> 12) & 0x000F];
5800 *p++ = hexdigits[(ch >> 8) & 0x000F];
5801 *p++ = hexdigits[(ch >> 4) & 0x000F];
5802 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005803 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005804
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005805 /* Map special whitespace to '\t', \n', '\r' */
5806 else if (ch == '\t') {
5807 *p++ = '\\';
5808 *p++ = 't';
5809 }
5810 else if (ch == '\n') {
5811 *p++ = '\\';
5812 *p++ = 'n';
5813 }
5814 else if (ch == '\r') {
5815 *p++ = '\\';
5816 *p++ = 'r';
5817 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005818
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005819 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005820 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005821 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005822 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005823 *p++ = hexdigits[(ch >> 4) & 0x000F];
5824 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005825 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005826
Guido van Rossumd57fd912000-03-10 22:53:23 +00005827 /* Copy everything else as-is */
5828 else
5829 *p++ = (char) ch;
5830 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005831
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005832 assert(p - PyBytes_AS_STRING(repr) > 0);
5833 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5834 return NULL;
5835 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005836}
5837
Alexander Belopolsky40018472011-02-26 01:02:56 +00005838PyObject *
5839PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005840{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005841 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005842 if (!PyUnicode_Check(unicode)) {
5843 PyErr_BadArgument();
5844 return NULL;
5845 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005846 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5847 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005848 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005849}
5850
5851/* --- Raw Unicode Escape Codec ------------------------------------------- */
5852
Alexander Belopolsky40018472011-02-26 01:02:56 +00005853PyObject *
5854PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005855 Py_ssize_t size,
5856 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005857{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005858 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005859 Py_ssize_t startinpos;
5860 Py_ssize_t endinpos;
5861 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005862 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005863 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005864 const char *end;
5865 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005866 PyObject *errorHandler = NULL;
5867 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005868
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869 /* Escaped strings will always be longer than the resulting
5870 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005871 length after conversion to the true value. (But decoding error
5872 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005873 v = _PyUnicode_New(size);
5874 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005875 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005876 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005877 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005878 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005879 end = s + size;
5880 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005881 unsigned char c;
5882 Py_UCS4 x;
5883 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005884 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005885
Benjamin Peterson29060642009-01-31 22:14:21 +00005886 /* Non-escape characters are interpreted as Unicode ordinals */
5887 if (*s != '\\') {
5888 *p++ = (unsigned char)*s++;
5889 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005890 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005891 startinpos = s-starts;
5892
5893 /* \u-escapes are only interpreted iff the number of leading
5894 backslashes if odd */
5895 bs = s;
5896 for (;s < end;) {
5897 if (*s != '\\')
5898 break;
5899 *p++ = (unsigned char)*s++;
5900 }
5901 if (((s - bs) & 1) == 0 ||
5902 s >= end ||
5903 (*s != 'u' && *s != 'U')) {
5904 continue;
5905 }
5906 p--;
5907 count = *s=='u' ? 4 : 8;
5908 s++;
5909
5910 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5911 outpos = p-PyUnicode_AS_UNICODE(v);
5912 for (x = 0, i = 0; i < count; ++i, ++s) {
5913 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005914 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005915 endinpos = s-starts;
5916 if (unicode_decode_call_errorhandler(
5917 errors, &errorHandler,
5918 "rawunicodeescape", "truncated \\uXXXX",
5919 &starts, &end, &startinpos, &endinpos, &exc, &s,
5920 &v, &outpos, &p))
5921 goto onError;
5922 goto nextByte;
5923 }
5924 x = (x<<4) & ~0xF;
5925 if (c >= '0' && c <= '9')
5926 x += c - '0';
5927 else if (c >= 'a' && c <= 'f')
5928 x += 10 + c - 'a';
5929 else
5930 x += 10 + c - 'A';
5931 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00005932 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00005933 /* UCS-2 character */
5934 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005935 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005936 /* UCS-4 character. Either store directly, or as
5937 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00005938#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005939 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005940#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005941 x -= 0x10000L;
5942 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5943 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00005944#endif
5945 } else {
5946 endinpos = s-starts;
5947 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005948 if (unicode_decode_call_errorhandler(
5949 errors, &errorHandler,
5950 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005951 &starts, &end, &startinpos, &endinpos, &exc, &s,
5952 &v, &outpos, &p))
5953 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005954 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005955 nextByte:
5956 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005957 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02005958 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005959 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005960 Py_XDECREF(errorHandler);
5961 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005962#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005963 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005964 Py_DECREF(v);
5965 return NULL;
5966 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005967#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00005968 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005969
Benjamin Peterson29060642009-01-31 22:14:21 +00005970 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005971 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005972 Py_XDECREF(errorHandler);
5973 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005974 return NULL;
5975}
5976
Alexander Belopolsky40018472011-02-26 01:02:56 +00005977PyObject *
5978PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005979 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005980{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005981 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005982 char *p;
5983 char *q;
5984
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005985#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005986 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005987#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005988 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005989#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00005990
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005991 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005992 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005993
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005994 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005995 if (repr == NULL)
5996 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005997 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005998 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006000 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006001 while (size-- > 0) {
6002 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006003#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006004 /* Map 32-bit characters to '\Uxxxxxxxx' */
6005 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006006 *p++ = '\\';
6007 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00006008 *p++ = hexdigits[(ch >> 28) & 0xf];
6009 *p++ = hexdigits[(ch >> 24) & 0xf];
6010 *p++ = hexdigits[(ch >> 20) & 0xf];
6011 *p++ = hexdigits[(ch >> 16) & 0xf];
6012 *p++ = hexdigits[(ch >> 12) & 0xf];
6013 *p++ = hexdigits[(ch >> 8) & 0xf];
6014 *p++ = hexdigits[(ch >> 4) & 0xf];
6015 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006016 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006017 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00006018#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006019 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
6020 if (ch >= 0xD800 && ch < 0xDC00) {
6021 Py_UNICODE ch2;
6022 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006023
Benjamin Peterson29060642009-01-31 22:14:21 +00006024 ch2 = *s++;
6025 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00006026 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006027 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
6028 *p++ = '\\';
6029 *p++ = 'U';
6030 *p++ = hexdigits[(ucs >> 28) & 0xf];
6031 *p++ = hexdigits[(ucs >> 24) & 0xf];
6032 *p++ = hexdigits[(ucs >> 20) & 0xf];
6033 *p++ = hexdigits[(ucs >> 16) & 0xf];
6034 *p++ = hexdigits[(ucs >> 12) & 0xf];
6035 *p++ = hexdigits[(ucs >> 8) & 0xf];
6036 *p++ = hexdigits[(ucs >> 4) & 0xf];
6037 *p++ = hexdigits[ucs & 0xf];
6038 continue;
6039 }
6040 /* Fall through: isolated surrogates are copied as-is */
6041 s--;
6042 size++;
6043 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006044#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006045 /* Map 16-bit characters to '\uxxxx' */
6046 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006047 *p++ = '\\';
6048 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00006049 *p++ = hexdigits[(ch >> 12) & 0xf];
6050 *p++ = hexdigits[(ch >> 8) & 0xf];
6051 *p++ = hexdigits[(ch >> 4) & 0xf];
6052 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006053 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006054 /* Copy everything else as-is */
6055 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006056 *p++ = (char) ch;
6057 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006058 size = p - q;
6059
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006060 assert(size > 0);
6061 if (_PyBytes_Resize(&repr, size) < 0)
6062 return NULL;
6063 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006064}
6065
Alexander Belopolsky40018472011-02-26 01:02:56 +00006066PyObject *
6067PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006068{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006069 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006070 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00006071 PyErr_BadArgument();
6072 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006073 }
Walter Dörwald711005d2007-05-12 12:03:26 +00006074 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6075 PyUnicode_GET_SIZE(unicode));
6076
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006077 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006078}
6079
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006080/* --- Unicode Internal Codec ------------------------------------------- */
6081
Alexander Belopolsky40018472011-02-26 01:02:56 +00006082PyObject *
6083_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006084 Py_ssize_t size,
6085 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006086{
6087 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006088 Py_ssize_t startinpos;
6089 Py_ssize_t endinpos;
6090 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006091 PyUnicodeObject *v;
6092 Py_UNICODE *p;
6093 const char *end;
6094 const char *reason;
6095 PyObject *errorHandler = NULL;
6096 PyObject *exc = NULL;
6097
Neal Norwitzd43069c2006-01-08 01:12:10 +00006098#ifdef Py_UNICODE_WIDE
6099 Py_UNICODE unimax = PyUnicode_GetMax();
6100#endif
6101
Thomas Wouters89f507f2006-12-13 04:49:30 +00006102 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006103 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
6104 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006105 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006106 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
6107 as string was created with the old API. */
6108 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006109 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006110 p = PyUnicode_AS_UNICODE(v);
6111 end = s + size;
6112
6113 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006114 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006115 /* We have to sanity check the raw data, otherwise doom looms for
6116 some malformed UCS-4 data. */
6117 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006118#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006119 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006120#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006121 end-s < Py_UNICODE_SIZE
6122 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006123 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006124 startinpos = s - starts;
6125 if (end-s < Py_UNICODE_SIZE) {
6126 endinpos = end-starts;
6127 reason = "truncated input";
6128 }
6129 else {
6130 endinpos = s - starts + Py_UNICODE_SIZE;
6131 reason = "illegal code point (> 0x10FFFF)";
6132 }
6133 outpos = p - PyUnicode_AS_UNICODE(v);
6134 if (unicode_decode_call_errorhandler(
6135 errors, &errorHandler,
6136 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006137 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00006138 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006139 goto onError;
6140 }
6141 }
6142 else {
6143 p++;
6144 s += Py_UNICODE_SIZE;
6145 }
6146 }
6147
Victor Stinnerfe226c02011-10-03 03:52:20 +02006148 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006149 goto onError;
6150 Py_XDECREF(errorHandler);
6151 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006152#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006153 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006154 Py_DECREF(v);
6155 return NULL;
6156 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006157#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006158 return (PyObject *)v;
6159
Benjamin Peterson29060642009-01-31 22:14:21 +00006160 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006161 Py_XDECREF(v);
6162 Py_XDECREF(errorHandler);
6163 Py_XDECREF(exc);
6164 return NULL;
6165}
6166
Guido van Rossumd57fd912000-03-10 22:53:23 +00006167/* --- Latin-1 Codec ------------------------------------------------------ */
6168
Alexander Belopolsky40018472011-02-26 01:02:56 +00006169PyObject *
6170PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006171 Py_ssize_t size,
6172 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006173{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006174 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006175 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006176}
6177
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006178/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006179static void
6180make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006181 const char *encoding,
6182 const Py_UNICODE *unicode, Py_ssize_t size,
6183 Py_ssize_t startpos, Py_ssize_t endpos,
6184 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006185{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006186 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006187 *exceptionObject = PyUnicodeEncodeError_Create(
6188 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006189 }
6190 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006191 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6192 goto onError;
6193 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6194 goto onError;
6195 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6196 goto onError;
6197 return;
6198 onError:
6199 Py_DECREF(*exceptionObject);
6200 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006201 }
6202}
6203
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006204/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006205static void
6206raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006207 const char *encoding,
6208 const Py_UNICODE *unicode, Py_ssize_t size,
6209 Py_ssize_t startpos, Py_ssize_t endpos,
6210 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006211{
6212 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006213 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006214 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006215 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006216}
6217
6218/* error handling callback helper:
6219 build arguments, call the callback and check the arguments,
6220 put the result into newpos and return the replacement string, which
6221 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006222static PyObject *
6223unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006224 PyObject **errorHandler,
6225 const char *encoding, const char *reason,
6226 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
6227 Py_ssize_t startpos, Py_ssize_t endpos,
6228 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006229{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006230 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006231
6232 PyObject *restuple;
6233 PyObject *resunicode;
6234
6235 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006236 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006237 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006238 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006239 }
6240
6241 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006242 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006243 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006244 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006245
6246 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006247 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006248 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006249 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006250 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006251 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006252 Py_DECREF(restuple);
6253 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006254 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006255 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006256 &resunicode, newpos)) {
6257 Py_DECREF(restuple);
6258 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006259 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006260 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6261 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6262 Py_DECREF(restuple);
6263 return NULL;
6264 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006265 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006266 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006267 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006268 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6269 Py_DECREF(restuple);
6270 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006271 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006272 Py_INCREF(resunicode);
6273 Py_DECREF(restuple);
6274 return resunicode;
6275}
6276
Alexander Belopolsky40018472011-02-26 01:02:56 +00006277static PyObject *
6278unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006279 Py_ssize_t size,
6280 const char *errors,
6281 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006282{
6283 /* output object */
6284 PyObject *res;
6285 /* pointers to the beginning and end+1 of input */
6286 const Py_UNICODE *startp = p;
6287 const Py_UNICODE *endp = p + size;
6288 /* pointer to the beginning of the unencodable characters */
6289 /* const Py_UNICODE *badp = NULL; */
6290 /* pointer into the output */
6291 char *str;
6292 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006293 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006294 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6295 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006296 PyObject *errorHandler = NULL;
6297 PyObject *exc = NULL;
6298 /* the following variable is used for caching string comparisons
6299 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6300 int known_errorHandler = -1;
6301
6302 /* allocate enough for a simple encoding without
6303 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006304 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006305 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006306 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006307 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006308 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006309 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006310 ressize = size;
6311
6312 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006313 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006314
Benjamin Peterson29060642009-01-31 22:14:21 +00006315 /* can we encode this? */
6316 if (c<limit) {
6317 /* no overflow check, because we know that the space is enough */
6318 *str++ = (char)c;
6319 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006320 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006321 else {
6322 Py_ssize_t unicodepos = p-startp;
6323 Py_ssize_t requiredsize;
6324 PyObject *repunicode;
6325 Py_ssize_t repsize;
6326 Py_ssize_t newpos;
6327 Py_ssize_t respos;
6328 Py_UNICODE *uni2;
6329 /* startpos for collecting unencodable chars */
6330 const Py_UNICODE *collstart = p;
6331 const Py_UNICODE *collend = p;
6332 /* find all unecodable characters */
6333 while ((collend < endp) && ((*collend)>=limit))
6334 ++collend;
6335 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6336 if (known_errorHandler==-1) {
6337 if ((errors==NULL) || (!strcmp(errors, "strict")))
6338 known_errorHandler = 1;
6339 else if (!strcmp(errors, "replace"))
6340 known_errorHandler = 2;
6341 else if (!strcmp(errors, "ignore"))
6342 known_errorHandler = 3;
6343 else if (!strcmp(errors, "xmlcharrefreplace"))
6344 known_errorHandler = 4;
6345 else
6346 known_errorHandler = 0;
6347 }
6348 switch (known_errorHandler) {
6349 case 1: /* strict */
6350 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
6351 goto onError;
6352 case 2: /* replace */
6353 while (collstart++<collend)
6354 *str++ = '?'; /* fall through */
6355 case 3: /* ignore */
6356 p = collend;
6357 break;
6358 case 4: /* xmlcharrefreplace */
6359 respos = str - PyBytes_AS_STRING(res);
6360 /* determine replacement size (temporarily (mis)uses p) */
6361 for (p = collstart, repsize = 0; p < collend; ++p) {
6362 if (*p<10)
6363 repsize += 2+1+1;
6364 else if (*p<100)
6365 repsize += 2+2+1;
6366 else if (*p<1000)
6367 repsize += 2+3+1;
6368 else if (*p<10000)
6369 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006370#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006371 else
6372 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006373#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006374 else if (*p<100000)
6375 repsize += 2+5+1;
6376 else if (*p<1000000)
6377 repsize += 2+6+1;
6378 else
6379 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006380#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006381 }
6382 requiredsize = respos+repsize+(endp-collend);
6383 if (requiredsize > ressize) {
6384 if (requiredsize<2*ressize)
6385 requiredsize = 2*ressize;
6386 if (_PyBytes_Resize(&res, requiredsize))
6387 goto onError;
6388 str = PyBytes_AS_STRING(res) + respos;
6389 ressize = requiredsize;
6390 }
6391 /* generate replacement (temporarily (mis)uses p) */
6392 for (p = collstart; p < collend; ++p) {
6393 str += sprintf(str, "&#%d;", (int)*p);
6394 }
6395 p = collend;
6396 break;
6397 default:
6398 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6399 encoding, reason, startp, size, &exc,
6400 collstart-startp, collend-startp, &newpos);
6401 if (repunicode == NULL)
6402 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006403 if (PyBytes_Check(repunicode)) {
6404 /* Directly copy bytes result to output. */
6405 repsize = PyBytes_Size(repunicode);
6406 if (repsize > 1) {
6407 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006408 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006409 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6410 Py_DECREF(repunicode);
6411 goto onError;
6412 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006413 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006414 ressize += repsize-1;
6415 }
6416 memcpy(str, PyBytes_AsString(repunicode), repsize);
6417 str += repsize;
6418 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006419 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006420 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006421 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006422 /* need more space? (at least enough for what we
6423 have+the replacement+the rest of the string, so
6424 we won't have to check space for encodable characters) */
6425 respos = str - PyBytes_AS_STRING(res);
6426 repsize = PyUnicode_GET_SIZE(repunicode);
6427 requiredsize = respos+repsize+(endp-collend);
6428 if (requiredsize > ressize) {
6429 if (requiredsize<2*ressize)
6430 requiredsize = 2*ressize;
6431 if (_PyBytes_Resize(&res, requiredsize)) {
6432 Py_DECREF(repunicode);
6433 goto onError;
6434 }
6435 str = PyBytes_AS_STRING(res) + respos;
6436 ressize = requiredsize;
6437 }
6438 /* check if there is anything unencodable in the replacement
6439 and copy it to the output */
6440 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6441 c = *uni2;
6442 if (c >= limit) {
6443 raise_encode_exception(&exc, encoding, startp, size,
6444 unicodepos, unicodepos+1, reason);
6445 Py_DECREF(repunicode);
6446 goto onError;
6447 }
6448 *str = (char)c;
6449 }
6450 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006451 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006452 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006453 }
6454 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006455 /* Resize if we allocated to much */
6456 size = str - PyBytes_AS_STRING(res);
6457 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006458 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006459 if (_PyBytes_Resize(&res, size) < 0)
6460 goto onError;
6461 }
6462
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006463 Py_XDECREF(errorHandler);
6464 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006465 return res;
6466
6467 onError:
6468 Py_XDECREF(res);
6469 Py_XDECREF(errorHandler);
6470 Py_XDECREF(exc);
6471 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006472}
6473
Alexander Belopolsky40018472011-02-26 01:02:56 +00006474PyObject *
6475PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006476 Py_ssize_t size,
6477 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006478{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006479 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006480}
6481
Alexander Belopolsky40018472011-02-26 01:02:56 +00006482PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006483_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006484{
6485 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006486 PyErr_BadArgument();
6487 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006488 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006489 if (PyUnicode_READY(unicode) == -1)
6490 return NULL;
6491 /* Fast path: if it is a one-byte string, construct
6492 bytes object directly. */
6493 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6494 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6495 PyUnicode_GET_LENGTH(unicode));
6496 /* Non-Latin-1 characters present. Defer to above function to
6497 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006498 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006499 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006500 errors);
6501}
6502
6503PyObject*
6504PyUnicode_AsLatin1String(PyObject *unicode)
6505{
6506 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006507}
6508
6509/* --- 7-bit ASCII Codec -------------------------------------------------- */
6510
Alexander Belopolsky40018472011-02-26 01:02:56 +00006511PyObject *
6512PyUnicode_DecodeASCII(const char *s,
6513 Py_ssize_t size,
6514 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006515{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006516 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006517 PyUnicodeObject *v;
Victor Stinner702c7342011-10-05 13:50:52 +02006518 Py_UNICODE *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006519 Py_ssize_t startinpos;
6520 Py_ssize_t endinpos;
6521 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006522 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006523 int has_error;
6524 const unsigned char *p = (const unsigned char *)s;
6525 const unsigned char *end = p + size;
6526 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006527 PyObject *errorHandler = NULL;
6528 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006529
Guido van Rossumd57fd912000-03-10 22:53:23 +00006530 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006531 if (size == 1 && (unsigned char)s[0] < 128)
6532 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006533
Victor Stinner702c7342011-10-05 13:50:52 +02006534 has_error = 0;
6535 while (p < end && !has_error) {
6536 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6537 an explanation. */
6538 if (!((size_t) p & LONG_PTR_MASK)) {
6539 /* Help register allocation */
6540 register const unsigned char *_p = p;
6541 while (_p < aligned_end) {
6542 unsigned long value = *(unsigned long *) _p;
6543 if (value & ASCII_CHAR_MASK) {
6544 has_error = 1;
6545 break;
6546 }
6547 _p += SIZEOF_LONG;
6548 }
6549 if (_p == end)
6550 break;
6551 if (has_error)
6552 break;
6553 p = _p;
6554 }
6555 if (*p & 0x80) {
6556 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006557 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006558 }
6559 else {
6560 ++p;
6561 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006562 }
Victor Stinner702c7342011-10-05 13:50:52 +02006563 if (!has_error)
6564 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006565
Guido van Rossumd57fd912000-03-10 22:53:23 +00006566 v = _PyUnicode_New(size);
6567 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006568 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006569 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006570 return (PyObject *)v;
Victor Stinner702c7342011-10-05 13:50:52 +02006571 u = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006572 e = s + size;
6573 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006574 register unsigned char c = (unsigned char)*s;
6575 if (c < 128) {
Victor Stinner702c7342011-10-05 13:50:52 +02006576 *u++ = c;
Benjamin Peterson29060642009-01-31 22:14:21 +00006577 ++s;
6578 }
6579 else {
6580 startinpos = s-starts;
6581 endinpos = startinpos + 1;
Victor Stinner702c7342011-10-05 13:50:52 +02006582 outpos = u - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006583 if (unicode_decode_call_errorhandler(
6584 errors, &errorHandler,
6585 "ascii", "ordinal not in range(128)",
6586 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinner702c7342011-10-05 13:50:52 +02006587 &v, &outpos, &u))
Benjamin Peterson29060642009-01-31 22:14:21 +00006588 goto onError;
6589 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006590 }
Victor Stinner702c7342011-10-05 13:50:52 +02006591 if (u - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
6592 if (PyUnicode_Resize((PyObject**)&v, u - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006593 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006594 Py_XDECREF(errorHandler);
6595 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006596#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006597 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006598 Py_DECREF(v);
6599 return NULL;
6600 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006601#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006602 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006603
Benjamin Peterson29060642009-01-31 22:14:21 +00006604 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006605 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006606 Py_XDECREF(errorHandler);
6607 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608 return NULL;
6609}
6610
Alexander Belopolsky40018472011-02-26 01:02:56 +00006611PyObject *
6612PyUnicode_EncodeASCII(const Py_UNICODE *p,
6613 Py_ssize_t size,
6614 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006615{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006616 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006617}
6618
Alexander Belopolsky40018472011-02-26 01:02:56 +00006619PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006620_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006621{
6622 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006623 PyErr_BadArgument();
6624 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006625 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006626 if (PyUnicode_READY(unicode) == -1)
6627 return NULL;
6628 /* Fast path: if it is an ASCII-only string, construct bytes object
6629 directly. Else defer to above function to raise the exception. */
6630 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6631 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6632 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006633 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006634 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006635 errors);
6636}
6637
6638PyObject *
6639PyUnicode_AsASCIIString(PyObject *unicode)
6640{
6641 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006642}
6643
Victor Stinner99b95382011-07-04 14:23:54 +02006644#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006645
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006646/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006647
/* The MBCS helpers below take their sizes as int.  When size_t is wider
   than int, the public entry points have to feed them the input in several
   chunks; NEED_RETRY enables that chunking code. */
#if SIZEOF_SIZE_T > SIZEOF_INT
#define NEED_RETRY
#endif
6651
/* XXX This code is limited to "true" double-byte encodings, as
   a) it assumes an incomplete character consists of a single byte, and
   b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
      encodings, see IsDBCSLeadByteEx documentation. */

/* Return 1 if the byte at s[offset] is a DBCS lead byte that starts a
   character of its own, 0 otherwise.

   The byte must pass IsDBCSLeadByte().  It then counts as a lead byte when
   CharPrev() cannot step back (it is the first character), when the
   preceding character does not begin with a lead byte, or when the
   preceding character is two bytes wide. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *here = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*here))
        return 0;

    before = CharPrev(s, here);
    if (before == here)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return (here - before == 2);
}
6668
6669/*
6670 * Decode MBCS string into unicode object. If 'final' is set, converts
6671 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6672 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006673static int
6674decode_mbcs(PyUnicodeObject **v,
6675 const char *s, /* MBCS string */
6676 int size, /* sizeof MBCS string */
6677 int final,
6678 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006679{
6680 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006681 Py_ssize_t n;
6682 DWORD usize;
6683 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006684
6685 assert(size >= 0);
6686
Victor Stinner554f3f02010-06-16 23:33:54 +00006687 /* check and handle 'errors' arg */
6688 if (errors==NULL || strcmp(errors, "strict")==0)
6689 flags = MB_ERR_INVALID_CHARS;
6690 else if (strcmp(errors, "ignore")==0)
6691 flags = 0;
6692 else {
6693 PyErr_Format(PyExc_ValueError,
6694 "mbcs encoding does not support errors='%s'",
6695 errors);
6696 return -1;
6697 }
6698
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006699 /* Skip trailing lead-byte unless 'final' is set */
6700 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006701 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006702
6703 /* First get the size of the result */
6704 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006705 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6706 if (usize==0)
6707 goto mbcs_decode_error;
6708 } else
6709 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006710
6711 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006712 /* Create unicode object */
6713 *v = _PyUnicode_New(usize);
6714 if (*v == NULL)
6715 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006716 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006717 }
6718 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006719 /* Extend unicode object */
6720 n = PyUnicode_GET_SIZE(*v);
Victor Stinner2fd82272011-10-03 04:06:05 +02006721 if (PyUnicode_Resize((PyObject**)v, n + usize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006722 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006723 }
6724
6725 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006726 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006727 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006728 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6729 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006730 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006731 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006732 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006733
6734mbcs_decode_error:
6735 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6736 we raise a UnicodeDecodeError - else it is a 'generic'
6737 windows error
6738 */
6739 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6740 /* Ideally, we should get reason from FormatMessage - this
6741 is the Windows 2000 English version of the message
6742 */
6743 PyObject *exc = NULL;
6744 const char *reason = "No mapping for the Unicode character exists "
6745 "in the target multi-byte code page.";
6746 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6747 if (exc != NULL) {
6748 PyCodec_StrictErrors(exc);
6749 Py_DECREF(exc);
6750 }
6751 } else {
6752 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6753 }
6754 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006755}
6756
Alexander Belopolsky40018472011-02-26 01:02:56 +00006757PyObject *
6758PyUnicode_DecodeMBCSStateful(const char *s,
6759 Py_ssize_t size,
6760 const char *errors,
6761 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006762{
6763 PyUnicodeObject *v = NULL;
6764 int done;
6765
6766 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006767 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006768
6769#ifdef NEED_RETRY
6770 retry:
6771 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006772 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006773 else
6774#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006775 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006776
6777 if (done < 0) {
6778 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006779 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006780 }
6781
6782 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006783 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006784
6785#ifdef NEED_RETRY
6786 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006787 s += done;
6788 size -= done;
6789 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006790 }
6791#endif
Victor Stinner17efeed2011-10-04 20:05:46 +02006792#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006793 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006794 Py_DECREF(v);
6795 return NULL;
6796 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006797#endif
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006798 return (PyObject *)v;
6799}
6800
Alexander Belopolsky40018472011-02-26 01:02:56 +00006801PyObject *
6802PyUnicode_DecodeMBCS(const char *s,
6803 Py_ssize_t size,
6804 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006805{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006806 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6807}
6808
6809/*
6810 * Convert unicode into string object (MBCS).
6811 * Returns 0 if succeed, -1 otherwise.
6812 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006813static int
6814encode_mbcs(PyObject **repr,
6815 const Py_UNICODE *p, /* unicode */
6816 int size, /* size of unicode */
6817 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006818{
Victor Stinner554f3f02010-06-16 23:33:54 +00006819 BOOL usedDefaultChar = FALSE;
6820 BOOL *pusedDefaultChar;
6821 int mbcssize;
6822 Py_ssize_t n;
6823 PyObject *exc = NULL;
6824 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006825
6826 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006827
Victor Stinner554f3f02010-06-16 23:33:54 +00006828 /* check and handle 'errors' arg */
6829 if (errors==NULL || strcmp(errors, "strict")==0) {
6830 flags = WC_NO_BEST_FIT_CHARS;
6831 pusedDefaultChar = &usedDefaultChar;
6832 } else if (strcmp(errors, "replace")==0) {
6833 flags = 0;
6834 pusedDefaultChar = NULL;
6835 } else {
6836 PyErr_Format(PyExc_ValueError,
6837 "mbcs encoding does not support errors='%s'",
6838 errors);
6839 return -1;
6840 }
6841
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006842 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006843 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006844 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6845 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006846 if (mbcssize == 0) {
6847 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6848 return -1;
6849 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006850 /* If we used a default char, then we failed! */
6851 if (pusedDefaultChar && *pusedDefaultChar)
6852 goto mbcs_encode_error;
6853 } else {
6854 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006855 }
6856
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006857 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006858 /* Create string object */
6859 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6860 if (*repr == NULL)
6861 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006862 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006863 }
6864 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006865 /* Extend string object */
6866 n = PyBytes_Size(*repr);
6867 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6868 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006869 }
6870
6871 /* Do the conversion */
6872 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006873 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006874 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6875 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006876 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6877 return -1;
6878 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006879 if (pusedDefaultChar && *pusedDefaultChar)
6880 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006881 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006882 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006883
6884mbcs_encode_error:
6885 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6886 Py_XDECREF(exc);
6887 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006888}
6889
Alexander Belopolsky40018472011-02-26 01:02:56 +00006890PyObject *
6891PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6892 Py_ssize_t size,
6893 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006894{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006895 PyObject *repr = NULL;
6896 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006897
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006898#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00006899 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006900 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006901 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006902 else
6903#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006904 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006905
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006906 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006907 Py_XDECREF(repr);
6908 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006909 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006910
6911#ifdef NEED_RETRY
6912 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006913 p += INT_MAX;
6914 size -= INT_MAX;
6915 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006916 }
6917#endif
6918
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006919 return repr;
6920}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006921
Alexander Belopolsky40018472011-02-26 01:02:56 +00006922PyObject *
6923PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006924{
6925 if (!PyUnicode_Check(unicode)) {
6926 PyErr_BadArgument();
6927 return NULL;
6928 }
6929 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006930 PyUnicode_GET_SIZE(unicode),
6931 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006932}
6933
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006934#undef NEED_RETRY
6935
Victor Stinner99b95382011-07-04 14:23:54 +02006936#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006937
Guido van Rossumd57fd912000-03-10 22:53:23 +00006938/* --- Character Mapping Codec -------------------------------------------- */
6939
Alexander Belopolsky40018472011-02-26 01:02:56 +00006940PyObject *
6941PyUnicode_DecodeCharmap(const char *s,
6942 Py_ssize_t size,
6943 PyObject *mapping,
6944 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006945{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006946 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006947 Py_ssize_t startinpos;
6948 Py_ssize_t endinpos;
6949 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006950 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006951 PyUnicodeObject *v;
6952 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006953 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006954 PyObject *errorHandler = NULL;
6955 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006956 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006957 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006958
Guido van Rossumd57fd912000-03-10 22:53:23 +00006959 /* Default to Latin-1 */
6960 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006961 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006962
6963 v = _PyUnicode_New(size);
6964 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006965 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006966 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006967 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006968 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006969 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006970 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006971 mapstring = PyUnicode_AS_UNICODE(mapping);
6972 maplen = PyUnicode_GET_SIZE(mapping);
6973 while (s < e) {
6974 unsigned char ch = *s;
6975 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006976
Benjamin Peterson29060642009-01-31 22:14:21 +00006977 if (ch < maplen)
6978 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006979
Benjamin Peterson29060642009-01-31 22:14:21 +00006980 if (x == 0xfffe) {
6981 /* undefined mapping */
6982 outpos = p-PyUnicode_AS_UNICODE(v);
6983 startinpos = s-starts;
6984 endinpos = startinpos+1;
6985 if (unicode_decode_call_errorhandler(
6986 errors, &errorHandler,
6987 "charmap", "character maps to <undefined>",
6988 &starts, &e, &startinpos, &endinpos, &exc, &s,
6989 &v, &outpos, &p)) {
6990 goto onError;
6991 }
6992 continue;
6993 }
6994 *p++ = x;
6995 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006996 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006997 }
6998 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006999 while (s < e) {
7000 unsigned char ch = *s;
7001 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007002
Benjamin Peterson29060642009-01-31 22:14:21 +00007003 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7004 w = PyLong_FromLong((long)ch);
7005 if (w == NULL)
7006 goto onError;
7007 x = PyObject_GetItem(mapping, w);
7008 Py_DECREF(w);
7009 if (x == NULL) {
7010 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7011 /* No mapping found means: mapping is undefined. */
7012 PyErr_Clear();
7013 x = Py_None;
7014 Py_INCREF(x);
7015 } else
7016 goto onError;
7017 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007018
Benjamin Peterson29060642009-01-31 22:14:21 +00007019 /* Apply mapping */
7020 if (PyLong_Check(x)) {
7021 long value = PyLong_AS_LONG(x);
7022 if (value < 0 || value > 65535) {
7023 PyErr_SetString(PyExc_TypeError,
7024 "character mapping must be in range(65536)");
7025 Py_DECREF(x);
7026 goto onError;
7027 }
7028 *p++ = (Py_UNICODE)value;
7029 }
7030 else if (x == Py_None) {
7031 /* undefined mapping */
7032 outpos = p-PyUnicode_AS_UNICODE(v);
7033 startinpos = s-starts;
7034 endinpos = startinpos+1;
7035 if (unicode_decode_call_errorhandler(
7036 errors, &errorHandler,
7037 "charmap", "character maps to <undefined>",
7038 &starts, &e, &startinpos, &endinpos, &exc, &s,
7039 &v, &outpos, &p)) {
7040 Py_DECREF(x);
7041 goto onError;
7042 }
7043 Py_DECREF(x);
7044 continue;
7045 }
7046 else if (PyUnicode_Check(x)) {
7047 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007048
Benjamin Peterson29060642009-01-31 22:14:21 +00007049 if (targetsize == 1)
7050 /* 1-1 mapping */
7051 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007052
Benjamin Peterson29060642009-01-31 22:14:21 +00007053 else if (targetsize > 1) {
7054 /* 1-n mapping */
7055 if (targetsize > extrachars) {
7056 /* resize first */
7057 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
7058 Py_ssize_t needed = (targetsize - extrachars) + \
7059 (targetsize << 2);
7060 extrachars += needed;
7061 /* XXX overflow detection missing */
Victor Stinnerfe226c02011-10-03 03:52:20 +02007062 if (PyUnicode_Resize((PyObject**)&v,
Benjamin Peterson29060642009-01-31 22:14:21 +00007063 PyUnicode_GET_SIZE(v) + needed) < 0) {
7064 Py_DECREF(x);
7065 goto onError;
7066 }
7067 p = PyUnicode_AS_UNICODE(v) + oldpos;
7068 }
7069 Py_UNICODE_COPY(p,
7070 PyUnicode_AS_UNICODE(x),
7071 targetsize);
7072 p += targetsize;
7073 extrachars -= targetsize;
7074 }
7075 /* 1-0 mapping: skip the character */
7076 }
7077 else {
7078 /* wrong return value */
7079 PyErr_SetString(PyExc_TypeError,
7080 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007081 Py_DECREF(x);
7082 goto onError;
7083 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007084 Py_DECREF(x);
7085 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007086 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007087 }
7088 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02007089 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007090 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007091 Py_XDECREF(errorHandler);
7092 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02007093#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007094 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007095 Py_DECREF(v);
7096 return NULL;
7097 }
Victor Stinner17efeed2011-10-04 20:05:46 +02007098#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00007099 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00007100
Benjamin Peterson29060642009-01-31 22:14:21 +00007101 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007102 Py_XDECREF(errorHandler);
7103 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007104 Py_XDECREF(v);
7105 return NULL;
7106}
7107
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007108/* Charmap encoding: the lookup table */
7109
Alexander Belopolsky40018472011-02-26 01:02:56 +00007110struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007111 PyObject_HEAD
7112 unsigned char level1[32];
7113 int count2, count3;
7114 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007115};
7116
7117static PyObject*
7118encoding_map_size(PyObject *obj, PyObject* args)
7119{
7120 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007121 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007122 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007123}
7124
7125static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007126 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007127 PyDoc_STR("Return the size (in bytes) of this object") },
7128 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007129};
7130
7131static void
7132encoding_map_dealloc(PyObject* o)
7133{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007134 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007135}
7136
7137static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007138 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007139 "EncodingMap", /*tp_name*/
7140 sizeof(struct encoding_map), /*tp_basicsize*/
7141 0, /*tp_itemsize*/
7142 /* methods */
7143 encoding_map_dealloc, /*tp_dealloc*/
7144 0, /*tp_print*/
7145 0, /*tp_getattr*/
7146 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007147 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007148 0, /*tp_repr*/
7149 0, /*tp_as_number*/
7150 0, /*tp_as_sequence*/
7151 0, /*tp_as_mapping*/
7152 0, /*tp_hash*/
7153 0, /*tp_call*/
7154 0, /*tp_str*/
7155 0, /*tp_getattro*/
7156 0, /*tp_setattro*/
7157 0, /*tp_as_buffer*/
7158 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7159 0, /*tp_doc*/
7160 0, /*tp_traverse*/
7161 0, /*tp_clear*/
7162 0, /*tp_richcompare*/
7163 0, /*tp_weaklistoffset*/
7164 0, /*tp_iter*/
7165 0, /*tp_iternext*/
7166 encoding_map_methods, /*tp_methods*/
7167 0, /*tp_members*/
7168 0, /*tp_getset*/
7169 0, /*tp_base*/
7170 0, /*tp_dict*/
7171 0, /*tp_descr_get*/
7172 0, /*tp_descr_set*/
7173 0, /*tp_dictoffset*/
7174 0, /*tp_init*/
7175 0, /*tp_alloc*/
7176 0, /*tp_new*/
7177 0, /*tp_free*/
7178 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007179};
7180
7181PyObject*
7182PyUnicode_BuildEncodingMap(PyObject* string)
7183{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007184 PyObject *result;
7185 struct encoding_map *mresult;
7186 int i;
7187 int need_dict = 0;
7188 unsigned char level1[32];
7189 unsigned char level2[512];
7190 unsigned char *mlevel1, *mlevel2, *mlevel3;
7191 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007192 int kind;
7193 void *data;
7194 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007195
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007196 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007197 PyErr_BadArgument();
7198 return NULL;
7199 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007200 kind = PyUnicode_KIND(string);
7201 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007202 memset(level1, 0xFF, sizeof level1);
7203 memset(level2, 0xFF, sizeof level2);
7204
7205 /* If there isn't a one-to-one mapping of NULL to \0,
7206 or if there are non-BMP characters, we need to use
7207 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007208 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007209 need_dict = 1;
7210 for (i = 1; i < 256; i++) {
7211 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007212 ch = PyUnicode_READ(kind, data, i);
7213 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007214 need_dict = 1;
7215 break;
7216 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007217 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007218 /* unmapped character */
7219 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007220 l1 = ch >> 11;
7221 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007222 if (level1[l1] == 0xFF)
7223 level1[l1] = count2++;
7224 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007225 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007226 }
7227
7228 if (count2 >= 0xFF || count3 >= 0xFF)
7229 need_dict = 1;
7230
7231 if (need_dict) {
7232 PyObject *result = PyDict_New();
7233 PyObject *key, *value;
7234 if (!result)
7235 return NULL;
7236 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007237 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007238 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007239 if (!key || !value)
7240 goto failed1;
7241 if (PyDict_SetItem(result, key, value) == -1)
7242 goto failed1;
7243 Py_DECREF(key);
7244 Py_DECREF(value);
7245 }
7246 return result;
7247 failed1:
7248 Py_XDECREF(key);
7249 Py_XDECREF(value);
7250 Py_DECREF(result);
7251 return NULL;
7252 }
7253
7254 /* Create a three-level trie */
7255 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7256 16*count2 + 128*count3 - 1);
7257 if (!result)
7258 return PyErr_NoMemory();
7259 PyObject_Init(result, &EncodingMapType);
7260 mresult = (struct encoding_map*)result;
7261 mresult->count2 = count2;
7262 mresult->count3 = count3;
7263 mlevel1 = mresult->level1;
7264 mlevel2 = mresult->level23;
7265 mlevel3 = mresult->level23 + 16*count2;
7266 memcpy(mlevel1, level1, 32);
7267 memset(mlevel2, 0xFF, 16*count2);
7268 memset(mlevel3, 0, 128*count3);
7269 count3 = 0;
7270 for (i = 1; i < 256; i++) {
7271 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007272 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007273 /* unmapped character */
7274 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007275 o1 = PyUnicode_READ(kind, data, i)>>11;
7276 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007277 i2 = 16*mlevel1[o1] + o2;
7278 if (mlevel2[i2] == 0xFF)
7279 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007280 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007281 i3 = 128*mlevel2[i2] + o3;
7282 mlevel3[i3] = i;
7283 }
7284 return result;
7285}
7286
7287static int
7288encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7289{
7290 struct encoding_map *map = (struct encoding_map*)mapping;
7291 int l1 = c>>11;
7292 int l2 = (c>>7) & 0xF;
7293 int l3 = c & 0x7F;
7294 int i;
7295
7296#ifdef Py_UNICODE_WIDE
7297 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007298 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007299 }
7300#endif
7301 if (c == 0)
7302 return 0;
7303 /* level 1*/
7304 i = map->level1[l1];
7305 if (i == 0xFF) {
7306 return -1;
7307 }
7308 /* level 2*/
7309 i = map->level23[16*i+l2];
7310 if (i == 0xFF) {
7311 return -1;
7312 }
7313 /* level 3 */
7314 i = map->level23[16*map->count2 + 128*i + l3];
7315 if (i == 0) {
7316 return -1;
7317 }
7318 return i;
7319}
7320
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007321/* Lookup the character ch in the mapping. If the character
7322 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007323 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007324static PyObject *
7325charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007326{
Christian Heimes217cfd12007-12-02 14:31:20 +00007327 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007328 PyObject *x;
7329
7330 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007331 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007332 x = PyObject_GetItem(mapping, w);
7333 Py_DECREF(w);
7334 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007335 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7336 /* No mapping found means: mapping is undefined. */
7337 PyErr_Clear();
7338 x = Py_None;
7339 Py_INCREF(x);
7340 return x;
7341 } else
7342 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007343 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007344 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007345 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007346 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007347 long value = PyLong_AS_LONG(x);
7348 if (value < 0 || value > 255) {
7349 PyErr_SetString(PyExc_TypeError,
7350 "character mapping must be in range(256)");
7351 Py_DECREF(x);
7352 return NULL;
7353 }
7354 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007355 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007356 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007357 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007358 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007359 /* wrong return value */
7360 PyErr_Format(PyExc_TypeError,
7361 "character mapping must return integer, bytes or None, not %.400s",
7362 x->ob_type->tp_name);
7363 Py_DECREF(x);
7364 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007365 }
7366}
7367
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007368static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007369charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007370{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007371 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7372 /* exponentially overallocate to minimize reallocations */
7373 if (requiredsize < 2*outsize)
7374 requiredsize = 2*outsize;
7375 if (_PyBytes_Resize(outobj, requiredsize))
7376 return -1;
7377 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007378}
7379
/* Outcome of an attempt to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* the lookup or a buffer resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007383/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007384 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007385 space is available. Return a new reference to the object that
7386 was put in the output buffer, or Py_None, if the mapping was undefined
7387 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007388 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007389static charmapencode_result
7390charmapencode_output(Py_UNICODE c, PyObject *mapping,
7391 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007392{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007393 PyObject *rep;
7394 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007395 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007396
Christian Heimes90aa7642007-12-19 02:45:37 +00007397 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007398 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007399 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007400 if (res == -1)
7401 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007402 if (outsize<requiredsize)
7403 if (charmapencode_resize(outobj, outpos, requiredsize))
7404 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007405 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007406 outstart[(*outpos)++] = (char)res;
7407 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007408 }
7409
7410 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007411 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007412 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007413 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007414 Py_DECREF(rep);
7415 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007416 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007417 if (PyLong_Check(rep)) {
7418 Py_ssize_t requiredsize = *outpos+1;
7419 if (outsize<requiredsize)
7420 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7421 Py_DECREF(rep);
7422 return enc_EXCEPTION;
7423 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007424 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007425 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007426 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007427 else {
7428 const char *repchars = PyBytes_AS_STRING(rep);
7429 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7430 Py_ssize_t requiredsize = *outpos+repsize;
7431 if (outsize<requiredsize)
7432 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7433 Py_DECREF(rep);
7434 return enc_EXCEPTION;
7435 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007436 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007437 memcpy(outstart + *outpos, repchars, repsize);
7438 *outpos += repsize;
7439 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007440 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007441 Py_DECREF(rep);
7442 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007443}
7444
7445/* handle an error in PyUnicode_EncodeCharmap
7446 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007447static int
7448charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00007449 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007450 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007451 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007452 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007453{
7454 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007455 Py_ssize_t repsize;
7456 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007457 Py_UNICODE *uni2;
7458 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007459 Py_ssize_t collstartpos = *inpos;
7460 Py_ssize_t collendpos = *inpos+1;
7461 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007462 char *encoding = "charmap";
7463 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007464 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007465
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007466 /* find all unencodable characters */
7467 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007468 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007469 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007470 int res = encoding_map_lookup(p[collendpos], mapping);
7471 if (res != -1)
7472 break;
7473 ++collendpos;
7474 continue;
7475 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007476
Benjamin Peterson29060642009-01-31 22:14:21 +00007477 rep = charmapencode_lookup(p[collendpos], mapping);
7478 if (rep==NULL)
7479 return -1;
7480 else if (rep!=Py_None) {
7481 Py_DECREF(rep);
7482 break;
7483 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007484 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007485 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007486 }
7487 /* cache callback name lookup
7488 * (if not done yet, i.e. it's the first error) */
7489 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007490 if ((errors==NULL) || (!strcmp(errors, "strict")))
7491 *known_errorHandler = 1;
7492 else if (!strcmp(errors, "replace"))
7493 *known_errorHandler = 2;
7494 else if (!strcmp(errors, "ignore"))
7495 *known_errorHandler = 3;
7496 else if (!strcmp(errors, "xmlcharrefreplace"))
7497 *known_errorHandler = 4;
7498 else
7499 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007500 }
7501 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007502 case 1: /* strict */
7503 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7504 return -1;
7505 case 2: /* replace */
7506 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007507 x = charmapencode_output('?', mapping, res, respos);
7508 if (x==enc_EXCEPTION) {
7509 return -1;
7510 }
7511 else if (x==enc_FAILED) {
7512 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7513 return -1;
7514 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007515 }
7516 /* fall through */
7517 case 3: /* ignore */
7518 *inpos = collendpos;
7519 break;
7520 case 4: /* xmlcharrefreplace */
7521 /* generate replacement (temporarily (mis)uses p) */
7522 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007523 char buffer[2+29+1+1];
7524 char *cp;
7525 sprintf(buffer, "&#%d;", (int)p[collpos]);
7526 for (cp = buffer; *cp; ++cp) {
7527 x = charmapencode_output(*cp, mapping, res, respos);
7528 if (x==enc_EXCEPTION)
7529 return -1;
7530 else if (x==enc_FAILED) {
7531 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7532 return -1;
7533 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007534 }
7535 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007536 *inpos = collendpos;
7537 break;
7538 default:
7539 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007540 encoding, reason, p, size, exceptionObject,
7541 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007542 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007543 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007544 if (PyBytes_Check(repunicode)) {
7545 /* Directly copy bytes result to output. */
7546 Py_ssize_t outsize = PyBytes_Size(*res);
7547 Py_ssize_t requiredsize;
7548 repsize = PyBytes_Size(repunicode);
7549 requiredsize = *respos + repsize;
7550 if (requiredsize > outsize)
7551 /* Make room for all additional bytes. */
7552 if (charmapencode_resize(res, respos, requiredsize)) {
7553 Py_DECREF(repunicode);
7554 return -1;
7555 }
7556 memcpy(PyBytes_AsString(*res) + *respos,
7557 PyBytes_AsString(repunicode), repsize);
7558 *respos += repsize;
7559 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007560 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007561 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007562 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007563 /* generate replacement */
7564 repsize = PyUnicode_GET_SIZE(repunicode);
7565 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007566 x = charmapencode_output(*uni2, mapping, res, respos);
7567 if (x==enc_EXCEPTION) {
7568 return -1;
7569 }
7570 else if (x==enc_FAILED) {
7571 Py_DECREF(repunicode);
7572 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7573 return -1;
7574 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007575 }
7576 *inpos = newpos;
7577 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007578 }
7579 return 0;
7580}
7581
Alexander Belopolsky40018472011-02-26 01:02:56 +00007582PyObject *
7583PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7584 Py_ssize_t size,
7585 PyObject *mapping,
7586 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007587{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007588 /* output object */
7589 PyObject *res = NULL;
7590 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007591 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007592 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007593 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007594 PyObject *errorHandler = NULL;
7595 PyObject *exc = NULL;
7596 /* the following variable is used for caching string comparisons
7597 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7598 * 3=ignore, 4=xmlcharrefreplace */
7599 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007600
7601 /* Default to Latin-1 */
7602 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007603 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007604
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007605 /* allocate enough for a simple encoding without
7606 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007607 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007608 if (res == NULL)
7609 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007610 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007611 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007612
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007613 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007614 /* try to encode it */
7615 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7616 if (x==enc_EXCEPTION) /* error */
7617 goto onError;
7618 if (x==enc_FAILED) { /* unencodable character */
7619 if (charmap_encoding_error(p, size, &inpos, mapping,
7620 &exc,
7621 &known_errorHandler, &errorHandler, errors,
7622 &res, &respos)) {
7623 goto onError;
7624 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007625 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007626 else
7627 /* done with this character => adjust input position */
7628 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007629 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007630
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007631 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007632 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007633 if (_PyBytes_Resize(&res, respos) < 0)
7634 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007635
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007636 Py_XDECREF(exc);
7637 Py_XDECREF(errorHandler);
7638 return res;
7639
Benjamin Peterson29060642009-01-31 22:14:21 +00007640 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007641 Py_XDECREF(res);
7642 Py_XDECREF(exc);
7643 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007644 return NULL;
7645}
7646
Alexander Belopolsky40018472011-02-26 01:02:56 +00007647PyObject *
7648PyUnicode_AsCharmapString(PyObject *unicode,
7649 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007650{
7651 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007652 PyErr_BadArgument();
7653 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007654 }
7655 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007656 PyUnicode_GET_SIZE(unicode),
7657 mapping,
7658 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007659}
7660
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007661/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007662static void
7663make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007664 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007665 Py_ssize_t startpos, Py_ssize_t endpos,
7666 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007667{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007668 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007669 *exceptionObject = _PyUnicodeTranslateError_Create(
7670 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007671 }
7672 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007673 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7674 goto onError;
7675 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7676 goto onError;
7677 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7678 goto onError;
7679 return;
7680 onError:
7681 Py_DECREF(*exceptionObject);
7682 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007683 }
7684}
7685
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007686/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007687static void
7688raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007689 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007690 Py_ssize_t startpos, Py_ssize_t endpos,
7691 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007692{
7693 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007694 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007695 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007696 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007697}
7698
7699/* error handling callback helper:
7700 build arguments, call the callback and check the arguments,
7701 put the result into newpos and return the replacement string, which
7702 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007703static PyObject *
7704unicode_translate_call_errorhandler(const char *errors,
7705 PyObject **errorHandler,
7706 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007707 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007708 Py_ssize_t startpos, Py_ssize_t endpos,
7709 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007710{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007711 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007712
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007713 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007714 PyObject *restuple;
7715 PyObject *resunicode;
7716
7717 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007718 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007719 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007720 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007721 }
7722
7723 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007724 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007725 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007726 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007727
7728 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007729 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007730 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007731 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007732 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007733 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007734 Py_DECREF(restuple);
7735 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007736 }
7737 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007738 &resunicode, &i_newpos)) {
7739 Py_DECREF(restuple);
7740 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007741 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007742 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007743 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007744 else
7745 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007746 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007747 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7748 Py_DECREF(restuple);
7749 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007750 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007751 Py_INCREF(resunicode);
7752 Py_DECREF(restuple);
7753 return resunicode;
7754}
7755
7756/* Lookup the character ch in the mapping and put the result in result,
7757 which must be decrefed by the caller.
7758 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007759static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007760charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007761{
Christian Heimes217cfd12007-12-02 14:31:20 +00007762 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007763 PyObject *x;
7764
7765 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007766 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007767 x = PyObject_GetItem(mapping, w);
7768 Py_DECREF(w);
7769 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007770 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7771 /* No mapping found means: use 1:1 mapping. */
7772 PyErr_Clear();
7773 *result = NULL;
7774 return 0;
7775 } else
7776 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007777 }
7778 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007779 *result = x;
7780 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007781 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007782 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007783 long value = PyLong_AS_LONG(x);
7784 long max = PyUnicode_GetMax();
7785 if (value < 0 || value > max) {
7786 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007787 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007788 Py_DECREF(x);
7789 return -1;
7790 }
7791 *result = x;
7792 return 0;
7793 }
7794 else if (PyUnicode_Check(x)) {
7795 *result = x;
7796 return 0;
7797 }
7798 else {
7799 /* wrong return value */
7800 PyErr_SetString(PyExc_TypeError,
7801 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007802 Py_DECREF(x);
7803 return -1;
7804 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007805}
7806/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007807 if not reallocate and adjust various state variables.
7808 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007809static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007810charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007811 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007812{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007813 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007814 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007815 /* exponentially overallocate to minimize reallocations */
7816 if (requiredsize < 2 * oldsize)
7817 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007818 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7819 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007820 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007821 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007822 }
7823 return 0;
7824}
7825/* lookup the character, put the result in the output string and adjust
7826 various state variables. Return a new reference to the object that
7827 was put in the output buffer in *result, or Py_None, if the mapping was
7828 undefined (in which case no character was written).
7829 The called must decref result.
7830 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007831static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007832charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7833 PyObject *mapping, Py_UCS4 **output,
7834 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007835 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007836{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007837 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7838 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007839 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007840 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007841 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007842 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007843 }
7844 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007845 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007846 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007847 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007848 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007849 }
7850 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007851 Py_ssize_t repsize;
7852 if (PyUnicode_READY(*res) == -1)
7853 return -1;
7854 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007855 if (repsize==1) {
7856 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007857 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007858 }
7859 else if (repsize!=0) {
7860 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007861 Py_ssize_t requiredsize = *opos +
7862 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007863 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007864 Py_ssize_t i;
7865 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007866 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007867 for(i = 0; i < repsize; i++)
7868 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007869 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007870 }
7871 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007872 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007873 return 0;
7874}
7875
Alexander Belopolsky40018472011-02-26 01:02:56 +00007876PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007877_PyUnicode_TranslateCharmap(PyObject *input,
7878 PyObject *mapping,
7879 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007880{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007881 /* input object */
7882 char *idata;
7883 Py_ssize_t size, i;
7884 int kind;
7885 /* output buffer */
7886 Py_UCS4 *output = NULL;
7887 Py_ssize_t osize;
7888 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007889 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007890 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007891 char *reason = "character maps to <undefined>";
7892 PyObject *errorHandler = NULL;
7893 PyObject *exc = NULL;
7894 /* the following variable is used for caching string comparisons
7895 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7896 * 3=ignore, 4=xmlcharrefreplace */
7897 int known_errorHandler = -1;
7898
Guido van Rossumd57fd912000-03-10 22:53:23 +00007899 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007900 PyErr_BadArgument();
7901 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007902 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007903
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007904 if (PyUnicode_READY(input) == -1)
7905 return NULL;
7906 idata = (char*)PyUnicode_DATA(input);
7907 kind = PyUnicode_KIND(input);
7908 size = PyUnicode_GET_LENGTH(input);
7909 i = 0;
7910
7911 if (size == 0) {
7912 Py_INCREF(input);
7913 return input;
7914 }
7915
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007916 /* allocate enough for a simple 1:1 translation without
7917 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007918 osize = size;
7919 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7920 opos = 0;
7921 if (output == NULL) {
7922 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007923 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007924 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007925
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007926 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007927 /* try to encode it */
7928 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007929 if (charmaptranslate_output(input, i, mapping,
7930 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007931 Py_XDECREF(x);
7932 goto onError;
7933 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007934 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00007935 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007936 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007937 else { /* untranslatable character */
7938 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7939 Py_ssize_t repsize;
7940 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007941 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00007942 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007943 Py_ssize_t collstart = i;
7944 Py_ssize_t collend = i+1;
7945 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007946
Benjamin Peterson29060642009-01-31 22:14:21 +00007947 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007948 while (collend < size) {
7949 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007950 goto onError;
7951 Py_XDECREF(x);
7952 if (x!=Py_None)
7953 break;
7954 ++collend;
7955 }
7956 /* cache callback name lookup
7957 * (if not done yet, i.e. it's the first error) */
7958 if (known_errorHandler==-1) {
7959 if ((errors==NULL) || (!strcmp(errors, "strict")))
7960 known_errorHandler = 1;
7961 else if (!strcmp(errors, "replace"))
7962 known_errorHandler = 2;
7963 else if (!strcmp(errors, "ignore"))
7964 known_errorHandler = 3;
7965 else if (!strcmp(errors, "xmlcharrefreplace"))
7966 known_errorHandler = 4;
7967 else
7968 known_errorHandler = 0;
7969 }
7970 switch (known_errorHandler) {
7971 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007972 raise_translate_exception(&exc, input, collstart,
7973 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007974 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007975 case 2: /* replace */
7976 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007977 for (coll = collstart; coll<collend; coll++)
7978 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00007979 /* fall through */
7980 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007981 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007982 break;
7983 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007984 /* generate replacement (temporarily (mis)uses i) */
7985 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007986 char buffer[2+29+1+1];
7987 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007988 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
7989 if (charmaptranslate_makespace(&output, &osize,
7990 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007991 goto onError;
7992 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007993 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00007994 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007995 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007996 break;
7997 default:
7998 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007999 reason, input, &exc,
8000 collstart, collend, &newpos);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02008001 if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00008002 goto onError;
8003 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008004 repsize = PyUnicode_GET_LENGTH(repunicode);
8005 if (charmaptranslate_makespace(&output, &osize,
8006 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008007 Py_DECREF(repunicode);
8008 goto onError;
8009 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008010 for (uni2 = 0; repsize-->0; ++uni2)
8011 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8012 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008013 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008014 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008015 }
8016 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008017 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8018 if (!res)
8019 goto onError;
8020 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008021 Py_XDECREF(exc);
8022 Py_XDECREF(errorHandler);
8023 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008024
Benjamin Peterson29060642009-01-31 22:14:21 +00008025 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008026 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008027 Py_XDECREF(exc);
8028 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008029 return NULL;
8030}
8031
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008032/* Deprecated. Use PyUnicode_Translate instead. */
8033PyObject *
8034PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8035 Py_ssize_t size,
8036 PyObject *mapping,
8037 const char *errors)
8038{
8039 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8040 if (!unicode)
8041 return NULL;
8042 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8043}
8044
Alexander Belopolsky40018472011-02-26 01:02:56 +00008045PyObject *
8046PyUnicode_Translate(PyObject *str,
8047 PyObject *mapping,
8048 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008049{
8050 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008051
Guido van Rossumd57fd912000-03-10 22:53:23 +00008052 str = PyUnicode_FromObject(str);
8053 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008054 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008055 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008056 Py_DECREF(str);
8057 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008058
Benjamin Peterson29060642009-01-31 22:14:21 +00008059 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008060 Py_XDECREF(str);
8061 return NULL;
8062}
Tim Petersced69f82003-09-16 20:30:58 +00008063
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008064static Py_UCS4
8065fix_decimal_and_space_to_ascii(PyUnicodeObject *self)
8066{
8067 /* No need to call PyUnicode_READY(self) because this function is only
8068 called as a callback from fixup() which does it already. */
8069 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8070 const int kind = PyUnicode_KIND(self);
8071 void *data = PyUnicode_DATA(self);
8072 Py_UCS4 maxchar = 0, ch, fixed;
8073 Py_ssize_t i;
8074
8075 for (i = 0; i < len; ++i) {
8076 ch = PyUnicode_READ(kind, data, i);
8077 fixed = 0;
8078 if (ch > 127) {
8079 if (Py_UNICODE_ISSPACE(ch))
8080 fixed = ' ';
8081 else {
8082 const int decimal = Py_UNICODE_TODECIMAL(ch);
8083 if (decimal >= 0)
8084 fixed = '0' + decimal;
8085 }
8086 if (fixed != 0) {
8087 if (fixed > maxchar)
8088 maxchar = fixed;
8089 PyUnicode_WRITE(kind, data, i, fixed);
8090 }
8091 else if (ch > maxchar)
8092 maxchar = ch;
8093 }
8094 else if (ch > maxchar)
8095 maxchar = ch;
8096 }
8097
8098 return maxchar;
8099}
8100
8101PyObject *
8102_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8103{
8104 if (!PyUnicode_Check(unicode)) {
8105 PyErr_BadInternalCall();
8106 return NULL;
8107 }
8108 if (PyUnicode_READY(unicode) == -1)
8109 return NULL;
8110 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8111 /* If the string is already ASCII, just return the same string */
8112 Py_INCREF(unicode);
8113 return unicode;
8114 }
8115 return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii);
8116}
8117
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008118PyObject *
8119PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8120 Py_ssize_t length)
8121{
8122 PyObject *result;
8123 Py_UNICODE *p; /* write pointer into result */
8124 Py_ssize_t i;
8125 /* Copy to a new string */
8126 result = (PyObject *)_PyUnicode_New(length);
8127 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
8128 if (result == NULL)
8129 return result;
8130 p = PyUnicode_AS_UNICODE(result);
8131 /* Iterate over code points */
8132 for (i = 0; i < length; i++) {
8133 Py_UNICODE ch =s[i];
8134 if (ch > 127) {
8135 int decimal = Py_UNICODE_TODECIMAL(ch);
8136 if (decimal >= 0)
8137 p[i] = '0' + decimal;
8138 }
8139 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008140#ifndef DONT_MAKE_RESULT_READY
8141 if (_PyUnicode_READY_REPLACE(&result)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008142 Py_DECREF(result);
8143 return NULL;
8144 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008145#endif
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008146 return result;
8147}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008148/* --- Decimal Encoder ---------------------------------------------------- */
8149
Alexander Belopolsky40018472011-02-26 01:02:56 +00008150int
8151PyUnicode_EncodeDecimal(Py_UNICODE *s,
8152 Py_ssize_t length,
8153 char *output,
8154 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008155{
8156 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008157 PyObject *errorHandler = NULL;
8158 PyObject *exc = NULL;
8159 const char *encoding = "decimal";
8160 const char *reason = "invalid decimal Unicode string";
8161 /* the following variable is used for caching string comparisons
8162 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8163 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008164
8165 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008166 PyErr_BadArgument();
8167 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008168 }
8169
8170 p = s;
8171 end = s + length;
8172 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008173 register Py_UNICODE ch = *p;
8174 int decimal;
8175 PyObject *repunicode;
8176 Py_ssize_t repsize;
8177 Py_ssize_t newpos;
8178 Py_UNICODE *uni2;
8179 Py_UNICODE *collstart;
8180 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00008181
Benjamin Peterson29060642009-01-31 22:14:21 +00008182 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008183 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00008184 ++p;
8185 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008186 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008187 decimal = Py_UNICODE_TODECIMAL(ch);
8188 if (decimal >= 0) {
8189 *output++ = '0' + decimal;
8190 ++p;
8191 continue;
8192 }
8193 if (0 < ch && ch < 256) {
8194 *output++ = (char)ch;
8195 ++p;
8196 continue;
8197 }
8198 /* All other characters are considered unencodable */
8199 collstart = p;
8200 collend = p+1;
8201 while (collend < end) {
8202 if ((0 < *collend && *collend < 256) ||
8203 !Py_UNICODE_ISSPACE(*collend) ||
8204 Py_UNICODE_TODECIMAL(*collend))
8205 break;
8206 }
8207 /* cache callback name lookup
8208 * (if not done yet, i.e. it's the first error) */
8209 if (known_errorHandler==-1) {
8210 if ((errors==NULL) || (!strcmp(errors, "strict")))
8211 known_errorHandler = 1;
8212 else if (!strcmp(errors, "replace"))
8213 known_errorHandler = 2;
8214 else if (!strcmp(errors, "ignore"))
8215 known_errorHandler = 3;
8216 else if (!strcmp(errors, "xmlcharrefreplace"))
8217 known_errorHandler = 4;
8218 else
8219 known_errorHandler = 0;
8220 }
8221 switch (known_errorHandler) {
8222 case 1: /* strict */
8223 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
8224 goto onError;
8225 case 2: /* replace */
8226 for (p = collstart; p < collend; ++p)
8227 *output++ = '?';
8228 /* fall through */
8229 case 3: /* ignore */
8230 p = collend;
8231 break;
8232 case 4: /* xmlcharrefreplace */
8233 /* generate replacement (temporarily (mis)uses p) */
8234 for (p = collstart; p < collend; ++p)
8235 output += sprintf(output, "&#%d;", (int)*p);
8236 p = collend;
8237 break;
8238 default:
8239 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8240 encoding, reason, s, length, &exc,
8241 collstart-s, collend-s, &newpos);
8242 if (repunicode == NULL)
8243 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008244 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008245 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008246 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8247 Py_DECREF(repunicode);
8248 goto onError;
8249 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008250 /* generate replacement */
8251 repsize = PyUnicode_GET_SIZE(repunicode);
8252 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8253 Py_UNICODE ch = *uni2;
8254 if (Py_UNICODE_ISSPACE(ch))
8255 *output++ = ' ';
8256 else {
8257 decimal = Py_UNICODE_TODECIMAL(ch);
8258 if (decimal >= 0)
8259 *output++ = '0' + decimal;
8260 else if (0 < ch && ch < 256)
8261 *output++ = (char)ch;
8262 else {
8263 Py_DECREF(repunicode);
8264 raise_encode_exception(&exc, encoding,
8265 s, length, collstart-s, collend-s, reason);
8266 goto onError;
8267 }
8268 }
8269 }
8270 p = s + newpos;
8271 Py_DECREF(repunicode);
8272 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008273 }
8274 /* 0-terminate the output string */
8275 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008276 Py_XDECREF(exc);
8277 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008278 return 0;
8279
Benjamin Peterson29060642009-01-31 22:14:21 +00008280 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008281 Py_XDECREF(exc);
8282 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008283 return -1;
8284}
8285
Guido van Rossumd57fd912000-03-10 22:53:23 +00008286/* --- Helpers ------------------------------------------------------------ */
8287
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008288#include "stringlib/ucs1lib.h"
8289#include "stringlib/fastsearch.h"
8290#include "stringlib/partition.h"
8291#include "stringlib/split.h"
8292#include "stringlib/count.h"
8293#include "stringlib/find.h"
8294#include "stringlib/localeutil.h"
8295#include "stringlib/undef.h"
8296
8297#include "stringlib/ucs2lib.h"
8298#include "stringlib/fastsearch.h"
8299#include "stringlib/partition.h"
8300#include "stringlib/split.h"
8301#include "stringlib/count.h"
8302#include "stringlib/find.h"
8303#include "stringlib/localeutil.h"
8304#include "stringlib/undef.h"
8305
8306#include "stringlib/ucs4lib.h"
8307#include "stringlib/fastsearch.h"
8308#include "stringlib/partition.h"
8309#include "stringlib/split.h"
8310#include "stringlib/count.h"
8311#include "stringlib/find.h"
8312#include "stringlib/localeutil.h"
8313#include "stringlib/undef.h"
8314
8315static Py_ssize_t
8316any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
8317 const Py_UCS1*, Py_ssize_t,
8318 Py_ssize_t, Py_ssize_t),
8319 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
8320 const Py_UCS2*, Py_ssize_t,
8321 Py_ssize_t, Py_ssize_t),
8322 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
8323 const Py_UCS4*, Py_ssize_t,
8324 Py_ssize_t, Py_ssize_t),
8325 PyObject* s1, PyObject* s2,
8326 Py_ssize_t start,
8327 Py_ssize_t end)
8328{
8329 int kind1, kind2, kind;
8330 void *buf1, *buf2;
8331 Py_ssize_t len1, len2, result;
8332
8333 kind1 = PyUnicode_KIND(s1);
8334 kind2 = PyUnicode_KIND(s2);
8335 kind = kind1 > kind2 ? kind1 : kind2;
8336 buf1 = PyUnicode_DATA(s1);
8337 buf2 = PyUnicode_DATA(s2);
8338 if (kind1 != kind)
8339 buf1 = _PyUnicode_AsKind(s1, kind);
8340 if (!buf1)
8341 return -2;
8342 if (kind2 != kind)
8343 buf2 = _PyUnicode_AsKind(s2, kind);
8344 if (!buf2) {
8345 if (kind1 != kind) PyMem_Free(buf1);
8346 return -2;
8347 }
8348 len1 = PyUnicode_GET_LENGTH(s1);
8349 len2 = PyUnicode_GET_LENGTH(s2);
8350
8351 switch(kind) {
8352 case PyUnicode_1BYTE_KIND:
8353 result = ucs1(buf1, len1, buf2, len2, start, end);
8354 break;
8355 case PyUnicode_2BYTE_KIND:
8356 result = ucs2(buf1, len1, buf2, len2, start, end);
8357 break;
8358 case PyUnicode_4BYTE_KIND:
8359 result = ucs4(buf1, len1, buf2, len2, start, end);
8360 break;
8361 default:
8362 assert(0); result = -2;
8363 }
8364
8365 if (kind1 != kind)
8366 PyMem_Free(buf1);
8367 if (kind2 != kind)
8368 PyMem_Free(buf2);
8369
8370 return result;
8371}
8372
8373Py_ssize_t
8374_PyUnicode_InsertThousandsGrouping(int kind, void *data,
8375 Py_ssize_t n_buffer,
8376 void *digits, Py_ssize_t n_digits,
8377 Py_ssize_t min_width,
8378 const char *grouping,
8379 const char *thousands_sep)
8380{
8381 switch(kind) {
8382 case PyUnicode_1BYTE_KIND:
8383 return _PyUnicode_ucs1_InsertThousandsGrouping(
8384 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8385 min_width, grouping, thousands_sep);
8386 case PyUnicode_2BYTE_KIND:
8387 return _PyUnicode_ucs2_InsertThousandsGrouping(
8388 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8389 min_width, grouping, thousands_sep);
8390 case PyUnicode_4BYTE_KIND:
8391 return _PyUnicode_ucs4_InsertThousandsGrouping(
8392 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8393 min_width, grouping, thousands_sep);
8394 }
8395 assert(0);
8396 return -1;
8397}
8398
8399
Eric Smith8c663262007-08-25 02:26:07 +00008400#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00008401#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008402
Thomas Wouters477c8d52006-05-27 19:21:47 +00008403#include "stringlib/count.h"
8404#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00008405
/* helper macro to fixup start/end slice values

   start and end must be modifiable lvalues; len is the sequence length.
   end is clamped to len, negative values are taken relative to len and
   then clamped to 0.  start is NOT clamped to len.

   Wrapped in do { } while (0) so that the expansion is a single statement
   and can safely be used as the body of an unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if (end > (len))                        \
            end = (len);                        \
        else if (end < 0) {                     \
            end += (len);                       \
            if (end < 0)                        \
                end = 0;                        \
        }                                       \
        if (start < 0) {                        \
            start += (len);                     \
            if (start < 0)                      \
                start = 0;                      \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008420
Alexander Belopolsky40018472011-02-26 01:02:56 +00008421Py_ssize_t
8422PyUnicode_Count(PyObject *str,
8423 PyObject *substr,
8424 Py_ssize_t start,
8425 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008426{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008427 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008428 PyUnicodeObject* str_obj;
8429 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008430 int kind1, kind2, kind;
8431 void *buf1 = NULL, *buf2 = NULL;
8432 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008433
Thomas Wouters477c8d52006-05-27 19:21:47 +00008434 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008435 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008436 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008437 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02008438 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008439 Py_DECREF(str_obj);
8440 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008441 }
Tim Petersced69f82003-09-16 20:30:58 +00008442
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008443 kind1 = PyUnicode_KIND(str_obj);
8444 kind2 = PyUnicode_KIND(sub_obj);
8445 kind = kind1 > kind2 ? kind1 : kind2;
8446 buf1 = PyUnicode_DATA(str_obj);
8447 if (kind1 != kind)
8448 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
8449 if (!buf1)
8450 goto onError;
8451 buf2 = PyUnicode_DATA(sub_obj);
8452 if (kind2 != kind)
8453 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
8454 if (!buf2)
8455 goto onError;
8456 len1 = PyUnicode_GET_LENGTH(str_obj);
8457 len2 = PyUnicode_GET_LENGTH(sub_obj);
8458
8459 ADJUST_INDICES(start, end, len1);
8460 switch(kind) {
8461 case PyUnicode_1BYTE_KIND:
8462 result = ucs1lib_count(
8463 ((Py_UCS1*)buf1) + start, end - start,
8464 buf2, len2, PY_SSIZE_T_MAX
8465 );
8466 break;
8467 case PyUnicode_2BYTE_KIND:
8468 result = ucs2lib_count(
8469 ((Py_UCS2*)buf1) + start, end - start,
8470 buf2, len2, PY_SSIZE_T_MAX
8471 );
8472 break;
8473 case PyUnicode_4BYTE_KIND:
8474 result = ucs4lib_count(
8475 ((Py_UCS4*)buf1) + start, end - start,
8476 buf2, len2, PY_SSIZE_T_MAX
8477 );
8478 break;
8479 default:
8480 assert(0); result = 0;
8481 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008482
8483 Py_DECREF(sub_obj);
8484 Py_DECREF(str_obj);
8485
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008486 if (kind1 != kind)
8487 PyMem_Free(buf1);
8488 if (kind2 != kind)
8489 PyMem_Free(buf2);
8490
Guido van Rossumd57fd912000-03-10 22:53:23 +00008491 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008492 onError:
8493 Py_DECREF(sub_obj);
8494 Py_DECREF(str_obj);
8495 if (kind1 != kind && buf1)
8496 PyMem_Free(buf1);
8497 if (kind2 != kind && buf2)
8498 PyMem_Free(buf2);
8499 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008500}
8501
Alexander Belopolsky40018472011-02-26 01:02:56 +00008502Py_ssize_t
8503PyUnicode_Find(PyObject *str,
8504 PyObject *sub,
8505 Py_ssize_t start,
8506 Py_ssize_t end,
8507 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008508{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008509 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008510
Guido van Rossumd57fd912000-03-10 22:53:23 +00008511 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008512 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008513 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008514 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008515 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008516 Py_DECREF(str);
8517 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008518 }
Tim Petersced69f82003-09-16 20:30:58 +00008519
Thomas Wouters477c8d52006-05-27 19:21:47 +00008520 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008521 result = any_find_slice(
8522 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
8523 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008524 );
8525 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008526 result = any_find_slice(
8527 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
8528 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008529 );
8530
Guido van Rossumd57fd912000-03-10 22:53:23 +00008531 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008532 Py_DECREF(sub);
8533
Guido van Rossumd57fd912000-03-10 22:53:23 +00008534 return result;
8535}
8536
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008537Py_ssize_t
8538PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8539 Py_ssize_t start, Py_ssize_t end,
8540 int direction)
8541{
8542 char *result;
8543 int kind;
8544 if (PyUnicode_READY(str) == -1)
8545 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008546 if (start < 0 || end < 0) {
8547 PyErr_SetString(PyExc_IndexError, "string index out of range");
8548 return -2;
8549 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008550 if (end > PyUnicode_GET_LENGTH(str))
8551 end = PyUnicode_GET_LENGTH(str);
8552 kind = PyUnicode_KIND(str);
8553 result = findchar(PyUnicode_1BYTE_DATA(str)
8554 + PyUnicode_KIND_SIZE(kind, start),
8555 kind,
8556 end-start, ch, direction);
8557 if (!result)
8558 return -1;
8559 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8560}
8561
Alexander Belopolsky40018472011-02-26 01:02:56 +00008562static int
8563tailmatch(PyUnicodeObject *self,
8564 PyUnicodeObject *substring,
8565 Py_ssize_t start,
8566 Py_ssize_t end,
8567 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008568{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008569 int kind_self;
8570 int kind_sub;
8571 void *data_self;
8572 void *data_sub;
8573 Py_ssize_t offset;
8574 Py_ssize_t i;
8575 Py_ssize_t end_sub;
8576
8577 if (PyUnicode_READY(self) == -1 ||
8578 PyUnicode_READY(substring) == -1)
8579 return 0;
8580
8581 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008582 return 1;
8583
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008584 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8585 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008586 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008587 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008588
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008589 kind_self = PyUnicode_KIND(self);
8590 data_self = PyUnicode_DATA(self);
8591 kind_sub = PyUnicode_KIND(substring);
8592 data_sub = PyUnicode_DATA(substring);
8593 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8594
8595 if (direction > 0)
8596 offset = end;
8597 else
8598 offset = start;
8599
8600 if (PyUnicode_READ(kind_self, data_self, offset) ==
8601 PyUnicode_READ(kind_sub, data_sub, 0) &&
8602 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8603 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8604 /* If both are of the same kind, memcmp is sufficient */
8605 if (kind_self == kind_sub) {
8606 return ! memcmp((char *)data_self +
8607 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8608 data_sub,
8609 PyUnicode_GET_LENGTH(substring) *
8610 PyUnicode_CHARACTER_SIZE(substring));
8611 }
8612 /* otherwise we have to compare each character by first accesing it */
8613 else {
8614 /* We do not need to compare 0 and len(substring)-1 because
8615 the if statement above ensured already that they are equal
8616 when we end up here. */
8617 // TODO: honor direction and do a forward or backwards search
8618 for (i = 1; i < end_sub; ++i) {
8619 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8620 PyUnicode_READ(kind_sub, data_sub, i))
8621 return 0;
8622 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008623 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008624 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008625 }
8626
8627 return 0;
8628}
8629
Alexander Belopolsky40018472011-02-26 01:02:56 +00008630Py_ssize_t
8631PyUnicode_Tailmatch(PyObject *str,
8632 PyObject *substr,
8633 Py_ssize_t start,
8634 Py_ssize_t end,
8635 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008636{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008637 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008638
Guido van Rossumd57fd912000-03-10 22:53:23 +00008639 str = PyUnicode_FromObject(str);
8640 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008641 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008642 substr = PyUnicode_FromObject(substr);
8643 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008644 Py_DECREF(str);
8645 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008646 }
Tim Petersced69f82003-09-16 20:30:58 +00008647
Guido van Rossumd57fd912000-03-10 22:53:23 +00008648 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008649 (PyUnicodeObject *)substr,
8650 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008651 Py_DECREF(str);
8652 Py_DECREF(substr);
8653 return result;
8654}
8655
Guido van Rossumd57fd912000-03-10 22:53:23 +00008656/* Apply fixfct filter to the Unicode object self and return a
8657 reference to the modified object */
8658
Alexander Belopolsky40018472011-02-26 01:02:56 +00008659static PyObject *
8660fixup(PyUnicodeObject *self,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008661 Py_UCS4 (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008662{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008663 PyObject *u;
8664 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008665
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008666 if (PyUnicode_READY(self) == -1)
8667 return NULL;
8668 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8669 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8670 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008671 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008672 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008673
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008674 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8675 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008676
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008677 /* fix functions return the new maximum character in a string,
8678 if the kind of the resulting unicode object does not change,
8679 everything is fine. Otherwise we need to change the string kind
8680 and re-run the fix function. */
8681 maxchar_new = fixfct((PyUnicodeObject*)u);
8682 if (maxchar_new == 0)
8683 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8684 else if (maxchar_new <= 127)
8685 maxchar_new = 127;
8686 else if (maxchar_new <= 255)
8687 maxchar_new = 255;
8688 else if (maxchar_new <= 65535)
8689 maxchar_new = 65535;
8690 else
8691 maxchar_new = 1114111; /* 0x10ffff */
8692
8693 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008694 /* fixfct should return TRUE if it modified the buffer. If
8695 FALSE, return a reference to the original buffer instead
8696 (to save space, not time) */
8697 Py_INCREF(self);
8698 Py_DECREF(u);
8699 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008700 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008701 else if (maxchar_new == maxchar_old) {
8702 return u;
8703 }
8704 else {
8705 /* In case the maximum character changed, we need to
8706 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008707 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008708 if (v == NULL) {
8709 Py_DECREF(u);
8710 return NULL;
8711 }
8712 if (maxchar_new > maxchar_old) {
8713 /* If the maxchar increased so that the kind changed, not all
8714 characters are representable anymore and we need to fix the
8715 string again. This only happens in very few cases. */
Victor Stinner157f83f2011-09-28 21:41:31 +02008716 if (PyUnicode_CopyCharacters(v, 0,
8717 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008718 PyUnicode_GET_LENGTH(self)) < 0)
8719 {
8720 Py_DECREF(u);
8721 return NULL;
8722 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008723 maxchar_old = fixfct((PyUnicodeObject*)v);
8724 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8725 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008726 else {
Victor Stinner157f83f2011-09-28 21:41:31 +02008727 if (PyUnicode_CopyCharacters(v, 0,
8728 u, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008729 PyUnicode_GET_LENGTH(self)) < 0)
8730 {
8731 Py_DECREF(u);
8732 return NULL;
8733 }
8734 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008735
8736 Py_DECREF(u);
8737 return v;
8738 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008739}
8740
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008741static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008742fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008743{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008744 /* No need to call PyUnicode_READY(self) because this function is only
8745 called as a callback from fixup() which does it already. */
8746 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8747 const int kind = PyUnicode_KIND(self);
8748 void *data = PyUnicode_DATA(self);
8749 int touched = 0;
8750 Py_UCS4 maxchar = 0;
8751 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008752
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008753 for (i = 0; i < len; ++i) {
8754 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8755 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8756 if (up != ch) {
8757 if (up > maxchar)
8758 maxchar = up;
8759 PyUnicode_WRITE(kind, data, i, up);
8760 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008761 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008762 else if (ch > maxchar)
8763 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008764 }
8765
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008766 if (touched)
8767 return maxchar;
8768 else
8769 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008770}
8771
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008772static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008773fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008774{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008775 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8776 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8777 const int kind = PyUnicode_KIND(self);
8778 void *data = PyUnicode_DATA(self);
8779 int touched = 0;
8780 Py_UCS4 maxchar = 0;
8781 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008782
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008783 for(i = 0; i < len; ++i) {
8784 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8785 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8786 if (lo != ch) {
8787 if (lo > maxchar)
8788 maxchar = lo;
8789 PyUnicode_WRITE(kind, data, i, lo);
8790 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008791 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008792 else if (ch > maxchar)
8793 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008794 }
8795
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008796 if (touched)
8797 return maxchar;
8798 else
8799 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008800}
8801
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008802static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008803fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008804{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008805 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8806 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8807 const int kind = PyUnicode_KIND(self);
8808 void *data = PyUnicode_DATA(self);
8809 int touched = 0;
8810 Py_UCS4 maxchar = 0;
8811 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008812
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008813 for(i = 0; i < len; ++i) {
8814 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8815 Py_UCS4 nu = 0;
8816
8817 if (Py_UNICODE_ISUPPER(ch))
8818 nu = Py_UNICODE_TOLOWER(ch);
8819 else if (Py_UNICODE_ISLOWER(ch))
8820 nu = Py_UNICODE_TOUPPER(ch);
8821
8822 if (nu != 0) {
8823 if (nu > maxchar)
8824 maxchar = nu;
8825 PyUnicode_WRITE(kind, data, i, nu);
8826 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008827 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008828 else if (ch > maxchar)
8829 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008830 }
8831
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008832 if (touched)
8833 return maxchar;
8834 else
8835 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008836}
8837
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008838static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008839fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008840{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008841 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8842 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8843 const int kind = PyUnicode_KIND(self);
8844 void *data = PyUnicode_DATA(self);
8845 int touched = 0;
8846 Py_UCS4 maxchar = 0;
8847 Py_ssize_t i = 0;
8848 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008849
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008850 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008851 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008852
8853 ch = PyUnicode_READ(kind, data, i);
8854 if (!Py_UNICODE_ISUPPER(ch)) {
8855 maxchar = Py_UNICODE_TOUPPER(ch);
8856 PyUnicode_WRITE(kind, data, i, maxchar);
8857 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008858 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008859 ++i;
8860 for(; i < len; ++i) {
8861 ch = PyUnicode_READ(kind, data, i);
8862 if (!Py_UNICODE_ISLOWER(ch)) {
8863 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8864 if (lo > maxchar)
8865 maxchar = lo;
8866 PyUnicode_WRITE(kind, data, i, lo);
8867 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008868 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008869 else if (ch > maxchar)
8870 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008871 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008872
8873 if (touched)
8874 return maxchar;
8875 else
8876 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008877}
8878
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008879static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008880fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008881{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008882 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8883 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8884 const int kind = PyUnicode_KIND(self);
8885 void *data = PyUnicode_DATA(self);
8886 Py_UCS4 maxchar = 0;
8887 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008888 int previous_is_cased;
8889
8890 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008891 if (len == 1) {
8892 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8893 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8894 if (ti != ch) {
8895 PyUnicode_WRITE(kind, data, i, ti);
8896 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00008897 }
8898 else
8899 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008900 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008901 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008902 for(; i < len; ++i) {
8903 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8904 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00008905
Benjamin Peterson29060642009-01-31 22:14:21 +00008906 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008907 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00008908 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008909 nu = Py_UNICODE_TOTITLE(ch);
8910
8911 if (nu > maxchar)
8912 maxchar = nu;
8913 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00008914
Benjamin Peterson29060642009-01-31 22:14:21 +00008915 if (Py_UNICODE_ISLOWER(ch) ||
8916 Py_UNICODE_ISUPPER(ch) ||
8917 Py_UNICODE_ISTITLE(ch))
8918 previous_is_cased = 1;
8919 else
8920 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008921 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008922 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008923}
8924
Tim Peters8ce9f162004-08-27 01:49:32 +00008925PyObject *
8926PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008927{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008928 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008929 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008930 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00008931 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008932 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
8933 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00008934 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008935 Py_ssize_t sz, i, res_offset;
8936 Py_UCS4 maxchar = 0;
8937 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008938
Tim Peters05eba1f2004-08-27 21:32:02 +00008939 fseq = PySequence_Fast(seq, "");
8940 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008941 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00008942 }
8943
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008944 /* NOTE: the following code can't call back into Python code,
8945 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00008946 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008947
Tim Peters05eba1f2004-08-27 21:32:02 +00008948 seqlen = PySequence_Fast_GET_SIZE(fseq);
8949 /* If empty sequence, return u"". */
8950 if (seqlen == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008951 res = PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008952 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00008953 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008954 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00008955 /* If singleton sequence with an exact Unicode, return that. */
8956 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008957 item = items[0];
8958 if (PyUnicode_CheckExact(item)) {
8959 Py_INCREF(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008960 res = item;
Benjamin Peterson29060642009-01-31 22:14:21 +00008961 goto Done;
8962 }
Tim Peters8ce9f162004-08-27 01:49:32 +00008963 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008964 else {
8965 /* Set up sep and seplen */
8966 if (separator == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008967 /* fall back to a blank space separator */
8968 sep = PyUnicode_FromOrdinal(' ');
Victor Stinnere9a29352011-10-01 02:14:59 +02008969 if (!sep)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008970 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00008971 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008972 else {
8973 if (!PyUnicode_Check(separator)) {
8974 PyErr_Format(PyExc_TypeError,
8975 "separator: expected str instance,"
8976 " %.80s found",
8977 Py_TYPE(separator)->tp_name);
8978 goto onError;
8979 }
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02008980 if (PyUnicode_READY(separator))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008981 goto onError;
8982 sep = separator;
8983 seplen = PyUnicode_GET_LENGTH(separator);
8984 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
Georg Brandl7597add2011-10-05 16:36:47 +02008985 /* inc refcount to keep this code path symmetric with the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008986 above case of a blank separator */
8987 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00008988 }
8989 }
8990
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008991 /* There are at least two things to join, or else we have a subclass
8992 * of str in the sequence.
8993 * Do a pre-pass to figure out the total amount of space we'll
8994 * need (sz), and see whether all argument are strings.
8995 */
8996 sz = 0;
8997 for (i = 0; i < seqlen; i++) {
8998 const Py_ssize_t old_sz = sz;
8999 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009000 if (!PyUnicode_Check(item)) {
9001 PyErr_Format(PyExc_TypeError,
9002 "sequence item %zd: expected str instance,"
9003 " %.80s found",
9004 i, Py_TYPE(item)->tp_name);
9005 goto onError;
9006 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009007 if (PyUnicode_READY(item) == -1)
9008 goto onError;
9009 sz += PyUnicode_GET_LENGTH(item);
9010 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
9011 if (item_maxchar > maxchar)
9012 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009013 if (i != 0)
9014 sz += seplen;
9015 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9016 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009017 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009018 goto onError;
9019 }
9020 }
Tim Petersced69f82003-09-16 20:30:58 +00009021
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009022 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009023 if (res == NULL)
9024 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009025
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009026 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009027 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinner9ce5a832011-10-03 23:36:02 +02009028 Py_ssize_t itemlen, copied;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009029 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009030 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009031 if (i && seplen != 0) {
9032 copied = PyUnicode_CopyCharacters(res, res_offset,
9033 sep, 0, seplen);
9034 if (copied < 0)
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009035 goto onError;
Victor Stinner9ce5a832011-10-03 23:36:02 +02009036#ifdef Py_DEBUG
9037 res_offset += copied;
9038#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009039 res_offset += seplen;
Victor Stinner9ce5a832011-10-03 23:36:02 +02009040#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00009041 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009042 itemlen = PyUnicode_GET_LENGTH(item);
9043 if (itemlen != 0) {
9044 copied = PyUnicode_CopyCharacters(res, res_offset,
9045 item, 0, itemlen);
9046 if (copied < 0)
9047 goto onError;
9048#ifdef Py_DEBUG
9049 res_offset += copied;
9050#else
9051 res_offset += itemlen;
9052#endif
9053 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009054 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009055 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009056
Benjamin Peterson29060642009-01-31 22:14:21 +00009057 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00009058 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009059 Py_XDECREF(sep);
9060 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009061
Benjamin Peterson29060642009-01-31 22:14:21 +00009062 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009063 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009064 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009065 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009066 return NULL;
9067}
9068
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009069#define FILL(kind, data, value, start, length) \
9070 do { \
9071 Py_ssize_t i_ = 0; \
9072 assert(kind != PyUnicode_WCHAR_KIND); \
9073 switch ((kind)) { \
9074 case PyUnicode_1BYTE_KIND: { \
9075 unsigned char * to_ = (unsigned char *)((data)) + (start); \
9076 memset(to_, (unsigned char)value, length); \
9077 break; \
9078 } \
9079 case PyUnicode_2BYTE_KIND: { \
9080 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
9081 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9082 break; \
9083 } \
9084 default: { \
9085 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
9086 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9087 break; \
9088 } \
9089 } \
9090 } while (0)
9091
Alexander Belopolsky40018472011-02-26 01:02:56 +00009092static PyUnicodeObject *
9093pad(PyUnicodeObject *self,
9094 Py_ssize_t left,
9095 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009096 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009097{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009098 PyObject *u;
9099 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009100 int kind;
9101 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009102
9103 if (left < 0)
9104 left = 0;
9105 if (right < 0)
9106 right = 0;
9107
Tim Peters7a29bd52001-09-12 03:03:31 +00009108 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009109 Py_INCREF(self);
9110 return self;
9111 }
9112
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009113 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9114 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009115 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9116 return NULL;
9117 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009118 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9119 if (fill > maxchar)
9120 maxchar = fill;
9121 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009122 if (!u)
9123 return NULL;
9124
9125 kind = PyUnicode_KIND(u);
9126 data = PyUnicode_DATA(u);
9127 if (left)
9128 FILL(kind, data, fill, 0, left);
9129 if (right)
9130 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinner157f83f2011-09-28 21:41:31 +02009131 if (PyUnicode_CopyCharacters(u, left,
9132 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009133 _PyUnicode_LENGTH(self)) < 0)
9134 {
9135 Py_DECREF(u);
9136 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009137 }
9138
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009139 return (PyUnicodeObject*)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009140}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009141#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009142
Alexander Belopolsky40018472011-02-26 01:02:56 +00009143PyObject *
9144PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009145{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009146 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009147
9148 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009149 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009150 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009151
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009152 switch(PyUnicode_KIND(string)) {
9153 case PyUnicode_1BYTE_KIND:
9154 list = ucs1lib_splitlines(
9155 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9156 PyUnicode_GET_LENGTH(string), keepends);
9157 break;
9158 case PyUnicode_2BYTE_KIND:
9159 list = ucs2lib_splitlines(
9160 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
9161 PyUnicode_GET_LENGTH(string), keepends);
9162 break;
9163 case PyUnicode_4BYTE_KIND:
9164 list = ucs4lib_splitlines(
9165 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
9166 PyUnicode_GET_LENGTH(string), keepends);
9167 break;
9168 default:
9169 assert(0);
9170 list = 0;
9171 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009172 Py_DECREF(string);
9173 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009174}
9175
Alexander Belopolsky40018472011-02-26 01:02:56 +00009176static PyObject *
9177split(PyUnicodeObject *self,
9178 PyUnicodeObject *substring,
9179 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009180{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009181 int kind1, kind2, kind;
9182 void *buf1, *buf2;
9183 Py_ssize_t len1, len2;
9184 PyObject* out;
9185
Guido van Rossumd57fd912000-03-10 22:53:23 +00009186 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009187 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009188
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009189 if (PyUnicode_READY(self) == -1)
9190 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009191
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009192 if (substring == NULL)
9193 switch(PyUnicode_KIND(self)) {
9194 case PyUnicode_1BYTE_KIND:
9195 return ucs1lib_split_whitespace(
9196 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9197 PyUnicode_GET_LENGTH(self), maxcount
9198 );
9199 case PyUnicode_2BYTE_KIND:
9200 return ucs2lib_split_whitespace(
9201 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9202 PyUnicode_GET_LENGTH(self), maxcount
9203 );
9204 case PyUnicode_4BYTE_KIND:
9205 return ucs4lib_split_whitespace(
9206 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9207 PyUnicode_GET_LENGTH(self), maxcount
9208 );
9209 default:
9210 assert(0);
9211 return NULL;
9212 }
9213
9214 if (PyUnicode_READY(substring) == -1)
9215 return NULL;
9216
9217 kind1 = PyUnicode_KIND(self);
9218 kind2 = PyUnicode_KIND(substring);
9219 kind = kind1 > kind2 ? kind1 : kind2;
9220 buf1 = PyUnicode_DATA(self);
9221 buf2 = PyUnicode_DATA(substring);
9222 if (kind1 != kind)
9223 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9224 if (!buf1)
9225 return NULL;
9226 if (kind2 != kind)
9227 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9228 if (!buf2) {
9229 if (kind1 != kind) PyMem_Free(buf1);
9230 return NULL;
9231 }
9232 len1 = PyUnicode_GET_LENGTH(self);
9233 len2 = PyUnicode_GET_LENGTH(substring);
9234
9235 switch(kind) {
9236 case PyUnicode_1BYTE_KIND:
9237 out = ucs1lib_split(
9238 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9239 break;
9240 case PyUnicode_2BYTE_KIND:
9241 out = ucs2lib_split(
9242 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9243 break;
9244 case PyUnicode_4BYTE_KIND:
9245 out = ucs4lib_split(
9246 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9247 break;
9248 default:
9249 out = NULL;
9250 }
9251 if (kind1 != kind)
9252 PyMem_Free(buf1);
9253 if (kind2 != kind)
9254 PyMem_Free(buf2);
9255 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009256}
9257
Alexander Belopolsky40018472011-02-26 01:02:56 +00009258static PyObject *
9259rsplit(PyUnicodeObject *self,
9260 PyUnicodeObject *substring,
9261 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009262{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009263 int kind1, kind2, kind;
9264 void *buf1, *buf2;
9265 Py_ssize_t len1, len2;
9266 PyObject* out;
9267
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009268 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009269 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009270
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009271 if (PyUnicode_READY(self) == -1)
9272 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009273
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009274 if (substring == NULL)
9275 switch(PyUnicode_KIND(self)) {
9276 case PyUnicode_1BYTE_KIND:
9277 return ucs1lib_rsplit_whitespace(
9278 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9279 PyUnicode_GET_LENGTH(self), maxcount
9280 );
9281 case PyUnicode_2BYTE_KIND:
9282 return ucs2lib_rsplit_whitespace(
9283 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9284 PyUnicode_GET_LENGTH(self), maxcount
9285 );
9286 case PyUnicode_4BYTE_KIND:
9287 return ucs4lib_rsplit_whitespace(
9288 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9289 PyUnicode_GET_LENGTH(self), maxcount
9290 );
9291 default:
9292 assert(0);
9293 return NULL;
9294 }
9295
9296 if (PyUnicode_READY(substring) == -1)
9297 return NULL;
9298
9299 kind1 = PyUnicode_KIND(self);
9300 kind2 = PyUnicode_KIND(substring);
9301 kind = kind1 > kind2 ? kind1 : kind2;
9302 buf1 = PyUnicode_DATA(self);
9303 buf2 = PyUnicode_DATA(substring);
9304 if (kind1 != kind)
9305 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9306 if (!buf1)
9307 return NULL;
9308 if (kind2 != kind)
9309 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9310 if (!buf2) {
9311 if (kind1 != kind) PyMem_Free(buf1);
9312 return NULL;
9313 }
9314 len1 = PyUnicode_GET_LENGTH(self);
9315 len2 = PyUnicode_GET_LENGTH(substring);
9316
9317 switch(kind) {
9318 case PyUnicode_1BYTE_KIND:
9319 out = ucs1lib_rsplit(
9320 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9321 break;
9322 case PyUnicode_2BYTE_KIND:
9323 out = ucs2lib_rsplit(
9324 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9325 break;
9326 case PyUnicode_4BYTE_KIND:
9327 out = ucs4lib_rsplit(
9328 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9329 break;
9330 default:
9331 out = NULL;
9332 }
9333 if (kind1 != kind)
9334 PyMem_Free(buf1);
9335 if (kind2 != kind)
9336 PyMem_Free(buf2);
9337 return out;
9338}
9339
9340static Py_ssize_t
9341anylib_find(int kind, void *buf1, Py_ssize_t len1,
9342 void *buf2, Py_ssize_t len2, Py_ssize_t offset)
9343{
9344 switch(kind) {
9345 case PyUnicode_1BYTE_KIND:
9346 return ucs1lib_find(buf1, len1, buf2, len2, offset);
9347 case PyUnicode_2BYTE_KIND:
9348 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9349 case PyUnicode_4BYTE_KIND:
9350 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9351 }
9352 assert(0);
9353 return -1;
9354}
9355
9356static Py_ssize_t
9357anylib_count(int kind, void* sbuf, Py_ssize_t slen,
9358 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
9359{
9360 switch(kind) {
9361 case PyUnicode_1BYTE_KIND:
9362 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9363 case PyUnicode_2BYTE_KIND:
9364 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9365 case PyUnicode_4BYTE_KIND:
9366 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9367 }
9368 assert(0);
9369 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009370}
9371
Alexander Belopolsky40018472011-02-26 01:02:56 +00009372static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009373replace(PyObject *self, PyObject *str1,
9374 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009375{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009376 PyObject *u;
9377 char *sbuf = PyUnicode_DATA(self);
9378 char *buf1 = PyUnicode_DATA(str1);
9379 char *buf2 = PyUnicode_DATA(str2);
9380 int srelease = 0, release1 = 0, release2 = 0;
9381 int skind = PyUnicode_KIND(self);
9382 int kind1 = PyUnicode_KIND(str1);
9383 int kind2 = PyUnicode_KIND(str2);
9384 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9385 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9386 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009387
9388 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009389 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009390 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009391 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009392
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009393 if (skind < kind1)
9394 /* substring too wide to be present */
9395 goto nothing;
9396
9397 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00009398 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009399 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009400 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009401 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009402 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009403 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009404 Py_UCS4 u1, u2, maxchar;
9405 int mayshrink, rkind;
9406 u1 = PyUnicode_READ_CHAR(str1, 0);
9407 if (!findchar(sbuf, PyUnicode_KIND(self),
9408 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00009409 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009410 u2 = PyUnicode_READ_CHAR(str2, 0);
9411 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9412 /* Replacing u1 with u2 may cause a maxchar reduction in the
9413 result string. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009414 if (u2 > maxchar) {
9415 maxchar = u2;
9416 mayshrink = 0;
9417 }
Victor Stinnerb9275c12011-10-05 14:01:42 +02009418 else
9419 mayshrink = maxchar > 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009420 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009421 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009422 goto error;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009423 if (PyUnicode_CopyCharacters(u, 0,
9424 (PyObject*)self, 0, slen) < 0)
9425 {
9426 Py_DECREF(u);
9427 return NULL;
9428 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009429 rkind = PyUnicode_KIND(u);
9430 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
9431 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009432 if (--maxcount < 0)
9433 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009434 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009435 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009436 if (mayshrink) {
9437 PyObject *tmp = u;
9438 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
9439 PyUnicode_GET_LENGTH(tmp));
9440 Py_DECREF(tmp);
9441 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009442 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009443 int rkind = skind;
9444 char *res;
9445 if (kind1 < rkind) {
9446 /* widen substring */
9447 buf1 = _PyUnicode_AsKind(str1, rkind);
9448 if (!buf1) goto error;
9449 release1 = 1;
9450 }
9451 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009452 if (i < 0)
9453 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009454 if (rkind > kind2) {
9455 /* widen replacement */
9456 buf2 = _PyUnicode_AsKind(str2, rkind);
9457 if (!buf2) goto error;
9458 release2 = 1;
9459 }
9460 else if (rkind < kind2) {
9461 /* widen self and buf1 */
9462 rkind = kind2;
9463 if (release1) PyMem_Free(buf1);
9464 sbuf = _PyUnicode_AsKind(self, rkind);
9465 if (!sbuf) goto error;
9466 srelease = 1;
9467 buf1 = _PyUnicode_AsKind(str1, rkind);
9468 if (!buf1) goto error;
9469 release1 = 1;
9470 }
9471 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
9472 if (!res) {
9473 PyErr_NoMemory();
9474 goto error;
9475 }
9476 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009477 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009478 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9479 buf2,
9480 PyUnicode_KIND_SIZE(rkind, len2));
9481 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009482
9483 while ( --maxcount > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009484 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
9485 slen-i,
9486 buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009487 if (i == -1)
9488 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009489 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9490 buf2,
9491 PyUnicode_KIND_SIZE(rkind, len2));
9492 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009493 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009494
9495 u = PyUnicode_FromKindAndData(rkind, res, slen);
9496 PyMem_Free(res);
9497 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009498 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009499 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009500
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009501 Py_ssize_t n, i, j, ires;
9502 Py_ssize_t product, new_size;
9503 int rkind = skind;
9504 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009505
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009506 if (kind1 < rkind) {
9507 buf1 = _PyUnicode_AsKind(str1, rkind);
9508 if (!buf1) goto error;
9509 release1 = 1;
9510 }
9511 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009512 if (n == 0)
9513 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009514 if (kind2 < rkind) {
9515 buf2 = _PyUnicode_AsKind(str2, rkind);
9516 if (!buf2) goto error;
9517 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009518 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009519 else if (kind2 > rkind) {
9520 rkind = kind2;
9521 sbuf = _PyUnicode_AsKind(self, rkind);
9522 if (!sbuf) goto error;
9523 srelease = 1;
9524 if (release1) PyMem_Free(buf1);
9525 buf1 = _PyUnicode_AsKind(str1, rkind);
9526 if (!buf1) goto error;
9527 release1 = 1;
9528 }
9529 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9530 PyUnicode_GET_LENGTH(str1))); */
9531 product = n * (len2-len1);
9532 if ((product / (len2-len1)) != n) {
9533 PyErr_SetString(PyExc_OverflowError,
9534 "replace string is too long");
9535 goto error;
9536 }
9537 new_size = slen + product;
9538 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9539 PyErr_SetString(PyExc_OverflowError,
9540 "replace string is too long");
9541 goto error;
9542 }
9543 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
9544 if (!res)
9545 goto error;
9546 ires = i = 0;
9547 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009548 while (n-- > 0) {
9549 /* look for next match */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009550 j = anylib_find(rkind,
9551 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9552 slen-i, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009553 if (j == -1)
9554 break;
9555 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009556 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009557 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9558 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9559 PyUnicode_KIND_SIZE(rkind, j-i));
9560 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009561 }
9562 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009563 if (len2 > 0) {
9564 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9565 buf2,
9566 PyUnicode_KIND_SIZE(rkind, len2));
9567 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009568 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009569 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009570 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009571 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009572 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009573 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9574 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9575 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009576 } else {
9577 /* interleave */
9578 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009579 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9580 buf2,
9581 PyUnicode_KIND_SIZE(rkind, len2));
9582 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009583 if (--n <= 0)
9584 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009585 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9586 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9587 PyUnicode_KIND_SIZE(rkind, 1));
9588 ires++;
9589 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009590 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009591 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9592 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9593 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009594 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009595 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Martin v. Löwis0b1d3482011-10-01 16:35:40 +02009596 PyMem_Free(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009597 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009598 if (srelease)
9599 PyMem_FREE(sbuf);
9600 if (release1)
9601 PyMem_FREE(buf1);
9602 if (release2)
9603 PyMem_FREE(buf2);
9604 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009605
Benjamin Peterson29060642009-01-31 22:14:21 +00009606 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009607 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009608 if (srelease)
9609 PyMem_FREE(sbuf);
9610 if (release1)
9611 PyMem_FREE(buf1);
9612 if (release2)
9613 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009614 if (PyUnicode_CheckExact(self)) {
9615 Py_INCREF(self);
9616 return (PyObject *) self;
9617 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009618 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009619 error:
9620 if (srelease && sbuf)
9621 PyMem_FREE(sbuf);
9622 if (release1 && buf1)
9623 PyMem_FREE(buf1);
9624 if (release2 && buf2)
9625 PyMem_FREE(buf2);
9626 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009627}
9628
9629/* --- Unicode Object Methods --------------------------------------------- */
9630
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009631PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009632 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009633\n\
9634Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009635characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009636
9637static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009638unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009639{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009640 return fixup(self, fixtitle);
9641}
9642
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009643PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009644 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009645\n\
9646Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009647have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009648
9649static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009650unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009651{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009652 return fixup(self, fixcapitalize);
9653}
9654
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled code (compiled out by the surrounding #if 0).

   Splits self into a list of words, replaces every word by the result of
   fixup(word, fixcapitalize) and joins the list again.  Returns the
   joined string, or NULL if splitting, capitalizing or joining fails. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined = NULL;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list slot */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *old_word = PyList_GET_ITEM(words, pos);
        PyObject *new_word = fixup((PyUnicodeObject *)old_word,
                                   fixcapitalize);
        if (new_word == NULL)
            goto done;
        Py_DECREF(old_word);
        PyList_SET_ITEM(words, pos, new_word);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

  done:
    Py_DECREF(words);
    return joined;
}
#endif
9692
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009693/* Argument converter. Coerces to a single unicode character */
9694
9695static int
9696convert_uc(PyObject *obj, void *addr)
9697{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009698 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009699 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009700
Benjamin Peterson14339b62009-01-31 16:36:08 +00009701 uniobj = PyUnicode_FromObject(obj);
9702 if (uniobj == NULL) {
9703 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009704 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009705 return 0;
9706 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009707 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009708 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009709 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009710 Py_DECREF(uniobj);
9711 return 0;
9712 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009713 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009714 Py_DECREF(uniobj);
9715 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009716}
9717
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009718PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009719 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009720\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009721Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009722done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009723
9724static PyObject *
9725unicode_center(PyUnicodeObject *self, PyObject *args)
9726{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009727 Py_ssize_t marg, left;
9728 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009729 Py_UCS4 fillchar = ' ';
9730
Victor Stinnere9a29352011-10-01 02:14:59 +02009731 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009732 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009733
Victor Stinnere9a29352011-10-01 02:14:59 +02009734 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009735 return NULL;
9736
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009737 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009738 Py_INCREF(self);
9739 return (PyObject*) self;
9740 }
9741
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009742 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009743 left = marg / 2 + (marg & width & 1);
9744
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009745 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009746}
9747
Marc-André Lemburge5034372000-08-08 08:04:29 +00009748#if 0
9749
9750/* This code should go into some future Unicode collation support
9751 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009752 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009753
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009754/* speedy UTF-16 code point order comparison */
9755/* gleaned from: */
9756/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9757
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009758static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009759{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009760 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00009761 0, 0, 0, 0, 0, 0, 0, 0,
9762 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009763 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009764};
9765
Guido van Rossumd57fd912000-03-10 22:53:23 +00009766static int
9767unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9768{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009769 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009770
Guido van Rossumd57fd912000-03-10 22:53:23 +00009771 Py_UNICODE *s1 = str1->str;
9772 Py_UNICODE *s2 = str2->str;
9773
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009774 len1 = str1->_base._base.length;
9775 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009776
Guido van Rossumd57fd912000-03-10 22:53:23 +00009777 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009778 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009779
9780 c1 = *s1++;
9781 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009782
Benjamin Peterson29060642009-01-31 22:14:21 +00009783 if (c1 > (1<<11) * 26)
9784 c1 += utf16Fixup[c1>>11];
9785 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009786 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009787 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00009788
9789 if (c1 != c2)
9790 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00009791
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009792 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009793 }
9794
9795 return (len1 < len2) ? -1 : (len1 != len2);
9796}
9797
Marc-André Lemburge5034372000-08-08 08:04:29 +00009798#else
9799
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009800/* This function assumes that str1 and str2 are readied by the caller. */
9801
Marc-André Lemburge5034372000-08-08 08:04:29 +00009802static int
9803unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9804{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009805 int kind1, kind2;
9806 void *data1, *data2;
9807 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009808
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009809 kind1 = PyUnicode_KIND(str1);
9810 kind2 = PyUnicode_KIND(str2);
9811 data1 = PyUnicode_DATA(str1);
9812 data2 = PyUnicode_DATA(str2);
9813 len1 = PyUnicode_GET_LENGTH(str1);
9814 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009815
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009816 for (i = 0; i < len1 && i < len2; ++i) {
9817 Py_UCS4 c1, c2;
9818 c1 = PyUnicode_READ(kind1, data1, i);
9819 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009820
9821 if (c1 != c2)
9822 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009823 }
9824
9825 return (len1 < len2) ? -1 : (len1 != len2);
9826}
9827
9828#endif
9829
Alexander Belopolsky40018472011-02-26 01:02:56 +00009830int
9831PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009832{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009833 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9834 if (PyUnicode_READY(left) == -1 ||
9835 PyUnicode_READY(right) == -1)
9836 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009837 return unicode_compare((PyUnicodeObject *)left,
9838 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009839 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009840 PyErr_Format(PyExc_TypeError,
9841 "Can't compare %.100s and %.100s",
9842 left->ob_type->tp_name,
9843 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009844 return -1;
9845}
9846
Martin v. Löwis5b222132007-06-10 09:51:05 +00009847int
9848PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9849{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009850 Py_ssize_t i;
9851 int kind;
9852 void *data;
9853 Py_UCS4 chr;
9854
Victor Stinner910337b2011-10-03 03:20:16 +02009855 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009856 if (PyUnicode_READY(uni) == -1)
9857 return -1;
9858 kind = PyUnicode_KIND(uni);
9859 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009860 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009861 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9862 if (chr != str[i])
9863 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009864 /* This check keeps Python strings that end in '\0' from comparing equal
9865 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009866 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009867 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009868 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00009869 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009870 return 0;
9871}
9872
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009873
/* Select Py_True or Py_False from a C truth value.  The macro itself does
   not touch the reference count of the selected object. */
#define TEST_COND(cond) ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009876
Alexander Belopolsky40018472011-02-26 01:02:56 +00009877PyObject *
9878PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009879{
9880 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009881
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009882 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9883 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009884 if (PyUnicode_READY(left) == -1 ||
9885 PyUnicode_READY(right) == -1)
9886 return NULL;
9887 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9888 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009889 if (op == Py_EQ) {
9890 Py_INCREF(Py_False);
9891 return Py_False;
9892 }
9893 if (op == Py_NE) {
9894 Py_INCREF(Py_True);
9895 return Py_True;
9896 }
9897 }
9898 if (left == right)
9899 result = 0;
9900 else
9901 result = unicode_compare((PyUnicodeObject *)left,
9902 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009903
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009904 /* Convert the return value to a Boolean */
9905 switch (op) {
9906 case Py_EQ:
9907 v = TEST_COND(result == 0);
9908 break;
9909 case Py_NE:
9910 v = TEST_COND(result != 0);
9911 break;
9912 case Py_LE:
9913 v = TEST_COND(result <= 0);
9914 break;
9915 case Py_GE:
9916 v = TEST_COND(result >= 0);
9917 break;
9918 case Py_LT:
9919 v = TEST_COND(result == -1);
9920 break;
9921 case Py_GT:
9922 v = TEST_COND(result == 1);
9923 break;
9924 default:
9925 PyErr_BadArgument();
9926 return NULL;
9927 }
9928 Py_INCREF(v);
9929 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009930 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009931
Brian Curtindfc80e32011-08-10 20:28:54 -05009932 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009933}
9934
Alexander Belopolsky40018472011-02-26 01:02:56 +00009935int
9936PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00009937{
Thomas Wouters477c8d52006-05-27 19:21:47 +00009938 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009939 int kind1, kind2, kind;
9940 void *buf1, *buf2;
9941 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009942 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009943
9944 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00009945 sub = PyUnicode_FromObject(element);
9946 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009947 PyErr_Format(PyExc_TypeError,
9948 "'in <string>' requires string as left operand, not %s",
9949 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009950 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009951 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009952 if (PyUnicode_READY(sub) == -1)
9953 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009954
Thomas Wouters477c8d52006-05-27 19:21:47 +00009955 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +02009956 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009957 Py_DECREF(sub);
9958 return -1;
9959 }
9960
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009961 kind1 = PyUnicode_KIND(str);
9962 kind2 = PyUnicode_KIND(sub);
9963 kind = kind1 > kind2 ? kind1 : kind2;
9964 buf1 = PyUnicode_DATA(str);
9965 buf2 = PyUnicode_DATA(sub);
9966 if (kind1 != kind)
9967 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
9968 if (!buf1) {
9969 Py_DECREF(sub);
9970 return -1;
9971 }
9972 if (kind2 != kind)
9973 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
9974 if (!buf2) {
9975 Py_DECREF(sub);
9976 if (kind1 != kind) PyMem_Free(buf1);
9977 return -1;
9978 }
9979 len1 = PyUnicode_GET_LENGTH(str);
9980 len2 = PyUnicode_GET_LENGTH(sub);
9981
9982 switch(kind) {
9983 case PyUnicode_1BYTE_KIND:
9984 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
9985 break;
9986 case PyUnicode_2BYTE_KIND:
9987 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
9988 break;
9989 case PyUnicode_4BYTE_KIND:
9990 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
9991 break;
9992 default:
9993 result = -1;
9994 assert(0);
9995 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009996
9997 Py_DECREF(str);
9998 Py_DECREF(sub);
9999
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010000 if (kind1 != kind)
10001 PyMem_Free(buf1);
10002 if (kind2 != kind)
10003 PyMem_Free(buf2);
10004
Guido van Rossum403d68b2000-03-13 15:55:09 +000010005 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010006}
10007
Guido van Rossumd57fd912000-03-10 22:53:23 +000010008/* Concat to string or Unicode object giving a new Unicode object. */
10009
Alexander Belopolsky40018472011-02-26 01:02:56 +000010010PyObject *
10011PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010012{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010013 PyObject *u = NULL, *v = NULL, *w;
10014 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010015
10016 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010017 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010018 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010019 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010020 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010021 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010022 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010023
10024 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010025 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010026 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010027 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010028 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010029 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010030 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010031 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010032 }
10033
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010034 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +020010035 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010036
Guido van Rossumd57fd912000-03-10 22:53:23 +000010037 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010038 w = PyUnicode_New(
10039 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
10040 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010041 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010042 goto onError;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010043 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
10044 goto onError;
Victor Stinner157f83f2011-09-28 21:41:31 +020010045 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010046 v, 0,
10047 PyUnicode_GET_LENGTH(v)) < 0)
10048 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010049 Py_DECREF(u);
10050 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010051 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010052
Benjamin Peterson29060642009-01-31 22:14:21 +000010053 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010054 Py_XDECREF(u);
10055 Py_XDECREF(v);
10056 return NULL;
10057}
10058
Victor Stinnerb0923652011-10-04 01:17:31 +020010059static void
10060unicode_append_inplace(PyObject **p_left, PyObject *right)
10061{
10062 Py_ssize_t left_len, right_len, new_len;
10063#ifdef Py_DEBUG
10064 Py_ssize_t copied;
10065#endif
10066
10067 assert(PyUnicode_IS_READY(*p_left));
10068 assert(PyUnicode_IS_READY(right));
10069
10070 left_len = PyUnicode_GET_LENGTH(*p_left);
10071 right_len = PyUnicode_GET_LENGTH(right);
10072 if (left_len > PY_SSIZE_T_MAX - right_len) {
10073 PyErr_SetString(PyExc_OverflowError,
10074 "strings are too large to concat");
10075 goto error;
10076 }
10077 new_len = left_len + right_len;
10078
10079 /* Now we own the last reference to 'left', so we can resize it
10080 * in-place.
10081 */
10082 if (unicode_resize(p_left, new_len) != 0) {
10083 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10084 * deallocated so it cannot be put back into
10085 * 'variable'. The MemoryError is raised when there
10086 * is no value in 'variable', which might (very
10087 * remotely) be a cause of incompatibilities.
10088 */
10089 goto error;
10090 }
10091 /* copy 'right' into the newly allocated area of 'left' */
10092#ifdef Py_DEBUG
10093 copied = PyUnicode_CopyCharacters(*p_left, left_len,
10094 right, 0,
10095 right_len);
10096 assert(0 <= copied);
10097#else
10098 PyUnicode_CopyCharacters(*p_left, left_len, right, 0, right_len);
10099#endif
10100 return;
10101
10102error:
10103 Py_DECREF(*p_left);
10104 *p_left = NULL;
10105}
10106
Walter Dörwald1ab83302007-05-18 17:15:44 +000010107void
Victor Stinner23e56682011-10-03 03:54:37 +020010108PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010109{
Victor Stinner23e56682011-10-03 03:54:37 +020010110 PyObject *left, *res;
10111
10112 if (p_left == NULL) {
10113 if (!PyErr_Occurred())
10114 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010115 return;
10116 }
Victor Stinner23e56682011-10-03 03:54:37 +020010117 left = *p_left;
10118 if (right == NULL || !PyUnicode_Check(left)) {
10119 if (!PyErr_Occurred())
10120 PyErr_BadInternalCall();
10121 goto error;
10122 }
10123
Victor Stinnere1335c72011-10-04 20:53:03 +020010124 if (PyUnicode_READY(left))
10125 goto error;
10126 if (PyUnicode_READY(right))
10127 goto error;
10128
Victor Stinner23e56682011-10-03 03:54:37 +020010129 if (PyUnicode_CheckExact(left) && left != unicode_empty
10130 && PyUnicode_CheckExact(right) && right != unicode_empty
10131 && unicode_resizable(left)
10132 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10133 || _PyUnicode_WSTR(left) != NULL))
10134 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010135 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10136 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010137 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010138 not so different than duplicating the string. */
10139 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
Victor Stinner23e56682011-10-03 03:54:37 +020010140 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010141 unicode_append_inplace(p_left, right);
Victor Stinner23e56682011-10-03 03:54:37 +020010142 return;
10143 }
10144 }
10145
10146 res = PyUnicode_Concat(left, right);
10147 if (res == NULL)
10148 goto error;
10149 Py_DECREF(left);
10150 *p_left = res;
10151 return;
10152
10153error:
10154 Py_DECREF(*p_left);
10155 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010156}
10157
10158void
10159PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10160{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010161 PyUnicode_Append(pleft, right);
10162 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010163}
10164
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010165PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010166 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010167\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010168Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010169string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010170interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010171
10172static PyObject *
10173unicode_count(PyUnicodeObject *self, PyObject *args)
10174{
10175 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010176 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010177 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010178 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010179 int kind1, kind2, kind;
10180 void *buf1, *buf2;
10181 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010182
Jesus Ceaac451502011-04-20 17:09:23 +020010183 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10184 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010185 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010186
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010187 kind1 = PyUnicode_KIND(self);
10188 kind2 = PyUnicode_KIND(substring);
10189 kind = kind1 > kind2 ? kind1 : kind2;
10190 buf1 = PyUnicode_DATA(self);
10191 buf2 = PyUnicode_DATA(substring);
10192 if (kind1 != kind)
10193 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
10194 if (!buf1) {
10195 Py_DECREF(substring);
10196 return NULL;
10197 }
10198 if (kind2 != kind)
10199 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
10200 if (!buf2) {
10201 Py_DECREF(substring);
10202 if (kind1 != kind) PyMem_Free(buf1);
10203 return NULL;
10204 }
10205 len1 = PyUnicode_GET_LENGTH(self);
10206 len2 = PyUnicode_GET_LENGTH(substring);
10207
10208 ADJUST_INDICES(start, end, len1);
10209 switch(kind) {
10210 case PyUnicode_1BYTE_KIND:
10211 iresult = ucs1lib_count(
10212 ((Py_UCS1*)buf1) + start, end - start,
10213 buf2, len2, PY_SSIZE_T_MAX
10214 );
10215 break;
10216 case PyUnicode_2BYTE_KIND:
10217 iresult = ucs2lib_count(
10218 ((Py_UCS2*)buf1) + start, end - start,
10219 buf2, len2, PY_SSIZE_T_MAX
10220 );
10221 break;
10222 case PyUnicode_4BYTE_KIND:
10223 iresult = ucs4lib_count(
10224 ((Py_UCS4*)buf1) + start, end - start,
10225 buf2, len2, PY_SSIZE_T_MAX
10226 );
10227 break;
10228 default:
10229 assert(0); iresult = 0;
10230 }
10231
10232 result = PyLong_FromSsize_t(iresult);
10233
10234 if (kind1 != kind)
10235 PyMem_Free(buf1);
10236 if (kind2 != kind)
10237 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010238
10239 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010240
Guido van Rossumd57fd912000-03-10 22:53:23 +000010241 return result;
10242}
10243
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010244PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010245 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010246\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010247Encode S using the codec registered for encoding. Default encoding\n\
10248is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010249handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010250a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10251'xmlcharrefreplace' as well as any other name registered with\n\
10252codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010253
10254static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +000010255unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010256{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010257 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010258 char *encoding = NULL;
10259 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010260
Benjamin Peterson308d6372009-09-18 21:42:35 +000010261 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10262 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010263 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +000010264 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010265}
10266
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010267PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010268 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010269\n\
10270Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010271If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010272
10273static PyObject*
10274unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
10275{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010276 Py_ssize_t i, j, line_pos, src_len, incr;
10277 Py_UCS4 ch;
10278 PyObject *u;
10279 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010280 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010281 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010282 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010283
10284 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010285 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010286
Antoine Pitrou22425222011-10-04 19:10:51 +020010287 if (PyUnicode_READY(self) == -1)
10288 return NULL;
10289
Thomas Wouters7e474022000-07-16 12:04:32 +000010290 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010291 src_len = PyUnicode_GET_LENGTH(self);
10292 i = j = line_pos = 0;
10293 kind = PyUnicode_KIND(self);
10294 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010295 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010296 for (; i < src_len; i++) {
10297 ch = PyUnicode_READ(kind, src_data, i);
10298 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010299 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010300 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010301 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010302 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010303 goto overflow;
10304 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010305 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010306 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010307 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010308 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010309 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010310 goto overflow;
10311 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010312 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010313 if (ch == '\n' || ch == '\r')
10314 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010315 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010316 }
Antoine Pitroue19aa382011-10-04 16:04:01 +020010317 if (!found && PyUnicode_CheckExact(self)) {
10318 Py_INCREF((PyObject *) self);
10319 return (PyObject *) self;
10320 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010321
Guido van Rossumd57fd912000-03-10 22:53:23 +000010322 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010323 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010324 if (!u)
10325 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010326 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010327
Antoine Pitroue71d5742011-10-04 15:55:09 +020010328 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010329
Antoine Pitroue71d5742011-10-04 15:55:09 +020010330 for (; i < src_len; i++) {
10331 ch = PyUnicode_READ(kind, src_data, i);
10332 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010333 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010334 incr = tabsize - (line_pos % tabsize);
10335 line_pos += incr;
10336 while (incr--) {
10337 PyUnicode_WRITE(kind, dest_data, j, ' ');
10338 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010339 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010340 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010341 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010342 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010343 line_pos++;
10344 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010345 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010346 if (ch == '\n' || ch == '\r')
10347 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010348 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010349 }
10350 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinner17efeed2011-10-04 20:05:46 +020010351#ifndef DONT_MAKE_RESULT_READY
10352 if (_PyUnicode_READY_REPLACE(&u)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010353 Py_DECREF(u);
10354 return NULL;
10355 }
Victor Stinner17efeed2011-10-04 20:05:46 +020010356#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +000010357 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010358
Antoine Pitroue71d5742011-10-04 15:55:09 +020010359 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010360 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10361 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010362}
10363
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010364PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010365 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010366\n\
10367Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010368such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010369arguments start and end are interpreted as in slice notation.\n\
10370\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010371Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010372
10373static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010374unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010375{
Jesus Ceaac451502011-04-20 17:09:23 +020010376 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010377 Py_ssize_t start;
10378 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010379 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010380
Jesus Ceaac451502011-04-20 17:09:23 +020010381 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10382 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010383 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010384
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010385 if (PyUnicode_READY(self) == -1)
10386 return NULL;
10387 if (PyUnicode_READY(substring) == -1)
10388 return NULL;
10389
10390 result = any_find_slice(
10391 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10392 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010393 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010394
10395 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010396
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010397 if (result == -2)
10398 return NULL;
10399
Christian Heimes217cfd12007-12-02 14:31:20 +000010400 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010401}
10402
10403static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010404unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010405{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010406 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10407 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010408 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010409 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010410}
10411
Guido van Rossumc2504932007-09-18 19:42:40 +000010412/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010413 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010414static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +000010415unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010416{
Guido van Rossumc2504932007-09-18 19:42:40 +000010417 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010418 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010419
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010420 if (_PyUnicode_HASH(self) != -1)
10421 return _PyUnicode_HASH(self);
10422 if (PyUnicode_READY(self) == -1)
10423 return -1;
10424 len = PyUnicode_GET_LENGTH(self);
10425
10426 /* The hash function as a macro, gets expanded three times below. */
10427#define HASH(P) \
10428 x = (Py_uhash_t)*P << 7; \
10429 while (--len >= 0) \
10430 x = (1000003*x) ^ (Py_uhash_t)*P++;
10431
10432 switch (PyUnicode_KIND(self)) {
10433 case PyUnicode_1BYTE_KIND: {
10434 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10435 HASH(c);
10436 break;
10437 }
10438 case PyUnicode_2BYTE_KIND: {
10439 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10440 HASH(s);
10441 break;
10442 }
10443 default: {
10444 Py_UCS4 *l;
10445 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10446 "Impossible switch case in unicode_hash");
10447 l = PyUnicode_4BYTE_DATA(self);
10448 HASH(l);
10449 break;
10450 }
10451 }
10452 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
10453
Guido van Rossumc2504932007-09-18 19:42:40 +000010454 if (x == -1)
10455 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010456 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010457 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010458}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010459#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010460
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010461PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010462 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010463\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010464Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010465
10466static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010467unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010468{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010469 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +020010470 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010471 Py_ssize_t start;
10472 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010473
Jesus Ceaac451502011-04-20 17:09:23 +020010474 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10475 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010476 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010477
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010478 if (PyUnicode_READY(self) == -1)
10479 return NULL;
10480 if (PyUnicode_READY(substring) == -1)
10481 return NULL;
10482
10483 result = any_find_slice(
10484 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10485 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010486 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010487
10488 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010489
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010490 if (result == -2)
10491 return NULL;
10492
Guido van Rossumd57fd912000-03-10 22:53:23 +000010493 if (result < 0) {
10494 PyErr_SetString(PyExc_ValueError, "substring not found");
10495 return NULL;
10496 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010497
Christian Heimes217cfd12007-12-02 14:31:20 +000010498 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010499}
10500
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010501PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010502 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010503\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010504Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010505at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010506
10507static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010508unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010509{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010510 Py_ssize_t i, length;
10511 int kind;
10512 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010513 int cased;
10514
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010515 if (PyUnicode_READY(self) == -1)
10516 return NULL;
10517 length = PyUnicode_GET_LENGTH(self);
10518 kind = PyUnicode_KIND(self);
10519 data = PyUnicode_DATA(self);
10520
Guido van Rossumd57fd912000-03-10 22:53:23 +000010521 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010522 if (length == 1)
10523 return PyBool_FromLong(
10524 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010525
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010526 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010527 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010528 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010529
Guido van Rossumd57fd912000-03-10 22:53:23 +000010530 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010531 for (i = 0; i < length; i++) {
10532 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010533
Benjamin Peterson29060642009-01-31 22:14:21 +000010534 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
10535 return PyBool_FromLong(0);
10536 else if (!cased && Py_UNICODE_ISLOWER(ch))
10537 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010538 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010539 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010540}
10541
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010542PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010543 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010544\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010545Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010546at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010547
10548static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010549unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010550{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010551 Py_ssize_t i, length;
10552 int kind;
10553 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010554 int cased;
10555
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010556 if (PyUnicode_READY(self) == -1)
10557 return NULL;
10558 length = PyUnicode_GET_LENGTH(self);
10559 kind = PyUnicode_KIND(self);
10560 data = PyUnicode_DATA(self);
10561
Guido van Rossumd57fd912000-03-10 22:53:23 +000010562 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010563 if (length == 1)
10564 return PyBool_FromLong(
10565 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010566
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010567 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010568 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010569 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010570
Guido van Rossumd57fd912000-03-10 22:53:23 +000010571 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010572 for (i = 0; i < length; i++) {
10573 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010574
Benjamin Peterson29060642009-01-31 22:14:21 +000010575 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
10576 return PyBool_FromLong(0);
10577 else if (!cased && Py_UNICODE_ISUPPER(ch))
10578 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010579 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010580 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010581}
10582
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010583PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010584 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010585\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010586Return True if S is a titlecased string and there is at least one\n\
10587character in S, i.e. upper- and titlecase characters may only\n\
10588follow uncased characters and lowercase characters only cased ones.\n\
10589Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010590
10591static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010592unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010593{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010594 Py_ssize_t i, length;
10595 int kind;
10596 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010597 int cased, previous_is_cased;
10598
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010599 if (PyUnicode_READY(self) == -1)
10600 return NULL;
10601 length = PyUnicode_GET_LENGTH(self);
10602 kind = PyUnicode_KIND(self);
10603 data = PyUnicode_DATA(self);
10604
Guido van Rossumd57fd912000-03-10 22:53:23 +000010605 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010606 if (length == 1) {
10607 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10608 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10609 (Py_UNICODE_ISUPPER(ch) != 0));
10610 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010611
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010612 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010613 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010614 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010615
Guido van Rossumd57fd912000-03-10 22:53:23 +000010616 cased = 0;
10617 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010618 for (i = 0; i < length; i++) {
10619 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010620
Benjamin Peterson29060642009-01-31 22:14:21 +000010621 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10622 if (previous_is_cased)
10623 return PyBool_FromLong(0);
10624 previous_is_cased = 1;
10625 cased = 1;
10626 }
10627 else if (Py_UNICODE_ISLOWER(ch)) {
10628 if (!previous_is_cased)
10629 return PyBool_FromLong(0);
10630 previous_is_cased = 1;
10631 cased = 1;
10632 }
10633 else
10634 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010635 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010636 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010637}
10638
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010639PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010640 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010641\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010642Return True if all characters in S are whitespace\n\
10643and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010644
10645static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010646unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010647{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010648 Py_ssize_t i, length;
10649 int kind;
10650 void *data;
10651
10652 if (PyUnicode_READY(self) == -1)
10653 return NULL;
10654 length = PyUnicode_GET_LENGTH(self);
10655 kind = PyUnicode_KIND(self);
10656 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010657
Guido van Rossumd57fd912000-03-10 22:53:23 +000010658 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010659 if (length == 1)
10660 return PyBool_FromLong(
10661 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010662
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010663 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010664 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010665 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010666
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010667 for (i = 0; i < length; i++) {
10668 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010669 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010670 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010671 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010672 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010673}
10674
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010675PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010676 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010677\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010678Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010679and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010680
10681static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010682unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010683{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010684 Py_ssize_t i, length;
10685 int kind;
10686 void *data;
10687
10688 if (PyUnicode_READY(self) == -1)
10689 return NULL;
10690 length = PyUnicode_GET_LENGTH(self);
10691 kind = PyUnicode_KIND(self);
10692 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010693
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010694 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010695 if (length == 1)
10696 return PyBool_FromLong(
10697 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010698
10699 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010700 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010701 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010702
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010703 for (i = 0; i < length; i++) {
10704 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010705 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010706 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010707 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010708}
10709
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010710PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010711 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010712\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010713Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010714and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010715
10716static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010717unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010718{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010719 int kind;
10720 void *data;
10721 Py_ssize_t len, i;
10722
10723 if (PyUnicode_READY(self) == -1)
10724 return NULL;
10725
10726 kind = PyUnicode_KIND(self);
10727 data = PyUnicode_DATA(self);
10728 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010729
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010730 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010731 if (len == 1) {
10732 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10733 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10734 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010735
10736 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010737 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010738 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010739
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010740 for (i = 0; i < len; i++) {
10741 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010742 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010743 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010744 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010745 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010746}
10747
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010748PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010749 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010750\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010751Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010752False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010753
10754static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010755unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010756{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010757 Py_ssize_t i, length;
10758 int kind;
10759 void *data;
10760
10761 if (PyUnicode_READY(self) == -1)
10762 return NULL;
10763 length = PyUnicode_GET_LENGTH(self);
10764 kind = PyUnicode_KIND(self);
10765 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010766
Guido van Rossumd57fd912000-03-10 22:53:23 +000010767 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010768 if (length == 1)
10769 return PyBool_FromLong(
10770 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010771
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010772 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010773 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010774 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010775
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010776 for (i = 0; i < length; i++) {
10777 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010778 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010779 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010780 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010781}
10782
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010783PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010784 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010785\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010786Return True if all characters in S are digits\n\
10787and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010788
10789static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010790unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010791{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010792 Py_ssize_t i, length;
10793 int kind;
10794 void *data;
10795
10796 if (PyUnicode_READY(self) == -1)
10797 return NULL;
10798 length = PyUnicode_GET_LENGTH(self);
10799 kind = PyUnicode_KIND(self);
10800 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010801
Guido van Rossumd57fd912000-03-10 22:53:23 +000010802 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010803 if (length == 1) {
10804 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10805 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10806 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010807
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010808 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010809 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010810 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010811
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010812 for (i = 0; i < length; i++) {
10813 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010814 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010815 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010816 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010817}
10818
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010819PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010820 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010821\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010822Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010823False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010824
10825static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010826unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010827{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010828 Py_ssize_t i, length;
10829 int kind;
10830 void *data;
10831
10832 if (PyUnicode_READY(self) == -1)
10833 return NULL;
10834 length = PyUnicode_GET_LENGTH(self);
10835 kind = PyUnicode_KIND(self);
10836 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010837
Guido van Rossumd57fd912000-03-10 22:53:23 +000010838 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010839 if (length == 1)
10840 return PyBool_FromLong(
10841 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010842
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010843 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010844 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010845 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010846
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010847 for (i = 0; i < length; i++) {
10848 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010849 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010850 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010851 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010852}
10853
Martin v. Löwis47383402007-08-15 07:32:56 +000010854int
10855PyUnicode_IsIdentifier(PyObject *self)
10856{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010857 int kind;
10858 void *data;
10859 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010860 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010861
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010862 if (PyUnicode_READY(self) == -1) {
10863 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010864 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010865 }
10866
10867 /* Special case for empty strings */
10868 if (PyUnicode_GET_LENGTH(self) == 0)
10869 return 0;
10870 kind = PyUnicode_KIND(self);
10871 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010872
10873 /* PEP 3131 says that the first character must be in
10874 XID_Start and subsequent characters in XID_Continue,
10875 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000010876 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000010877 letters, digits, underscore). However, given the current
10878 definition of XID_Start and XID_Continue, it is sufficient
10879 to check just for these, except that _ must be allowed
10880 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010881 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050010882 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000010883 return 0;
10884
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040010885 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010886 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010887 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000010888 return 1;
10889}
10890
10891PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010892 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000010893\n\
10894Return True if S is a valid identifier according\n\
10895to the language definition.");
10896
10897static PyObject*
10898unicode_isidentifier(PyObject *self)
10899{
10900 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10901}
10902
Georg Brandl559e5d72008-06-11 18:37:52 +000010903PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010904 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000010905\n\
10906Return True if all characters in S are considered\n\
10907printable in repr() or S is empty, False otherwise.");
10908
10909static PyObject*
10910unicode_isprintable(PyObject *self)
10911{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010912 Py_ssize_t i, length;
10913 int kind;
10914 void *data;
10915
10916 if (PyUnicode_READY(self) == -1)
10917 return NULL;
10918 length = PyUnicode_GET_LENGTH(self);
10919 kind = PyUnicode_KIND(self);
10920 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000010921
10922 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010923 if (length == 1)
10924 return PyBool_FromLong(
10925 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000010926
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010927 for (i = 0; i < length; i++) {
10928 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010929 Py_RETURN_FALSE;
10930 }
10931 }
10932 Py_RETURN_TRUE;
10933}
10934
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010935PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000010936 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010937\n\
10938Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000010939iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010940
10941static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010942unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010943{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010944 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010945}
10946
Martin v. Löwis18e16552006-02-15 17:27:45 +000010947static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000010948unicode_length(PyUnicodeObject *self)
10949{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010950 if (PyUnicode_READY(self) == -1)
10951 return -1;
10952 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010953}
10954
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010955PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010956 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010957\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010958Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010959done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010960
10961static PyObject *
10962unicode_ljust(PyUnicodeObject *self, PyObject *args)
10963{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010964 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010965 Py_UCS4 fillchar = ' ';
10966
10967 if (PyUnicode_READY(self) == -1)
10968 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010969
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010970 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010971 return NULL;
10972
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010973 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010974 Py_INCREF(self);
10975 return (PyObject*) self;
10976 }
10977
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010978 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010979}
10980
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010981PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010982 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010983\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010984Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010985
10986static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010987unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010988{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010989 return fixup(self, fixlower);
10990}
10991
/* Strip direction selectors.  The values double as indexes into
   stripformat[] below, so the two must be kept in step. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() formats, indexed by the selectors above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for selector i: the format string minus its 3-character
   "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
11000
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011001/* externally visible for str.strip(unicode) */
11002PyObject *
11003_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
11004{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011005 void *data;
11006 int kind;
11007 Py_ssize_t i, j, len;
11008 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011009
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011010 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11011 return NULL;
11012
11013 kind = PyUnicode_KIND(self);
11014 data = PyUnicode_DATA(self);
11015 len = PyUnicode_GET_LENGTH(self);
11016 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11017 PyUnicode_DATA(sepobj),
11018 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011019
Benjamin Peterson14339b62009-01-31 16:36:08 +000011020 i = 0;
11021 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011022 while (i < len &&
11023 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011024 i++;
11025 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011026 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011027
Benjamin Peterson14339b62009-01-31 16:36:08 +000011028 j = len;
11029 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011030 do {
11031 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011032 } while (j >= i &&
11033 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011034 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011035 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011036
Victor Stinner12bab6d2011-10-01 01:53:49 +020011037 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011038}
11039
11040PyObject*
11041PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11042{
11043 unsigned char *data;
11044 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011045 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011046
Victor Stinnerde636f32011-10-01 03:55:54 +020011047 if (PyUnicode_READY(self) == -1)
11048 return NULL;
11049
11050 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11051
Victor Stinner12bab6d2011-10-01 01:53:49 +020011052 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011053 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011054 if (PyUnicode_CheckExact(self)) {
11055 Py_INCREF(self);
11056 return self;
11057 }
11058 else
11059 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011060 }
11061
Victor Stinner12bab6d2011-10-01 01:53:49 +020011062 length = end - start;
11063 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011064 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011065
Victor Stinnerde636f32011-10-01 03:55:54 +020011066 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011067 PyErr_SetString(PyExc_IndexError, "string index out of range");
11068 return NULL;
11069 }
11070
Victor Stinnerb9275c12011-10-05 14:01:42 +020011071 if (PyUnicode_IS_ASCII(self)) {
11072 kind = PyUnicode_KIND(self);
11073 data = PyUnicode_1BYTE_DATA(self);
11074 return unicode_fromascii(data + start, length);
11075 }
11076 else {
11077 kind = PyUnicode_KIND(self);
11078 data = PyUnicode_1BYTE_DATA(self);
11079 return PyUnicode_FromKindAndData(kind,
11080 data + PyUnicode_KIND_SIZE(kind, start),
11081 length);
11082 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011083}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011084
11085static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011086do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011087{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011088 int kind;
11089 void *data;
11090 Py_ssize_t len, i, j;
11091
11092 if (PyUnicode_READY(self) == -1)
11093 return NULL;
11094
11095 kind = PyUnicode_KIND(self);
11096 data = PyUnicode_DATA(self);
11097 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011098
Benjamin Peterson14339b62009-01-31 16:36:08 +000011099 i = 0;
11100 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011101 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011102 i++;
11103 }
11104 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011105
Benjamin Peterson14339b62009-01-31 16:36:08 +000011106 j = len;
11107 if (striptype != LEFTSTRIP) {
11108 do {
11109 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011110 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011111 j++;
11112 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011113
Victor Stinner12bab6d2011-10-01 01:53:49 +020011114 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011115}
11116
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011117
11118static PyObject *
11119do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
11120{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011121 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011122
Benjamin Peterson14339b62009-01-31 16:36:08 +000011123 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11124 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011125
Benjamin Peterson14339b62009-01-31 16:36:08 +000011126 if (sep != NULL && sep != Py_None) {
11127 if (PyUnicode_Check(sep))
11128 return _PyUnicode_XStrip(self, striptype, sep);
11129 else {
11130 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011131 "%s arg must be None or str",
11132 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011133 return NULL;
11134 }
11135 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011136
Benjamin Peterson14339b62009-01-31 16:36:08 +000011137 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011138}
11139
11140
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011141PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011142 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011143\n\
11144Return a copy of the string S with leading and trailing\n\
11145whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011146If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011147
11148static PyObject *
11149unicode_strip(PyUnicodeObject *self, PyObject *args)
11150{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011151 if (PyTuple_GET_SIZE(args) == 0)
11152 return do_strip(self, BOTHSTRIP); /* Common case */
11153 else
11154 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011155}
11156
11157
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011158PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011159 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011160\n\
11161Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011162If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011163
11164static PyObject *
11165unicode_lstrip(PyUnicodeObject *self, PyObject *args)
11166{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011167 if (PyTuple_GET_SIZE(args) == 0)
11168 return do_strip(self, LEFTSTRIP); /* Common case */
11169 else
11170 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011171}
11172
11173
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011174PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011175 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011176\n\
11177Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011178If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011179
11180static PyObject *
11181unicode_rstrip(PyUnicodeObject *self, PyObject *args)
11182{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011183 if (PyTuple_GET_SIZE(args) == 0)
11184 return do_strip(self, RIGHTSTRIP); /* Common case */
11185 else
11186 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011187}
11188
11189
Guido van Rossumd57fd912000-03-10 22:53:23 +000011190static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000011191unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011192{
11193 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011194 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011195
Georg Brandl222de0f2009-04-12 12:01:50 +000011196 if (len < 1) {
11197 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011198 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011199 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011200
Tim Peters7a29bd52001-09-12 03:03:31 +000011201 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011202 /* no repeat, return original string */
11203 Py_INCREF(str);
11204 return (PyObject*) str;
11205 }
Tim Peters8f422462000-09-09 06:13:41 +000011206
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011207 if (PyUnicode_READY(str) == -1)
11208 return NULL;
11209
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011210 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011211 PyErr_SetString(PyExc_OverflowError,
11212 "repeated string is too long");
11213 return NULL;
11214 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011215 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011216
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011217 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011218 if (!u)
11219 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011220 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011221
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011222 if (PyUnicode_GET_LENGTH(str) == 1) {
11223 const int kind = PyUnicode_KIND(str);
11224 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11225 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011226 if (kind == PyUnicode_1BYTE_KIND)
11227 memset(to, (unsigned char)fill_char, len);
11228 else {
11229 for (n = 0; n < len; ++n)
11230 PyUnicode_WRITE(kind, to, n, fill_char);
11231 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011232 }
11233 else {
11234 /* number of characters copied this far */
11235 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
11236 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
11237 char *to = (char *) PyUnicode_DATA(u);
11238 Py_MEMCPY(to, PyUnicode_DATA(str),
11239 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011240 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011241 n = (done <= nchars-done) ? done : nchars-done;
11242 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011243 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011244 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011245 }
11246
11247 return (PyObject*) u;
11248}
11249
Alexander Belopolsky40018472011-02-26 01:02:56 +000011250PyObject *
11251PyUnicode_Replace(PyObject *obj,
11252 PyObject *subobj,
11253 PyObject *replobj,
11254 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011255{
11256 PyObject *self;
11257 PyObject *str1;
11258 PyObject *str2;
11259 PyObject *result;
11260
11261 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011262 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011263 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011264 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011265 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011266 Py_DECREF(self);
11267 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011268 }
11269 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011270 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011271 Py_DECREF(self);
11272 Py_DECREF(str1);
11273 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011274 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011275 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011276 Py_DECREF(self);
11277 Py_DECREF(str1);
11278 Py_DECREF(str2);
11279 return result;
11280}
11281
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011282PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011283 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011284\n\
11285Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011286old replaced by new. If the optional argument count is\n\
11287given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011288
11289static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011290unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011291{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011292 PyObject *str1;
11293 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011294 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011295 PyObject *result;
11296
Martin v. Löwis18e16552006-02-15 17:27:45 +000011297 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011298 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011299 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011300 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011301 str1 = PyUnicode_FromObject(str1);
11302 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11303 return NULL;
11304 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011305 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011306 Py_DECREF(str1);
11307 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011308 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011309
11310 result = replace(self, str1, str2, maxcount);
11311
11312 Py_DECREF(str1);
11313 Py_DECREF(str2);
11314 return result;
11315}
11316
Alexander Belopolsky40018472011-02-26 01:02:56 +000011317static PyObject *
11318unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011319{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011320 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011321 Py_ssize_t isize;
11322 Py_ssize_t osize, squote, dquote, i, o;
11323 Py_UCS4 max, quote;
11324 int ikind, okind;
11325 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011326
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011327 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011328 return NULL;
11329
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011330 isize = PyUnicode_GET_LENGTH(unicode);
11331 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011332
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011333 /* Compute length of output, quote characters, and
11334 maximum character */
11335 osize = 2; /* quotes */
11336 max = 127;
11337 squote = dquote = 0;
11338 ikind = PyUnicode_KIND(unicode);
11339 for (i = 0; i < isize; i++) {
11340 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11341 switch (ch) {
11342 case '\'': squote++; osize++; break;
11343 case '"': dquote++; osize++; break;
11344 case '\\': case '\t': case '\r': case '\n':
11345 osize += 2; break;
11346 default:
11347 /* Fast-path ASCII */
11348 if (ch < ' ' || ch == 0x7f)
11349 osize += 4; /* \xHH */
11350 else if (ch < 0x7f)
11351 osize++;
11352 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11353 osize++;
11354 max = ch > max ? ch : max;
11355 }
11356 else if (ch < 0x100)
11357 osize += 4; /* \xHH */
11358 else if (ch < 0x10000)
11359 osize += 6; /* \uHHHH */
11360 else
11361 osize += 10; /* \uHHHHHHHH */
11362 }
11363 }
11364
11365 quote = '\'';
11366 if (squote) {
11367 if (dquote)
11368 /* Both squote and dquote present. Use squote,
11369 and escape them */
11370 osize += squote;
11371 else
11372 quote = '"';
11373 }
11374
11375 repr = PyUnicode_New(osize, max);
11376 if (repr == NULL)
11377 return NULL;
11378 okind = PyUnicode_KIND(repr);
11379 odata = PyUnicode_DATA(repr);
11380
11381 PyUnicode_WRITE(okind, odata, 0, quote);
11382 PyUnicode_WRITE(okind, odata, osize-1, quote);
11383
11384 for (i = 0, o = 1; i < isize; i++) {
11385 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011386
11387 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011388 if ((ch == quote) || (ch == '\\')) {
11389 PyUnicode_WRITE(okind, odata, o++, '\\');
11390 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011391 continue;
11392 }
11393
Benjamin Peterson29060642009-01-31 22:14:21 +000011394 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011395 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011396 PyUnicode_WRITE(okind, odata, o++, '\\');
11397 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011398 }
11399 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011400 PyUnicode_WRITE(okind, odata, o++, '\\');
11401 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011402 }
11403 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011404 PyUnicode_WRITE(okind, odata, o++, '\\');
11405 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011406 }
11407
11408 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011409 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011410 PyUnicode_WRITE(okind, odata, o++, '\\');
11411 PyUnicode_WRITE(okind, odata, o++, 'x');
11412 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11413 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011414 }
11415
Georg Brandl559e5d72008-06-11 18:37:52 +000011416 /* Copy ASCII characters as-is */
11417 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011418 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011419 }
11420
Benjamin Peterson29060642009-01-31 22:14:21 +000011421 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011422 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011423 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011424 (categories Z* and C* except ASCII space)
11425 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011426 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011427 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011428 if (ch <= 0xff) {
11429 PyUnicode_WRITE(okind, odata, o++, '\\');
11430 PyUnicode_WRITE(okind, odata, o++, 'x');
11431 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11432 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011433 }
11434 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011435 else if (ch >= 0x10000) {
11436 PyUnicode_WRITE(okind, odata, o++, '\\');
11437 PyUnicode_WRITE(okind, odata, o++, 'U');
11438 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
11439 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
11440 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
11441 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
11442 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11443 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11444 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11445 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011446 }
11447 /* Map 16-bit characters to '\uxxxx' */
11448 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011449 PyUnicode_WRITE(okind, odata, o++, '\\');
11450 PyUnicode_WRITE(okind, odata, o++, 'u');
11451 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11452 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11453 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11454 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011455 }
11456 }
11457 /* Copy characters as-is */
11458 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011459 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011460 }
11461 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011462 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011463 /* Closing quote already added at the beginning */
Walter Dörwald79e913e2007-05-12 11:08:06 +000011464 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011465}
11466
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011467PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011468 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011469\n\
11470Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011471such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011472arguments start and end are interpreted as in slice notation.\n\
11473\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011474Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011475
11476static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011477unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011478{
Jesus Ceaac451502011-04-20 17:09:23 +020011479 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011480 Py_ssize_t start;
11481 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011482 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011483
Jesus Ceaac451502011-04-20 17:09:23 +020011484 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11485 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011486 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011487
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011488 if (PyUnicode_READY(self) == -1)
11489 return NULL;
11490 if (PyUnicode_READY(substring) == -1)
11491 return NULL;
11492
11493 result = any_find_slice(
11494 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11495 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011496 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011497
11498 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011499
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011500 if (result == -2)
11501 return NULL;
11502
Christian Heimes217cfd12007-12-02 14:31:20 +000011503 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011504}
11505
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011506PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011507 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011508\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011509Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011510
11511static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011512unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011513{
Jesus Ceaac451502011-04-20 17:09:23 +020011514 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011515 Py_ssize_t start;
11516 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011517 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011518
Jesus Ceaac451502011-04-20 17:09:23 +020011519 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11520 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011521 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011522
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011523 if (PyUnicode_READY(self) == -1)
11524 return NULL;
11525 if (PyUnicode_READY(substring) == -1)
11526 return NULL;
11527
11528 result = any_find_slice(
11529 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11530 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011531 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011532
11533 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011534
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011535 if (result == -2)
11536 return NULL;
11537
Guido van Rossumd57fd912000-03-10 22:53:23 +000011538 if (result < 0) {
11539 PyErr_SetString(PyExc_ValueError, "substring not found");
11540 return NULL;
11541 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011542
Christian Heimes217cfd12007-12-02 14:31:20 +000011543 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011544}
11545
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011546PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011547 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011548\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011549Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011550done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011551
11552static PyObject *
11553unicode_rjust(PyUnicodeObject *self, PyObject *args)
11554{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011555 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011556 Py_UCS4 fillchar = ' ';
11557
Victor Stinnere9a29352011-10-01 02:14:59 +020011558 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011559 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011560
Victor Stinnere9a29352011-10-01 02:14:59 +020011561 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011562 return NULL;
11563
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011564 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011565 Py_INCREF(self);
11566 return (PyObject*) self;
11567 }
11568
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011569 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011570}
11571
Alexander Belopolsky40018472011-02-26 01:02:56 +000011572PyObject *
11573PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011574{
11575 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000011576
Guido van Rossumd57fd912000-03-10 22:53:23 +000011577 s = PyUnicode_FromObject(s);
11578 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011579 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011580 if (sep != NULL) {
11581 sep = PyUnicode_FromObject(sep);
11582 if (sep == NULL) {
11583 Py_DECREF(s);
11584 return NULL;
11585 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011586 }
11587
11588 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11589
11590 Py_DECREF(s);
11591 Py_XDECREF(sep);
11592 return result;
11593}
11594
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011595PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011596 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011597\n\
11598Return a list of the words in S, using sep as the\n\
11599delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011600splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011601whitespace string is a separator and empty strings are\n\
11602removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011603
11604static PyObject*
11605unicode_split(PyUnicodeObject *self, PyObject *args)
11606{
11607 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011608 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011609
Martin v. Löwis18e16552006-02-15 17:27:45 +000011610 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011611 return NULL;
11612
11613 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011614 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011615 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011616 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011617 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011618 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011619}
11620
Thomas Wouters477c8d52006-05-27 19:21:47 +000011621PyObject *
11622PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11623{
11624 PyObject* str_obj;
11625 PyObject* sep_obj;
11626 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011627 int kind1, kind2, kind;
11628 void *buf1 = NULL, *buf2 = NULL;
11629 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011630
11631 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020011632 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011633 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011634 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011635 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011636 Py_DECREF(str_obj);
11637 return NULL;
11638 }
11639
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011640 kind1 = PyUnicode_KIND(str_in);
11641 kind2 = PyUnicode_KIND(sep_obj);
11642 kind = kind1 > kind2 ? kind1 : kind2;
11643 buf1 = PyUnicode_DATA(str_in);
11644 if (kind1 != kind)
11645 buf1 = _PyUnicode_AsKind(str_in, kind);
11646 if (!buf1)
11647 goto onError;
11648 buf2 = PyUnicode_DATA(sep_obj);
11649 if (kind2 != kind)
11650 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11651 if (!buf2)
11652 goto onError;
11653 len1 = PyUnicode_GET_LENGTH(str_obj);
11654 len2 = PyUnicode_GET_LENGTH(sep_obj);
11655
11656 switch(PyUnicode_KIND(str_in)) {
11657 case PyUnicode_1BYTE_KIND:
11658 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11659 break;
11660 case PyUnicode_2BYTE_KIND:
11661 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11662 break;
11663 case PyUnicode_4BYTE_KIND:
11664 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11665 break;
11666 default:
11667 assert(0);
11668 out = 0;
11669 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011670
11671 Py_DECREF(sep_obj);
11672 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011673 if (kind1 != kind)
11674 PyMem_Free(buf1);
11675 if (kind2 != kind)
11676 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011677
11678 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011679 onError:
11680 Py_DECREF(sep_obj);
11681 Py_DECREF(str_obj);
11682 if (kind1 != kind && buf1)
11683 PyMem_Free(buf1);
11684 if (kind2 != kind && buf2)
11685 PyMem_Free(buf2);
11686 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011687}
11688
11689
11690PyObject *
11691PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11692{
11693 PyObject* str_obj;
11694 PyObject* sep_obj;
11695 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011696 int kind1, kind2, kind;
11697 void *buf1 = NULL, *buf2 = NULL;
11698 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011699
11700 str_obj = PyUnicode_FromObject(str_in);
11701 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011702 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011703 sep_obj = PyUnicode_FromObject(sep_in);
11704 if (!sep_obj) {
11705 Py_DECREF(str_obj);
11706 return NULL;
11707 }
11708
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011709 kind1 = PyUnicode_KIND(str_in);
11710 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011711 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011712 buf1 = PyUnicode_DATA(str_in);
11713 if (kind1 != kind)
11714 buf1 = _PyUnicode_AsKind(str_in, kind);
11715 if (!buf1)
11716 goto onError;
11717 buf2 = PyUnicode_DATA(sep_obj);
11718 if (kind2 != kind)
11719 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11720 if (!buf2)
11721 goto onError;
11722 len1 = PyUnicode_GET_LENGTH(str_obj);
11723 len2 = PyUnicode_GET_LENGTH(sep_obj);
11724
11725 switch(PyUnicode_KIND(str_in)) {
11726 case PyUnicode_1BYTE_KIND:
11727 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11728 break;
11729 case PyUnicode_2BYTE_KIND:
11730 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11731 break;
11732 case PyUnicode_4BYTE_KIND:
11733 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11734 break;
11735 default:
11736 assert(0);
11737 out = 0;
11738 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011739
11740 Py_DECREF(sep_obj);
11741 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011742 if (kind1 != kind)
11743 PyMem_Free(buf1);
11744 if (kind2 != kind)
11745 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011746
11747 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011748 onError:
11749 Py_DECREF(sep_obj);
11750 Py_DECREF(str_obj);
11751 if (kind1 != kind && buf1)
11752 PyMem_Free(buf1);
11753 if (kind2 != kind && buf2)
11754 PyMem_Free(buf2);
11755 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011756}
11757
11758PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011759 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011760\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011761Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011762the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011763found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011764
11765static PyObject*
11766unicode_partition(PyUnicodeObject *self, PyObject *separator)
11767{
11768 return PyUnicode_Partition((PyObject *)self, separator);
11769}
11770
11771PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011772 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011773\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011774Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011775the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011776separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011777
11778static PyObject*
11779unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
11780{
11781 return PyUnicode_RPartition((PyObject *)self, separator);
11782}
11783
Alexander Belopolsky40018472011-02-26 01:02:56 +000011784PyObject *
11785PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011786{
11787 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011788
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011789 s = PyUnicode_FromObject(s);
11790 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011791 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011792 if (sep != NULL) {
11793 sep = PyUnicode_FromObject(sep);
11794 if (sep == NULL) {
11795 Py_DECREF(s);
11796 return NULL;
11797 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011798 }
11799
11800 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11801
11802 Py_DECREF(s);
11803 Py_XDECREF(sep);
11804 return result;
11805}
11806
11807PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011808 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011809\n\
11810Return a list of the words in S, using sep as the\n\
11811delimiter string, starting at the end of the string and\n\
11812working to the front. If maxsplit is given, at most maxsplit\n\
11813splits are done. If sep is not specified, any whitespace string\n\
11814is a separator.");
11815
11816static PyObject*
11817unicode_rsplit(PyUnicodeObject *self, PyObject *args)
11818{
11819 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011820 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011821
Martin v. Löwis18e16552006-02-15 17:27:45 +000011822 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011823 return NULL;
11824
11825 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011826 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011827 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011828 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011829 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011830 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011831}
11832
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011833PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011834 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011835\n\
11836Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011837Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011838is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011839
11840static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011841unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011842{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011843 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011844 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011845
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011846 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11847 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011848 return NULL;
11849
Guido van Rossum86662912000-04-11 15:38:46 +000011850 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011851}
11852
11853static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011854PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011855{
Walter Dörwald346737f2007-05-31 10:44:43 +000011856 if (PyUnicode_CheckExact(self)) {
11857 Py_INCREF(self);
11858 return self;
11859 } else
11860 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020011861 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011862}
11863
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011864PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011865 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011866\n\
11867Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011868and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011869
11870static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011871unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011872{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011873 return fixup(self, fixswapcase);
11874}
11875
Georg Brandlceee0772007-11-27 23:48:05 +000011876PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011877 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011878\n\
11879Return a translation table usable for str.translate().\n\
11880If there is only one argument, it must be a dictionary mapping Unicode\n\
11881ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011882Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011883If there are two arguments, they must be strings of equal length, and\n\
11884in the resulting dictionary, each character in x will be mapped to the\n\
11885character at the same position in y. If there is a third argument, it\n\
11886must be a string, whose characters will be mapped to None in the result.");
11887
11888static PyObject*
11889unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11890{
11891 PyObject *x, *y = NULL, *z = NULL;
11892 PyObject *new = NULL, *key, *value;
11893 Py_ssize_t i = 0;
11894 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011895
Georg Brandlceee0772007-11-27 23:48:05 +000011896 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11897 return NULL;
11898 new = PyDict_New();
11899 if (!new)
11900 return NULL;
11901 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011902 int x_kind, y_kind, z_kind;
11903 void *x_data, *y_data, *z_data;
11904
Georg Brandlceee0772007-11-27 23:48:05 +000011905 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000011906 if (!PyUnicode_Check(x)) {
11907 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11908 "be a string if there is a second argument");
11909 goto err;
11910 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011911 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011912 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11913 "arguments must have equal length");
11914 goto err;
11915 }
11916 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011917 x_kind = PyUnicode_KIND(x);
11918 y_kind = PyUnicode_KIND(y);
11919 x_data = PyUnicode_DATA(x);
11920 y_data = PyUnicode_DATA(y);
11921 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11922 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11923 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011924 if (!key || !value)
11925 goto err;
11926 res = PyDict_SetItem(new, key, value);
11927 Py_DECREF(key);
11928 Py_DECREF(value);
11929 if (res < 0)
11930 goto err;
11931 }
11932 /* create entries for deleting chars in z */
11933 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011934 z_kind = PyUnicode_KIND(z);
11935 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000011936 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011937 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011938 if (!key)
11939 goto err;
11940 res = PyDict_SetItem(new, key, Py_None);
11941 Py_DECREF(key);
11942 if (res < 0)
11943 goto err;
11944 }
11945 }
11946 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011947 int kind;
11948 void *data;
11949
Georg Brandlceee0772007-11-27 23:48:05 +000011950 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000011951 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011952 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
11953 "to maketrans it must be a dict");
11954 goto err;
11955 }
11956 /* copy entries into the new dict, converting string keys to int keys */
11957 while (PyDict_Next(x, &i, &key, &value)) {
11958 if (PyUnicode_Check(key)) {
11959 /* convert string keys to integer keys */
11960 PyObject *newkey;
11961 if (PyUnicode_GET_SIZE(key) != 1) {
11962 PyErr_SetString(PyExc_ValueError, "string keys in translate "
11963 "table must be of length 1");
11964 goto err;
11965 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011966 kind = PyUnicode_KIND(key);
11967 data = PyUnicode_DATA(key);
11968 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000011969 if (!newkey)
11970 goto err;
11971 res = PyDict_SetItem(new, newkey, value);
11972 Py_DECREF(newkey);
11973 if (res < 0)
11974 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000011975 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011976 /* just keep integer keys */
11977 if (PyDict_SetItem(new, key, value) < 0)
11978 goto err;
11979 } else {
11980 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
11981 "be strings or integers");
11982 goto err;
11983 }
11984 }
11985 }
11986 return new;
11987 err:
11988 Py_DECREF(new);
11989 return NULL;
11990}
11991
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011992PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011993 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011994\n\
11995Return a copy of the string S, where all characters have been mapped\n\
11996through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011997Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000011998Unmapped characters are left untouched. Characters mapped to None\n\
11999are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012000
12001static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012002unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012003{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012004 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012005}
12006
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012007PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012008 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012009\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012010Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012011
12012static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012013unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012014{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012015 return fixup(self, fixupper);
12016}
12017
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012018PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012019 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012020\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012021Pad a numeric string S with zeros on the left, to fill a field\n\
12022of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012023
12024static PyObject *
12025unicode_zfill(PyUnicodeObject *self, PyObject *args)
12026{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012027 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012028 PyUnicodeObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012029 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012030 int kind;
12031 void *data;
12032 Py_UCS4 chr;
12033
12034 if (PyUnicode_READY(self) == -1)
12035 return NULL;
12036
Martin v. Löwis18e16552006-02-15 17:27:45 +000012037 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012038 return NULL;
12039
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012040 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012041 if (PyUnicode_CheckExact(self)) {
12042 Py_INCREF(self);
12043 return (PyObject*) self;
12044 }
12045 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020012046 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012047 }
12048
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012049 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012050
12051 u = pad(self, fill, 0, '0');
12052
Walter Dörwald068325e2002-04-15 13:36:47 +000012053 if (u == NULL)
12054 return NULL;
12055
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012056 kind = PyUnicode_KIND(u);
12057 data = PyUnicode_DATA(u);
12058 chr = PyUnicode_READ(kind, data, fill);
12059
12060 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012061 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012062 PyUnicode_WRITE(kind, data, 0, chr);
12063 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012064 }
12065
12066 return (PyObject*) u;
12067}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012068
#if 0
/* Compiled out.  Thin wrapper that forwards to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *transformed = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return transformed;
}
#endif
12076
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012077PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012078 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012079\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012080Return True if S starts with the specified prefix, False otherwise.\n\
12081With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012082With optional end, stop comparing S at that position.\n\
12083prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012084
12085static PyObject *
12086unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012087 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012088{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012089 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012090 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012091 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012092 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012093 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012094
Jesus Ceaac451502011-04-20 17:09:23 +020012095 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012096 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012097 if (PyTuple_Check(subobj)) {
12098 Py_ssize_t i;
12099 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12100 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012101 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012102 if (substring == NULL)
12103 return NULL;
12104 result = tailmatch(self, substring, start, end, -1);
12105 Py_DECREF(substring);
12106 if (result) {
12107 Py_RETURN_TRUE;
12108 }
12109 }
12110 /* nothing matched */
12111 Py_RETURN_FALSE;
12112 }
12113 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012114 if (substring == NULL) {
12115 if (PyErr_ExceptionMatches(PyExc_TypeError))
12116 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12117 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012118 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012119 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012120 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012121 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012122 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012123}
12124
12125
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012126PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012127 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012128\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012129Return True if S ends with the specified suffix, False otherwise.\n\
12130With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012131With optional end, stop comparing S at that position.\n\
12132suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012133
12134static PyObject *
12135unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012136 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012137{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012138 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012139 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012140 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012141 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012142 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012143
Jesus Ceaac451502011-04-20 17:09:23 +020012144 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012145 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012146 if (PyTuple_Check(subobj)) {
12147 Py_ssize_t i;
12148 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12149 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012150 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012151 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012152 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012153 result = tailmatch(self, substring, start, end, +1);
12154 Py_DECREF(substring);
12155 if (result) {
12156 Py_RETURN_TRUE;
12157 }
12158 }
12159 Py_RETURN_FALSE;
12160 }
12161 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012162 if (substring == NULL) {
12163 if (PyErr_ExceptionMatches(PyExc_TypeError))
12164 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12165 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012166 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012167 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012168 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012169 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012170 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012171}
12172
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012173#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012174
12175PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012176 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012177\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012178Return a formatted version of S, using substitutions from args and kwargs.\n\
12179The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012180
Eric Smith27bbca62010-11-04 17:06:58 +000012181PyDoc_STRVAR(format_map__doc__,
12182 "S.format_map(mapping) -> str\n\
12183\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012184Return a formatted version of S, using substitutions from mapping.\n\
12185The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012186
Eric Smith4a7d76d2008-05-30 18:10:19 +000012187static PyObject *
12188unicode__format__(PyObject* self, PyObject* args)
12189{
12190 PyObject *format_spec;
12191
12192 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12193 return NULL;
12194
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012195 return _PyUnicode_FormatAdvanced(self, format_spec, 0,
12196 PyUnicode_GET_LENGTH(format_spec));
Eric Smith4a7d76d2008-05-30 18:10:19 +000012197}
12198
Eric Smith8c663262007-08-25 02:26:07 +000012199PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012200 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012201\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012202Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012203
12204static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012205unicode__sizeof__(PyUnicodeObject *v)
12206{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012207 Py_ssize_t size;
12208
12209 /* If it's a compact object, account for base structure +
12210 character data. */
12211 if (PyUnicode_IS_COMPACT_ASCII(v))
12212 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12213 else if (PyUnicode_IS_COMPACT(v))
12214 size = sizeof(PyCompactUnicodeObject) +
12215 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
12216 else {
12217 /* If it is a two-block object, account for base object, and
12218 for character block if present. */
12219 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012220 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012221 size += (PyUnicode_GET_LENGTH(v) + 1) *
12222 PyUnicode_CHARACTER_SIZE(v);
12223 }
12224 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012225 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012226 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012227 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012228 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012229 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012230
12231 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012232}
12233
12234PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012235 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012236
12237static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012238unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012239{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012240 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012241 if (!copy)
12242 return NULL;
12243 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012244}
12245
Guido van Rossumd57fd912000-03-10 22:53:23 +000012246static PyMethodDef unicode_methods[] = {
12247
12248 /* Order is according to common usage: often used methods should
12249 appear first, since lookup is done sequentially. */
12250
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012251 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012252 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12253 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012254 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012255 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12256 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12257 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12258 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12259 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12260 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12261 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012262 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012263 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12264 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12265 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012266 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012267 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12268 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12269 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012270 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012271 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012272 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012273 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012274 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12275 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12276 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12277 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12278 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12279 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12280 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12281 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12282 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12283 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12284 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12285 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12286 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12287 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012288 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012289 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012290 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012291 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012292 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012293 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012294 {"maketrans", (PyCFunction) unicode_maketrans,
12295 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012296 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012297#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012298 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012299#endif
12300
12301#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012302 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012303 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012304#endif
12305
Benjamin Peterson14339b62009-01-31 16:36:08 +000012306 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012307 {NULL, NULL}
12308};
12309
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012310static PyObject *
12311unicode_mod(PyObject *v, PyObject *w)
12312{
Brian Curtindfc80e32011-08-10 20:28:54 -050012313 if (!PyUnicode_Check(v))
12314 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012315 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012316}
12317
12318static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012319 0, /*nb_add*/
12320 0, /*nb_subtract*/
12321 0, /*nb_multiply*/
12322 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012323};
12324
Guido van Rossumd57fd912000-03-10 22:53:23 +000012325static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012326 (lenfunc) unicode_length, /* sq_length */
12327 PyUnicode_Concat, /* sq_concat */
12328 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12329 (ssizeargfunc) unicode_getitem, /* sq_item */
12330 0, /* sq_slice */
12331 0, /* sq_ass_item */
12332 0, /* sq_ass_slice */
12333 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012334};
12335
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012336static PyObject*
12337unicode_subscript(PyUnicodeObject* self, PyObject* item)
12338{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012339 if (PyUnicode_READY(self) == -1)
12340 return NULL;
12341
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012342 if (PyIndex_Check(item)) {
12343 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012344 if (i == -1 && PyErr_Occurred())
12345 return NULL;
12346 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012347 i += PyUnicode_GET_LENGTH(self);
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012348 return unicode_getitem((PyObject*)self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012349 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012350 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012351 PyObject *result;
12352 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012353 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012354 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012355
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012356 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000012357 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012358 return NULL;
12359 }
12360
12361 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012362 return PyUnicode_New(0, 0);
12363 } else if (start == 0 && step == 1 &&
12364 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000012365 PyUnicode_CheckExact(self)) {
12366 Py_INCREF(self);
12367 return (PyObject *)self;
12368 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012369 return PyUnicode_Substring((PyObject*)self,
12370 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012371 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012372 /* General case */
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012373 max_char = 0;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012374 src_kind = PyUnicode_KIND(self);
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012375 kind_limit = kind_maxchar_limit(src_kind);
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012376 src_data = PyUnicode_DATA(self);
12377 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12378 ch = PyUnicode_READ(src_kind, src_data, cur);
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012379 if (ch > max_char) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012380 max_char = ch;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012381 if (max_char >= kind_limit)
12382 break;
12383 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012384 }
12385 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012386 if (result == NULL)
12387 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012388 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012389 dest_data = PyUnicode_DATA(result);
12390
12391 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012392 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
12393 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012394 }
12395 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012396 } else {
12397 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12398 return NULL;
12399 }
12400}
12401
12402static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012403 (lenfunc)unicode_length, /* mp_length */
12404 (binaryfunc)unicode_subscript, /* mp_subscript */
12405 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012406};
12407
Guido van Rossumd57fd912000-03-10 22:53:23 +000012408
Guido van Rossumd57fd912000-03-10 22:53:23 +000012409/* Helpers for PyUnicode_Format() */
12410
12411static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000012412getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012413{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012414 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012415 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012416 (*p_argidx)++;
12417 if (arglen < 0)
12418 return args;
12419 else
12420 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012421 }
12422 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012423 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012424 return NULL;
12425}
12426
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012427/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012428
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012429static PyObject *
12430formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012431{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012432 char *p;
12433 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012434 double x;
Tim Petersced69f82003-09-16 20:30:58 +000012435
Guido van Rossumd57fd912000-03-10 22:53:23 +000012436 x = PyFloat_AsDouble(v);
12437 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012438 return NULL;
12439
Guido van Rossumd57fd912000-03-10 22:53:23 +000012440 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012441 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000012442
Eric Smith0923d1d2009-04-16 20:16:10 +000012443 p = PyOS_double_to_string(x, type, prec,
12444 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012445 if (p == NULL)
12446 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012447 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000012448 PyMem_Free(p);
12449 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012450}
12451
Tim Peters38fd5b62000-09-21 05:43:11 +000012452static PyObject*
12453formatlong(PyObject *val, int flags, int prec, int type)
12454{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012455 char *buf;
12456 int len;
12457 PyObject *str; /* temporary string object. */
12458 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012459
Benjamin Peterson14339b62009-01-31 16:36:08 +000012460 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12461 if (!str)
12462 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012463 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012464 Py_DECREF(str);
12465 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012466}
12467
Guido van Rossumd57fd912000-03-10 22:53:23 +000012468static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012469formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012470 size_t buflen,
12471 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012472{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012473 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012474 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012475 if (PyUnicode_GET_LENGTH(v) == 1) {
12476 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000012477 buf[1] = '\0';
12478 return 1;
12479 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012480 goto onError;
12481 }
12482 else {
12483 /* Integer input truncated to a character */
12484 long x;
12485 x = PyLong_AsLong(v);
12486 if (x == -1 && PyErr_Occurred())
12487 goto onError;
12488
12489 if (x < 0 || x > 0x10ffff) {
12490 PyErr_SetString(PyExc_OverflowError,
12491 "%c arg not in range(0x110000)");
12492 return -1;
12493 }
12494
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012495 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012496 buf[1] = '\0';
12497 return 1;
12498 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012499
Benjamin Peterson29060642009-01-31 22:14:21 +000012500 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012501 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012502 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012503 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012504}
12505
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
   FORMATBUFLEN is the length of the buffer in which chars are formatted.
   The expansion is fully parenthesized so that it behaves as a single
   primary expression wherever the macro is used (e.g. after sizeof).
*/
#define FORMATBUFLEN ((size_t)10)
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012510
Alexander Belopolsky40018472011-02-26 01:02:56 +000012511PyObject *
12512PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012513{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012514 void *fmt;
12515 int fmtkind;
12516 PyObject *result;
12517 Py_UCS4 *res, *res0;
12518 Py_UCS4 max;
12519 int kind;
12520 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012521 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012522 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012523 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000012524
Guido van Rossumd57fd912000-03-10 22:53:23 +000012525 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012526 PyErr_BadInternalCall();
12527 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012528 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012529 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
12530 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012531 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012532 fmt = PyUnicode_DATA(uformat);
12533 fmtkind = PyUnicode_KIND(uformat);
12534 fmtcnt = PyUnicode_GET_LENGTH(uformat);
12535 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012536
12537 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012538 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
12539 if (res0 == NULL) {
12540 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012541 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012542 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012543
12544 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012545 arglen = PyTuple_Size(args);
12546 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012547 }
12548 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012549 arglen = -1;
12550 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012551 }
Christian Heimes90aa7642007-12-19 02:45:37 +000012552 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000012553 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000012554 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012555
12556 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012557 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012558 if (--rescnt < 0) {
12559 rescnt = fmtcnt + 100;
12560 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012561 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12562 if (res0 == NULL){
12563 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012564 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012565 }
12566 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012567 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012568 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012569 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012570 }
12571 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012572 /* Got a format specifier */
12573 int flags = 0;
12574 Py_ssize_t width = -1;
12575 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012576 Py_UCS4 c = '\0';
12577 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000012578 int isnumok;
12579 PyObject *v = NULL;
12580 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012581 void *pbuf;
12582 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000012583 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012584 Py_ssize_t len, len1;
12585 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012586
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012587 fmtpos++;
12588 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12589 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000012590 Py_ssize_t keylen;
12591 PyObject *key;
12592 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000012593
Benjamin Peterson29060642009-01-31 22:14:21 +000012594 if (dict == NULL) {
12595 PyErr_SetString(PyExc_TypeError,
12596 "format requires a mapping");
12597 goto onError;
12598 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012599 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012600 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012601 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012602 /* Skip over balanced parentheses */
12603 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012604 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012605 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012606 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012607 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012608 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012609 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012610 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012611 if (fmtcnt < 0 || pcount > 0) {
12612 PyErr_SetString(PyExc_ValueError,
12613 "incomplete format key");
12614 goto onError;
12615 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012616 key = PyUnicode_Substring((PyObject*)uformat,
12617 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012618 if (key == NULL)
12619 goto onError;
12620 if (args_owned) {
12621 Py_DECREF(args);
12622 args_owned = 0;
12623 }
12624 args = PyObject_GetItem(dict, key);
12625 Py_DECREF(key);
12626 if (args == NULL) {
12627 goto onError;
12628 }
12629 args_owned = 1;
12630 arglen = -1;
12631 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012632 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012633 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012634 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012635 case '-': flags |= F_LJUST; continue;
12636 case '+': flags |= F_SIGN; continue;
12637 case ' ': flags |= F_BLANK; continue;
12638 case '#': flags |= F_ALT; continue;
12639 case '0': flags |= F_ZERO; continue;
12640 }
12641 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012642 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012643 if (c == '*') {
12644 v = getnextarg(args, arglen, &argidx);
12645 if (v == NULL)
12646 goto onError;
12647 if (!PyLong_Check(v)) {
12648 PyErr_SetString(PyExc_TypeError,
12649 "* wants int");
12650 goto onError;
12651 }
12652 width = PyLong_AsLong(v);
12653 if (width == -1 && PyErr_Occurred())
12654 goto onError;
12655 if (width < 0) {
12656 flags |= F_LJUST;
12657 width = -width;
12658 }
12659 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012660 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012661 }
12662 else if (c >= '0' && c <= '9') {
12663 width = c - '0';
12664 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012665 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012666 if (c < '0' || c > '9')
12667 break;
12668 if ((width*10) / 10 != width) {
12669 PyErr_SetString(PyExc_ValueError,
12670 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012671 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012672 }
12673 width = width*10 + (c - '0');
12674 }
12675 }
12676 if (c == '.') {
12677 prec = 0;
12678 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012679 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012680 if (c == '*') {
12681 v = getnextarg(args, arglen, &argidx);
12682 if (v == NULL)
12683 goto onError;
12684 if (!PyLong_Check(v)) {
12685 PyErr_SetString(PyExc_TypeError,
12686 "* wants int");
12687 goto onError;
12688 }
12689 prec = PyLong_AsLong(v);
12690 if (prec == -1 && PyErr_Occurred())
12691 goto onError;
12692 if (prec < 0)
12693 prec = 0;
12694 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012695 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012696 }
12697 else if (c >= '0' && c <= '9') {
12698 prec = c - '0';
12699 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012700 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012701 if (c < '0' || c > '9')
12702 break;
12703 if ((prec*10) / 10 != prec) {
12704 PyErr_SetString(PyExc_ValueError,
12705 "prec too big");
12706 goto onError;
12707 }
12708 prec = prec*10 + (c - '0');
12709 }
12710 }
12711 } /* prec */
12712 if (fmtcnt >= 0) {
12713 if (c == 'h' || c == 'l' || c == 'L') {
12714 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012715 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012716 }
12717 }
12718 if (fmtcnt < 0) {
12719 PyErr_SetString(PyExc_ValueError,
12720 "incomplete format");
12721 goto onError;
12722 }
12723 if (c != '%') {
12724 v = getnextarg(args, arglen, &argidx);
12725 if (v == NULL)
12726 goto onError;
12727 }
12728 sign = 0;
12729 fill = ' ';
12730 switch (c) {
12731
12732 case '%':
12733 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012734 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012735 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012736 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012737 len = 1;
12738 break;
12739
12740 case 's':
12741 case 'r':
12742 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012743 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012744 temp = v;
12745 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012746 }
12747 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012748 if (c == 's')
12749 temp = PyObject_Str(v);
12750 else if (c == 'r')
12751 temp = PyObject_Repr(v);
12752 else
12753 temp = PyObject_ASCII(v);
12754 if (temp == NULL)
12755 goto onError;
12756 if (PyUnicode_Check(temp))
12757 /* nothing to do */;
12758 else {
12759 Py_DECREF(temp);
12760 PyErr_SetString(PyExc_TypeError,
12761 "%s argument has non-string str()");
12762 goto onError;
12763 }
12764 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012765 if (PyUnicode_READY(temp) == -1) {
12766 Py_CLEAR(temp);
12767 goto onError;
12768 }
12769 pbuf = PyUnicode_DATA(temp);
12770 kind = PyUnicode_KIND(temp);
12771 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012772 if (prec >= 0 && len > prec)
12773 len = prec;
12774 break;
12775
12776 case 'i':
12777 case 'd':
12778 case 'u':
12779 case 'o':
12780 case 'x':
12781 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012782 isnumok = 0;
12783 if (PyNumber_Check(v)) {
12784 PyObject *iobj=NULL;
12785
12786 if (PyLong_Check(v)) {
12787 iobj = v;
12788 Py_INCREF(iobj);
12789 }
12790 else {
12791 iobj = PyNumber_Long(v);
12792 }
12793 if (iobj!=NULL) {
12794 if (PyLong_Check(iobj)) {
12795 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012796 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012797 Py_DECREF(iobj);
12798 if (!temp)
12799 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012800 if (PyUnicode_READY(temp) == -1) {
12801 Py_CLEAR(temp);
12802 goto onError;
12803 }
12804 pbuf = PyUnicode_DATA(temp);
12805 kind = PyUnicode_KIND(temp);
12806 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012807 sign = 1;
12808 }
12809 else {
12810 Py_DECREF(iobj);
12811 }
12812 }
12813 }
12814 if (!isnumok) {
12815 PyErr_Format(PyExc_TypeError,
12816 "%%%c format: a number is required, "
12817 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12818 goto onError;
12819 }
12820 if (flags & F_ZERO)
12821 fill = '0';
12822 break;
12823
12824 case 'e':
12825 case 'E':
12826 case 'f':
12827 case 'F':
12828 case 'g':
12829 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012830 temp = formatfloat(v, flags, prec, c);
12831 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012832 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012833 if (PyUnicode_READY(temp) == -1) {
12834 Py_CLEAR(temp);
12835 goto onError;
12836 }
12837 pbuf = PyUnicode_DATA(temp);
12838 kind = PyUnicode_KIND(temp);
12839 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012840 sign = 1;
12841 if (flags & F_ZERO)
12842 fill = '0';
12843 break;
12844
12845 case 'c':
12846 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012847 kind = PyUnicode_4BYTE_KIND;
Victor Stinnerb9dcffb2011-09-29 00:39:24 +020012848 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
Benjamin Peterson29060642009-01-31 22:14:21 +000012849 if (len < 0)
12850 goto onError;
12851 break;
12852
12853 default:
12854 PyErr_Format(PyExc_ValueError,
12855 "unsupported format character '%c' (0x%x) "
12856 "at index %zd",
12857 (31<=c && c<=126) ? (char)c : '?',
12858 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012859 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000012860 goto onError;
12861 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012862 /* pbuf is initialized here. */
12863 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000012864 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012865 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12866 PyUnicode_READ(kind, pbuf, pindex) == '+') {
12867 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012868 len--;
12869 }
12870 else if (flags & F_SIGN)
12871 sign = '+';
12872 else if (flags & F_BLANK)
12873 sign = ' ';
12874 else
12875 sign = 0;
12876 }
12877 if (width < len)
12878 width = len;
12879 if (rescnt - (sign != 0) < width) {
12880 reslen -= rescnt;
12881 rescnt = width + fmtcnt + 100;
12882 reslen += rescnt;
12883 if (reslen < 0) {
12884 Py_XDECREF(temp);
12885 PyErr_NoMemory();
12886 goto onError;
12887 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012888 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12889 if (res0 == 0) {
12890 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012891 Py_XDECREF(temp);
12892 goto onError;
12893 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012894 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012895 }
12896 if (sign) {
12897 if (fill != ' ')
12898 *res++ = sign;
12899 rescnt--;
12900 if (width > len)
12901 width--;
12902 }
12903 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012904 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12905 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000012906 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012907 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12908 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012909 }
12910 rescnt -= 2;
12911 width -= 2;
12912 if (width < 0)
12913 width = 0;
12914 len -= 2;
12915 }
12916 if (width > len && !(flags & F_LJUST)) {
12917 do {
12918 --rescnt;
12919 *res++ = fill;
12920 } while (--width > len);
12921 }
12922 if (fill == ' ') {
12923 if (sign)
12924 *res++ = sign;
12925 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012926 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12927 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12928 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12929 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012930 }
12931 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012932 /* Copy all characters, preserving len */
12933 len1 = len;
12934 while (len1--) {
12935 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12936 rescnt--;
12937 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012938 while (--width >= len) {
12939 --rescnt;
12940 *res++ = ' ';
12941 }
12942 if (dict && (argidx < arglen) && c != '%') {
12943 PyErr_SetString(PyExc_TypeError,
12944 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000012945 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012946 goto onError;
12947 }
12948 Py_XDECREF(temp);
12949 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012950 } /* until end */
12951 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012952 PyErr_SetString(PyExc_TypeError,
12953 "not all arguments converted during string formatting");
12954 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012955 }
12956
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012957
12958 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
12959 if (*res > max)
12960 max = *res;
12961 result = PyUnicode_New(reslen - rescnt, max);
12962 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000012963 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012964 kind = PyUnicode_KIND(result);
12965 for (res = res0; res < res0+reslen-rescnt; res++)
12966 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
12967 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012968 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012969 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012970 }
12971 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012972 return (PyObject *)result;
12973
Benjamin Peterson29060642009-01-31 22:14:21 +000012974 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012975 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012976 Py_DECREF(uformat);
12977 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012978 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012979 }
12980 return NULL;
12981}
12982
Jeremy Hylton938ace62002-07-17 16:30:39 +000012983static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000012984unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
12985
Tim Peters6d6c1a32001-08-02 04:15:00 +000012986static PyObject *
12987unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12988{
Benjamin Peterson29060642009-01-31 22:14:21 +000012989 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012990 static char *kwlist[] = {"object", "encoding", "errors", 0};
12991 char *encoding = NULL;
12992 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000012993
Benjamin Peterson14339b62009-01-31 16:36:08 +000012994 if (type != &PyUnicode_Type)
12995 return unicode_subtype_new(type, args, kwds);
12996 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000012997 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012998 return NULL;
12999 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013000 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013001 if (encoding == NULL && errors == NULL)
13002 return PyObject_Str(x);
13003 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013004 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013005}
13006
Guido van Rossume023fe02001-08-30 03:12:59 +000013007static PyObject *
13008unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13009{
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013010 PyUnicodeObject *unicode, *self;
13011 Py_ssize_t length, char_size;
13012 int share_wstr, share_utf8;
13013 unsigned int kind;
13014 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013015
Benjamin Peterson14339b62009-01-31 16:36:08 +000013016 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013017
13018 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
13019 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013020 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013021 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020013022 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013023 return NULL;
13024
13025 self = (PyUnicodeObject *) type->tp_alloc(type, 0);
13026 if (self == NULL) {
13027 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013028 return NULL;
13029 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013030 kind = PyUnicode_KIND(unicode);
13031 length = PyUnicode_GET_LENGTH(unicode);
13032
13033 _PyUnicode_LENGTH(self) = length;
13034 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13035 _PyUnicode_STATE(self).interned = 0;
13036 _PyUnicode_STATE(self).kind = kind;
13037 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013038 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013039 _PyUnicode_STATE(self).ready = 1;
13040 _PyUnicode_WSTR(self) = NULL;
13041 _PyUnicode_UTF8_LENGTH(self) = 0;
13042 _PyUnicode_UTF8(self) = NULL;
13043 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013044 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013045
13046 share_utf8 = 0;
13047 share_wstr = 0;
13048 if (kind == PyUnicode_1BYTE_KIND) {
13049 char_size = 1;
13050 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13051 share_utf8 = 1;
13052 }
13053 else if (kind == PyUnicode_2BYTE_KIND) {
13054 char_size = 2;
13055 if (sizeof(wchar_t) == 2)
13056 share_wstr = 1;
13057 }
13058 else {
13059 assert(kind == PyUnicode_4BYTE_KIND);
13060 char_size = 4;
13061 if (sizeof(wchar_t) == 4)
13062 share_wstr = 1;
13063 }
13064
13065 /* Ensure we won't overflow the length. */
13066 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13067 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013068 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013069 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013070 data = PyObject_MALLOC((length + 1) * char_size);
13071 if (data == NULL) {
13072 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013073 goto onError;
13074 }
13075
Victor Stinnerc3c74152011-10-02 20:39:55 +020013076 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013077 if (share_utf8) {
13078 _PyUnicode_UTF8_LENGTH(self) = length;
13079 _PyUnicode_UTF8(self) = data;
13080 }
13081 if (share_wstr) {
13082 _PyUnicode_WSTR_LENGTH(self) = length;
13083 _PyUnicode_WSTR(self) = (wchar_t *)data;
13084 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013085
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013086 Py_MEMCPY(data, PyUnicode_DATA(unicode),
13087 PyUnicode_KIND_SIZE(kind, length + 1));
13088 Py_DECREF(unicode);
13089 return (PyObject *)self;
13090
13091onError:
13092 Py_DECREF(unicode);
13093 Py_DECREF(self);
13094 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013095}
13096
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013097PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000013098 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000013099\n\
Collin Winterd474ce82007-08-07 19:42:11 +000013100Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000013101encoding defaults to the current default string encoding.\n\
13102errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013103
/* Defined further down, after the iterator type; needed for tp_iter. */
static PyObject *unicode_iter(PyObject *seq);

/* Type object of str.  Slots are initialized positionally, so the order of
   the entries below must not be changed. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                          /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                              /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                              /* tp_print */
    0,                              /* tp_getattr */
    0,                              /* tp_setattr */
    0,                              /* tp_reserved */
    unicode_repr,                   /* tp_repr */
    &unicode_as_number,             /* tp_as_number */
    &unicode_as_sequence,           /* tp_as_sequence */
    &unicode_as_mapping,            /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash*/
    0,                              /* tp_call*/
    (reprfunc) unicode_str,         /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                              /* tp_setattro */
    0,                              /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
    unicode_doc,                    /* tp_doc */
    0,                              /* tp_traverse */
    0,                              /* tp_clear */
    PyUnicode_RichCompare,          /* tp_richcompare */
    0,                              /* tp_weaklistoffset */
    unicode_iter,                   /* tp_iter */
    0,                              /* tp_iternext */
    unicode_methods,                /* tp_methods */
    0,                              /* tp_members */
    0,                              /* tp_getset */
    &PyBaseObject_Type,             /* tp_base */
    0,                              /* tp_dict */
    0,                              /* tp_descr_get */
    0,                              /* tp_descr_set */
    0,                              /* tp_dictoffset */
    0,                              /* tp_init */
    0,                              /* tp_alloc */
    unicode_new,                    /* tp_new */
    PyObject_Del,                   /* tp_free */
};
13149
13150/* Initialize the Unicode implementation */
13151
Thomas Wouters78890102000-07-22 19:25:51 +000013152void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013153{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013154 int i;
13155
Thomas Wouters477c8d52006-05-27 19:21:47 +000013156 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013157 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013158 0x000A, /* LINE FEED */
13159 0x000D, /* CARRIAGE RETURN */
13160 0x001C, /* FILE SEPARATOR */
13161 0x001D, /* GROUP SEPARATOR */
13162 0x001E, /* RECORD SEPARATOR */
13163 0x0085, /* NEXT LINE */
13164 0x2028, /* LINE SEPARATOR */
13165 0x2029, /* PARAGRAPH SEPARATOR */
13166 };
13167
Fred Drakee4315f52000-05-09 19:53:39 +000013168 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013169 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013170 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013171 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013172
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013173 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013174 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013175 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013176 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013177
13178 /* initialize the linebreak bloom filter */
13179 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013180 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013181 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013182
13183 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013184}
13185
13186/* Finalize the Unicode implementation */
13187
/* Always reports 0: this function releases nothing. */
int
PyUnicode_ClearFreeList(void)
{
    const int released = 0;

    return released;
}
13193
Guido van Rossumd57fd912000-03-10 22:53:23 +000013194void
Thomas Wouters78890102000-07-22 19:25:51 +000013195_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013196{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013197 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013198
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013199 Py_XDECREF(unicode_empty);
13200 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013201
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013202 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013203 if (unicode_latin1[i]) {
13204 Py_DECREF(unicode_latin1[i]);
13205 unicode_latin1[i] = NULL;
13206 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013207 }
Christian Heimesa156e092008-02-16 07:38:31 +000013208 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013209}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013210
/* Intern the string referenced by *p, in place.

   If an equal string is already stored in the `interned` dict, *p is
   replaced by a new reference to that string and the reference previously
   held in *p is released.  Otherwise *p itself is inserted into the dict
   and marked SSTATE_INTERNED_MORTAL.

   Nothing is done for instances of str subclasses or for strings that are
   already interned.  Failures are not reported: any exception raised while
   creating or updating the dict is cleared and *p is left as it is. */
void
PyUnicode_InternInPlace(PyObject **p)
{
    register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
    PyObject *t;
#ifdef Py_DEBUG
    assert(s != NULL);
    assert(_PyUnicode_CHECK(s));
#else
    if (s == NULL || !PyUnicode_Check(s))
        return;
#endif
    /* If it's a subclass, we don't really know what putting
       it in the interned dict might do. */
    if (!PyUnicode_CheckExact(s))
        return;
    if (PyUnicode_CHECK_INTERNED(s))
        return;
    if (_PyUnicode_READY_REPLACE(p)) {
        assert(0 && "_PyUnicode_READY_REPLACE fail in PyUnicode_InternInPlace");
        return;
    }
    /* Re-read *p: the call above receives `p` and may have stored a
       different object there. */
    s = (PyUnicodeObject *)(*p);
    /* The interned dict is created lazily on first use. */
    if (interned == NULL) {
        interned = PyDict_New();
        if (interned == NULL) {
            PyErr_Clear(); /* Don't leave an exception */
            return;
        }
    }
    /* It might be that the GetItem call fails even
       though the key is present in the dictionary,
       namely when this happens during a stack overflow. */
    Py_ALLOW_RECURSION
    t = PyDict_GetItem(interned, (PyObject *)s);
    Py_END_ALLOW_RECURSION

    if (t) {
        /* An equal string is already interned: hand that one out instead. */
        Py_INCREF(t);
        Py_DECREF(*p);
        *p = t;
        return;
    }

    PyThreadState_GET()->recursion_critical = 1;
    if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
        PyErr_Clear();
        PyThreadState_GET()->recursion_critical = 0;
        return;
    }
    PyThreadState_GET()->recursion_critical = 0;
    /* The two references in interned are not counted by refcnt.
       The deallocator will take care of this */
    Py_REFCNT(s) -= 2;
    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
}
13267
13268void
13269PyUnicode_InternImmortal(PyObject **p)
13270{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013271 PyUnicodeObject *u = (PyUnicodeObject *)*p;
13272
Benjamin Peterson14339b62009-01-31 16:36:08 +000013273 PyUnicode_InternInPlace(p);
13274 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013275 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013276 Py_INCREF(*p);
13277 }
Walter Dörwald16807132007-05-25 13:52:07 +000013278}
13279
13280PyObject *
13281PyUnicode_InternFromString(const char *cp)
13282{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013283 PyObject *s = PyUnicode_FromString(cp);
13284 if (s == NULL)
13285 return NULL;
13286 PyUnicode_InternInPlace(&s);
13287 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013288}
13289
Alexander Belopolsky40018472011-02-26 01:02:56 +000013290void
13291_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013292{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013293 PyObject *keys;
13294 PyUnicodeObject *s;
13295 Py_ssize_t i, n;
13296 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013297
Benjamin Peterson14339b62009-01-31 16:36:08 +000013298 if (interned == NULL || !PyDict_Check(interned))
13299 return;
13300 keys = PyDict_Keys(interned);
13301 if (keys == NULL || !PyList_Check(keys)) {
13302 PyErr_Clear();
13303 return;
13304 }
Walter Dörwald16807132007-05-25 13:52:07 +000013305
Benjamin Peterson14339b62009-01-31 16:36:08 +000013306 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13307 detector, interned unicode strings are not forcibly deallocated;
13308 rather, we give them their stolen references back, and then clear
13309 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000013310
Benjamin Peterson14339b62009-01-31 16:36:08 +000013311 n = PyList_GET_SIZE(keys);
13312 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000013313 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013314 for (i = 0; i < n; i++) {
13315 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013316 if (PyUnicode_READY(s) == -1) {
13317 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013318 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013319 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013320 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013321 case SSTATE_NOT_INTERNED:
13322 /* XXX Shouldn't happen */
13323 break;
13324 case SSTATE_INTERNED_IMMORTAL:
13325 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013326 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013327 break;
13328 case SSTATE_INTERNED_MORTAL:
13329 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013330 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013331 break;
13332 default:
13333 Py_FatalError("Inconsistent interned string state.");
13334 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013335 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013336 }
13337 fprintf(stderr, "total size of all interned strings: "
13338 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13339 "mortal/immortal\n", mortal_size, immortal_size);
13340 Py_DECREF(keys);
13341 PyDict_Clear(interned);
13342 Py_DECREF(interned);
13343 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000013344}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013345
13346
13347/********************* Unicode Iterator **************************/
13348
/* Iterator over the characters of a str object. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;     /* Index of the next character to return */
    PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
13354
13355static void
13356unicodeiter_dealloc(unicodeiterobject *it)
13357{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013358 _PyObject_GC_UNTRACK(it);
13359 Py_XDECREF(it->it_seq);
13360 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013361}
13362
13363static int
13364unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
13365{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013366 Py_VISIT(it->it_seq);
13367 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013368}
13369
13370static PyObject *
13371unicodeiter_next(unicodeiterobject *it)
13372{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013373 PyUnicodeObject *seq;
13374 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013375
Benjamin Peterson14339b62009-01-31 16:36:08 +000013376 assert(it != NULL);
13377 seq = it->it_seq;
13378 if (seq == NULL)
13379 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013380 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013381
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013382 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13383 int kind = PyUnicode_KIND(seq);
13384 void *data = PyUnicode_DATA(seq);
13385 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13386 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013387 if (item != NULL)
13388 ++it->it_index;
13389 return item;
13390 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013391
Benjamin Peterson14339b62009-01-31 16:36:08 +000013392 Py_DECREF(seq);
13393 it->it_seq = NULL;
13394 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013395}
13396
13397static PyObject *
13398unicodeiter_len(unicodeiterobject *it)
13399{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013400 Py_ssize_t len = 0;
13401 if (it->it_seq)
13402 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
13403 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013404}
13405
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");

/* Method table of the str iterator type (tp_methods of PyUnicodeIter_Type). */
static PyMethodDef unicodeiter_methods[] = {
    /* Takes no arguments; implemented by unicodeiter_len(). */
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL,      NULL}       /* sentinel */
};
13413
/* Type object of the iterator returned by unicode_iter().  Slots are
   initialized positionally, so the order of the entries must not change. */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",                     /* tp_name */
    sizeof(unicodeiterobject),          /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_reserved */
    0,                                  /* tp_repr */
    0,                                  /* tp_as_number */
    0,                                  /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    0,                                  /* tp_hash */
    0,                                  /* tp_call */
    0,                                  /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    PyObject_SelfIter,                  /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,                /* tp_methods */
    0,                                  /* tp_members */
};
13446
13447static PyObject *
13448unicode_iter(PyObject *seq)
13449{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013450 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013451
Benjamin Peterson14339b62009-01-31 16:36:08 +000013452 if (!PyUnicode_Check(seq)) {
13453 PyErr_BadInternalCall();
13454 return NULL;
13455 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013456 if (PyUnicode_READY(seq) == -1)
13457 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013458 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
13459 if (it == NULL)
13460 return NULL;
13461 it->it_index = 0;
13462 Py_INCREF(seq);
13463 it->it_seq = (PyUnicodeObject *)seq;
13464 _PyObject_GC_TRACK(it);
13465 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013466}
13467
/* "uniops.h" is included twice with different UNIOP/UNIOP_t settings:
   first with the Py_UNICODE_ name prefix and the Py_UNICODE character type,
   then with the Py_UCS4_ prefix and the Py_UCS4 character type.
   NOTE(review): presumably the header defines one set of helpers per
   inclusion, named through UNIOP() -- confirm against uniops.h. */
#define UNIOP(x) Py_UNICODE_##x
#define UNIOP_t Py_UNICODE
#include "uniops.h"
#undef UNIOP
#undef UNIOP_t
#define UNIOP(x) Py_UCS4_##x
#define UNIOP_t Py_UCS4
#include "uniops.h"
#undef UNIOP
#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000013478
Victor Stinner71133ff2010-09-01 23:43:53 +000013479Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000013480PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000013481{
13482 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
13483 Py_UNICODE *copy;
13484 Py_ssize_t size;
13485
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013486 if (!PyUnicode_Check(unicode)) {
13487 PyErr_BadArgument();
13488 return NULL;
13489 }
Victor Stinner71133ff2010-09-01 23:43:53 +000013490 /* Ensure we won't overflow the size. */
13491 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
13492 PyErr_NoMemory();
13493 return NULL;
13494 }
13495 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
13496 size *= sizeof(Py_UNICODE);
13497 copy = PyMem_Malloc(size);
13498 if (copy == NULL) {
13499 PyErr_NoMemory();
13500 return NULL;
13501 }
13502 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
13503 return copy;
13504}
Martin v. Löwis5b222132007-06-10 09:51:05 +000013505
/* A _string module, to export formatter_parser and formatter_field_name_split
   to the string.Formatter class implemented in Python. */

/* Functions of the _string module; each takes exactly one argument
   (METH_O). */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL}    /* sentinel */
};
13516
/* Module definition passed to PyModule_Create() by PyInit__string(). */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    "_string",                              /* m_name */
    PyDoc_STR("string helper module"),      /* m_doc */
    0,                                      /* m_size */
    _string_methods,                        /* m_methods */
    /* the remaining optional slots are not used */
    NULL,
    NULL,
    NULL,
    NULL
};
13528
13529PyMODINIT_FUNC
13530PyInit__string(void)
13531{
13532 return PyModule_Create(&_string_module);
13533}
13534
13535
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013536#ifdef __cplusplus
13537}
13538#endif