blob: 364de90877c24c34c40c32416713e0b946e4cc50 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
/* Byte-order selection.  Little endian is assumed unless the build
   configuration defines WORDS_BIGENDIAN. */

#if defined(WORDS_BIGENDIAN)
#  define BYTEORDER_IS_BIG_ENDIAN
#else
#  define BYTEORDER_IS_LITTLE_ENDIAN
#endif
57
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000058/* --- Globals ------------------------------------------------------------
59
60 The globals are initialized by the _PyUnicode_Init() API and should
61 not be used before calling that API.
62
63*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000064
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000065
66#ifdef __cplusplus
67extern "C" {
68#endif
69
/* Largest valid Unicode code point: U+10FFFF (1,114,111 decimal). */
#define MAX_UNICODE 0x10ffff

/* Object check used inside the assertions of this file's macros.  Debug
   builds run the structural consistency checker (without inspecting the
   content); other builds only perform the type check. */
#if defined(Py_DEBUG)
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020078
/* Field accessors.  The underscore-prefixed macros are plain (assignable)
   member accesses through a cast; the variants without underscore and the
   KIND / GET_LENGTH macros additionally assert that `op` is a valid
   unicode object. */

/* utf8 member of the PyCompactUnicodeObject view of op. */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject *)(op))->utf8)

/* UTF-8 buffer of a ready string: for compact ASCII strings this is the
   memory that directly follows the PyASCIIObject header, otherwise the
   utf8 member. */
#define PyUnicode_UTF8(op)                          \
    (assert(_PyUnicode_CHECK(op)),                  \
     assert(PyUnicode_IS_READY(op)),                \
     PyUnicode_IS_COMPACT_ASCII(op) ?               \
         ((char*)((PyASCIIObject*)(op) + 1)) :      \
         _PyUnicode_UTF8(op))

/* utf8_length member of the PyCompactUnicodeObject view of op. */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject *)(op))->utf8_length)

/* UTF-8 length of a ready string: the `length` member for compact ASCII
   strings, otherwise the utf8_length member. */
#define PyUnicode_UTF8_LENGTH(op)                   \
    (assert(_PyUnicode_CHECK(op)),                  \
     assert(PyUnicode_IS_READY(op)),                \
     PyUnicode_IS_COMPACT_ASCII(op) ?               \
         ((PyASCIIObject*)(op))->length :           \
         _PyUnicode_UTF8_LENGTH(op))

/* wstr / wstr_length members. */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->wstr_length)

/* length, state and hash members of the PyASCIIObject header. */
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)

/* state.kind and length, with an assertion on op. */
#define _PyUnicode_KIND(op)                         \
    (assert(_PyUnicode_CHECK(op)),                  \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                   \
    (assert(_PyUnicode_CHECK(op)),                  \
     ((PyASCIIObject *)(op))->length)

/* data.any member of the PyUnicodeObject view of op. */
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200113
/* File-local replacement for PyUnicode_READY(): asserts that op passes
   _PyUnicode_CHECK, evaluates to 0 when the string is already ready and
   otherwise to the result of _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                                 \
    (assert(_PyUnicode_CHECK(op)),                          \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200120
/* True if the utf8 pointer of op is the very same pointer as its character
   data, i.e. no separate UTF-8 block is used.  Must not be used on compact
   ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of op is the very same pointer as its character
   data.  The macro refers to its own parameter only, so it works whatever
   the caller's variable is called. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
128
/* True if the Unicode object owns a UTF-8 memory block of its own: it is
   not a compact ASCII string, its utf8 pointer is set, and that pointer
   is not the character data itself. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                      \
    (assert(_PyUnicode_CHECK(op)),                          \
     (!PyUnicode_IS_COMPACT_ASCII(op)                       \
      && _PyUnicode_UTF8(op)                                \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object owns a wstr memory block of its own: the
   wstr pointer is set and either the string is not ready yet or wstr is
   not the character data itself. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                      \
    (assert(_PyUnicode_CHECK(op)),                          \
     (_PyUnicode_WSTR(op) &&                                \
      (!PyUnicode_IS_READY(op) ||                           \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
144
/* Generic helper macro to convert characters of different types.

   from_type, to_type: valid type names of the source / destination
                       characters.
   begin, end:         pointers of type "from_type *" delimiting the source
                       characters (end is exclusive).
   to:                 pointer to the buffer receiving the converted
                       characters; it is cast to "to_type *".  Any pointer
                       expression is accepted (the argument is parenthesized
                       before the cast).

   Each source character is converted with a plain cast to to_type.  The
   main loop handles four characters per iteration; the remaining zero to
   three characters are copied one by one.  All macro-local names carry a
   leading underscore to stay out of the caller's namespace. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~ (Py_ssize_t) 3);            \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200168
Walter Dörwald16807132007-05-25 13:52:07 +0000169/* This dictionary holds all interned unicode strings. Note that references
170 to strings in this dictionary are *not* counted in the string's ob_refcnt.
171 When the interned string reaches a refcnt of 0 the string deallocation
172 function will delete the reference from this dictionary.
173
174 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000175 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000176*/
177static PyObject *interned;
178
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000179/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200180static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000181
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200182/* List of static strings. */
183static _Py_Identifier *static_strings;
184
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000185/* Single character Unicode strings in the Latin-1 range are being
186 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200187static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000188
/* Fast detection of the most frequent whitespace characters: the table is
   indexed by an ASCII code (0..127) and holds 1 for whitespace, 0
   otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
         0x09 CHARACTER TABULATION, 0x0A LINE FEED, 0x0B LINE TABULATION,
         0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
         0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
         0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
219
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200220/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200221static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200222static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200223static void copy_characters(
224 PyObject *to, Py_ssize_t to_start,
225 PyObject *from, Py_ssize_t from_start,
226 Py_ssize_t how_many);
Victor Stinner488fa492011-12-12 00:01:39 +0100227static int unicode_modifiable(PyObject *unicode);
228
Victor Stinnerfe226c02011-10-03 03:52:20 +0200229
Alexander Belopolsky40018472011-02-26 01:02:56 +0000230static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200231unicode_fromascii(const unsigned char *s, Py_ssize_t size);
232static PyObject *
233_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
234static PyObject *
235_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
236static PyObject *
237_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
238
239static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000240unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000241 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100242 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000243 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
244
Alexander Belopolsky40018472011-02-26 01:02:56 +0000245static void
246raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300247 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100248 PyObject *unicode,
249 Py_ssize_t startpos, Py_ssize_t endpos,
250 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000251
/* Same for linebreaks: the table is indexed by an ASCII code (0..127) and
   holds 1 for line break characters, 0 otherwise.  It is a read-only
   lookup table, hence const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
         0x0A LINE FEED, 0x0B LINE TABULATION,
         0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
         0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
279
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300280/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
281 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000282Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000283PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000284{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000285#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000286 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000287#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000288 /* This is actually an illegal character, so it should
289 not be passed to unichr. */
290 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000291#endif
292}
293
#ifdef Py_DEBUG
/* Debug-build structural check of a unicode object.

   op:            object to verify; must satisfy PyUnicode_Check().
   check_content: when non-zero and the string has a real kind (not
                  PyUnicode_WCHAR_KIND), the characters are scanned as well
                  to verify that the narrowest possible kind is used and
                  that the data is null-terminated.

   Every violated invariant triggers an assert(); when nothing fails the
   function returns 1, so that it can itself be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact != 1) {
            /* not compact: the characters live behind data.any */
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind != PyUnicode_WCHAR_KIND) {
                /* ready string with a real kind */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else {
                    assert (compact->utf8 != data);
                }
            }
            else {
                /* not ready yet: only the wstr representation exists */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
        }
        else {
            /* compact, non-ASCII: the characters follow the header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr shares the character data exactly when the kind has
               the same width as wchar_t */
            int same_width_as_wchar;
#if SIZEOF_WCHAR_T == 2
            same_width_as_wchar = (kind == PyUnicode_2BYTE_KIND);
#else
            same_width_as_wchar = (kind == PyUnicode_4BYTE_KIND);
#endif
            if (same_width_as_wchar) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        /* a missing cache must have a zero length */
        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        data = PyUnicode_DATA(ascii);
        i = 0;
        while (i < ascii->length) {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
            i++;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else {
                assert(maxchar < 128);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
            break;
        default:
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
            break;
        }
        /* the character after the last one must be a null terminator */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
409
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100410static PyObject*
411unicode_result_wchar(PyObject *unicode)
412{
413#ifndef Py_DEBUG
414 Py_ssize_t len;
415
416 assert(Py_REFCNT(unicode) == 1);
417
418 len = _PyUnicode_WSTR_LENGTH(unicode);
419 if (len == 0) {
420 Py_INCREF(unicode_empty);
421 Py_DECREF(unicode);
422 return unicode_empty;
423 }
424
425 if (len == 1) {
426 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
427 if (ch < 256) {
428 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
429 Py_DECREF(unicode);
430 return latin1_char;
431 }
432 }
433
434 if (_PyUnicode_Ready(unicode) < 0) {
435 Py_XDECREF(unicode);
436 return NULL;
437 }
438#else
439 /* don't make the result ready in debug mode to ensure that the caller
440 makes the string ready before using it */
441 assert(_PyUnicode_CheckConsistency(unicode, 1));
442#endif
443 return unicode;
444}
445
446static PyObject*
447unicode_result_ready(PyObject *unicode)
448{
449 Py_ssize_t length;
450
451 length = PyUnicode_GET_LENGTH(unicode);
452 if (length == 0) {
453 if (unicode != unicode_empty) {
454 Py_INCREF(unicode_empty);
455 Py_DECREF(unicode);
456 }
457 return unicode_empty;
458 }
459
460 if (length == 1) {
461 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
462 if (ch < 256) {
463 PyObject *latin1_char = unicode_latin1[ch];
464 if (latin1_char != NULL) {
465 if (unicode != latin1_char) {
466 Py_INCREF(latin1_char);
467 Py_DECREF(unicode);
468 }
469 return latin1_char;
470 }
471 else {
472 assert(_PyUnicode_CheckConsistency(unicode, 1));
473 Py_INCREF(unicode);
474 unicode_latin1[ch] = unicode;
475 return unicode;
476 }
477 }
478 }
479
480 assert(_PyUnicode_CheckConsistency(unicode, 1));
481 return unicode;
482}
483
484static PyObject*
485unicode_result(PyObject *unicode)
486{
487 assert(_PyUnicode_CHECK(unicode));
488 if (PyUnicode_IS_READY(unicode))
489 return unicode_result_ready(unicode);
490 else
491 return unicode_result_wchar(unicode);
492}
493
Victor Stinnerc4b49542011-12-11 22:44:26 +0100494static PyObject*
495unicode_result_unchanged(PyObject *unicode)
496{
497 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500498 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100499 return NULL;
500 Py_INCREF(unicode);
501 return unicode;
502 }
503 else
504 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100505 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100506}
507
#if defined(HAVE_MBCS)
/* Windows version information; only present when HAVE_MBCS is defined.
   NOTE(review): presumably filled in during module initialization --
   the code doing so is not visible here. */
static OSVERSIONINFOEX winver;
#endif
511
Thomas Wouters477c8d52006-05-27 19:21:47 +0000512/* --- Bloom Filters ----------------------------------------------------- */
513
514/* stuff to implement simple "bloom filters" for Unicode characters.
515 to keep things simple, we use a single bitmask, using the least 5
516 bits from each unicode characters as the bit index. */
517
518/* the linebreak mask is set up by Unicode_Init below */
519
Antoine Pitrouf068f942010-01-13 14:19:12 +0000520#if LONG_BIT >= 128
521#define BLOOM_WIDTH 128
522#elif LONG_BIT >= 64
523#define BLOOM_WIDTH 64
524#elif LONG_BIT >= 32
525#define BLOOM_WIDTH 32
526#else
527#error "LONG_BIT is smaller than 32"
528#endif
529
/* Integer type holding one bloom mask. */
#define BLOOM_MASK unsigned long

/* Bloom mask consulted by BLOOM_LINEBREAK() for non-ASCII characters. */
static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD(mask, ch): set the bit selected by ch in the lvalue `mask`.
   BLOOM(mask, ch):     non-zero if the bit selected by ch is set in `mask`;
                        a zero result means ch was never added, a non-zero
                        one only that it may have been.
   Both arguments are parenthesized, so arbitrary expressions can be
   passed. */
#define BLOOM_ADD(mask, ch) (((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#define BLOOM(mask, ch)     (((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Line break test: ASCII characters are looked up in ascii_linebreak;
   for other characters the bloom mask is consulted first so that
   Py_UNICODE_ISLINEBREAK() is only evaluated for possible candidates. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000540
Alexander Belopolsky40018472011-02-26 01:02:56 +0000541Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200542make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000543{
544 /* calculate simple bloom-style bitmask for a given unicode string */
545
Antoine Pitrouf068f942010-01-13 14:19:12 +0000546 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000547 Py_ssize_t i;
548
549 mask = 0;
550 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200551 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000552
553 return mask;
554}
555
/* Membership test: true if chr occurs in the unicode object str.  The
   bloom mask of str is checked first, so that PyUnicode_FindChar() only
   runs when the character is a possible member. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr)                                                   \
     && (PyUnicode_FindChar(str, chr, 0,                                \
                            PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000559
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200560/* Compilation of templated routines */
561
562#include "stringlib/asciilib.h"
563#include "stringlib/fastsearch.h"
564#include "stringlib/partition.h"
565#include "stringlib/split.h"
566#include "stringlib/count.h"
567#include "stringlib/find.h"
568#include "stringlib/find_max_char.h"
569#include "stringlib/localeutil.h"
570#include "stringlib/undef.h"
571
572#include "stringlib/ucs1lib.h"
573#include "stringlib/fastsearch.h"
574#include "stringlib/partition.h"
575#include "stringlib/split.h"
576#include "stringlib/count.h"
577#include "stringlib/find.h"
578#include "stringlib/find_max_char.h"
579#include "stringlib/localeutil.h"
580#include "stringlib/undef.h"
581
582#include "stringlib/ucs2lib.h"
583#include "stringlib/fastsearch.h"
584#include "stringlib/partition.h"
585#include "stringlib/split.h"
586#include "stringlib/count.h"
587#include "stringlib/find.h"
588#include "stringlib/find_max_char.h"
589#include "stringlib/localeutil.h"
590#include "stringlib/undef.h"
591
592#include "stringlib/ucs4lib.h"
593#include "stringlib/fastsearch.h"
594#include "stringlib/partition.h"
595#include "stringlib/split.h"
596#include "stringlib/count.h"
597#include "stringlib/find.h"
598#include "stringlib/find_max_char.h"
599#include "stringlib/localeutil.h"
600#include "stringlib/undef.h"
601
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200602#include "stringlib/unicodedefs.h"
603#include "stringlib/fastsearch.h"
604#include "stringlib/count.h"
605#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100606#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200607
Guido van Rossumd57fd912000-03-10 22:53:23 +0000608/* --- Unicode Object ----------------------------------------------------- */
609
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200610static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200611fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200612
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200613Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
614 Py_ssize_t size, Py_UCS4 ch,
615 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200616{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200617 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
618
619 switch (kind) {
620 case PyUnicode_1BYTE_KIND:
621 {
622 Py_UCS1 ch1 = (Py_UCS1) ch;
623 if (ch1 == ch)
624 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
625 else
626 return -1;
627 }
628 case PyUnicode_2BYTE_KIND:
629 {
630 Py_UCS2 ch2 = (Py_UCS2) ch;
631 if (ch2 == ch)
632 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
633 else
634 return -1;
635 }
636 case PyUnicode_4BYTE_KIND:
637 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
638 default:
639 assert(0);
640 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200641 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200642}
643
Victor Stinnerfe226c02011-10-03 03:52:20 +0200644static PyObject*
645resize_compact(PyObject *unicode, Py_ssize_t length)
646{
647 Py_ssize_t char_size;
648 Py_ssize_t struct_size;
649 Py_ssize_t new_size;
650 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100651 PyObject *new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200652 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100653 assert(PyUnicode_IS_COMPACT(unicode));
654
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200655 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100656 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200657 struct_size = sizeof(PyASCIIObject);
658 else
659 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200660 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200661
Victor Stinnerfe226c02011-10-03 03:52:20 +0200662 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
663 PyErr_NoMemory();
664 return NULL;
665 }
666 new_size = (struct_size + (length + 1) * char_size);
667
Victor Stinner84def372011-12-11 20:04:56 +0100668 _Py_DEC_REFTOTAL;
669 _Py_ForgetReference(unicode);
670
671 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
672 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100673 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200674 PyErr_NoMemory();
675 return NULL;
676 }
Victor Stinner84def372011-12-11 20:04:56 +0100677 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200678 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100679
Victor Stinnerfe226c02011-10-03 03:52:20 +0200680 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200681 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200682 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100683 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200684 _PyUnicode_WSTR_LENGTH(unicode) = length;
685 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200686 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
687 length, 0);
688 return unicode;
689}
690
Alexander Belopolsky40018472011-02-26 01:02:56 +0000691static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200692resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000693{
Victor Stinner95663112011-10-04 01:03:50 +0200694 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100695 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200696 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200697 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000698
Victor Stinnerfe226c02011-10-03 03:52:20 +0200699 if (PyUnicode_IS_READY(unicode)) {
700 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200701 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200702 void *data;
703
704 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200705 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200706 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
707 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200708
709 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
710 PyErr_NoMemory();
711 return -1;
712 }
713 new_size = (length + 1) * char_size;
714
Victor Stinner7a9105a2011-12-12 00:13:42 +0100715 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
716 {
717 PyObject_DEL(_PyUnicode_UTF8(unicode));
718 _PyUnicode_UTF8(unicode) = NULL;
719 _PyUnicode_UTF8_LENGTH(unicode) = 0;
720 }
721
Victor Stinnerfe226c02011-10-03 03:52:20 +0200722 data = (PyObject *)PyObject_REALLOC(data, new_size);
723 if (data == NULL) {
724 PyErr_NoMemory();
725 return -1;
726 }
727 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200728 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200729 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200730 _PyUnicode_WSTR_LENGTH(unicode) = length;
731 }
732 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200733 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200734 _PyUnicode_UTF8_LENGTH(unicode) = length;
735 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200736 _PyUnicode_LENGTH(unicode) = length;
737 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200738 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200739 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200740 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200741 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200742 }
Victor Stinner95663112011-10-04 01:03:50 +0200743 assert(_PyUnicode_WSTR(unicode) != NULL);
744
745 /* check for integer overflow */
746 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
747 PyErr_NoMemory();
748 return -1;
749 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100750 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200751 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100752 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200753 if (!wstr) {
754 PyErr_NoMemory();
755 return -1;
756 }
757 _PyUnicode_WSTR(unicode) = wstr;
758 _PyUnicode_WSTR(unicode)[length] = 0;
759 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200760 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000761 return 0;
762}
763
Victor Stinnerfe226c02011-10-03 03:52:20 +0200764static PyObject*
765resize_copy(PyObject *unicode, Py_ssize_t length)
766{
767 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100768 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200769 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100770
Benjamin Petersonbac79492012-01-14 13:34:47 -0500771 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100772 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200773
774 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
775 if (copy == NULL)
776 return NULL;
777
778 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200779 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200780 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200781 }
782 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200783 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100784
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200785 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200786 if (w == NULL)
787 return NULL;
788 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
789 copy_length = Py_MIN(copy_length, length);
790 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
791 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200792 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200793 }
794}
795
Guido van Rossumd57fd912000-03-10 22:53:23 +0000796/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000797 Ux0000 terminated; some code (e.g. new_identifier)
798 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000799
800 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000801 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000802
803*/
804
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200805#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200806static int unicode_old_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200807#endif
808
Alexander Belopolsky40018472011-02-26 01:02:56 +0000809static PyUnicodeObject *
810_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000811{
812 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200813 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000814
Thomas Wouters477c8d52006-05-27 19:21:47 +0000815 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000816 if (length == 0 && unicode_empty != NULL) {
817 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200818 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000819 }
820
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000821 /* Ensure we won't overflow the size. */
822 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
823 return (PyUnicodeObject *)PyErr_NoMemory();
824 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200825 if (length < 0) {
826 PyErr_SetString(PyExc_SystemError,
827 "Negative size passed to _PyUnicode_New");
828 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000829 }
830
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200831#ifdef Py_DEBUG
832 ++unicode_old_new_calls;
833#endif
834
835 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
836 if (unicode == NULL)
837 return NULL;
838 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
839 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
840 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100841 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000842 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100843 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000844 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200845
Jeremy Hyltond8082792003-09-16 19:41:39 +0000846 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000847 * the caller fails before initializing str -- unicode_resize()
848 * reads str[0], and the Keep-Alive optimization can keep memory
849 * allocated for str alive across a call to unicode_dealloc(unicode).
850 * We don't want unicode_resize to read uninitialized memory in
851 * that case.
852 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200853 _PyUnicode_WSTR(unicode)[0] = 0;
854 _PyUnicode_WSTR(unicode)[length] = 0;
855 _PyUnicode_WSTR_LENGTH(unicode) = length;
856 _PyUnicode_HASH(unicode) = -1;
857 _PyUnicode_STATE(unicode).interned = 0;
858 _PyUnicode_STATE(unicode).kind = 0;
859 _PyUnicode_STATE(unicode).compact = 0;
860 _PyUnicode_STATE(unicode).ready = 0;
861 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200862 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200863 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200864 _PyUnicode_UTF8(unicode) = NULL;
865 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100866 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000867 return unicode;
868}
869
Victor Stinnerf42dc442011-10-02 23:33:16 +0200870static const char*
871unicode_kind_name(PyObject *unicode)
872{
Victor Stinner42dfd712011-10-03 14:41:45 +0200873 /* don't check consistency: unicode_kind_name() is called from
874 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200875 if (!PyUnicode_IS_COMPACT(unicode))
876 {
877 if (!PyUnicode_IS_READY(unicode))
878 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600879 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200880 {
881 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200882 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200883 return "legacy ascii";
884 else
885 return "legacy latin1";
886 case PyUnicode_2BYTE_KIND:
887 return "legacy UCS2";
888 case PyUnicode_4BYTE_KIND:
889 return "legacy UCS4";
890 default:
891 return "<legacy invalid kind>";
892 }
893 }
894 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600895 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200896 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200897 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200898 return "ascii";
899 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200900 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200901 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200902 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200903 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200904 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200905 default:
906 return "<invalid compact kind>";
907 }
908}
909
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200910#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200911static int unicode_new_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200912
/* Functions wrapping macros for use in debugger */

/* Function form of the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
917
/* Function form of the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *payload = _PyUnicode_COMPACT_DATA(unicode);
    return payload;
}
921void *_PyUnicode_data(void *unicode){
922 printf("obj %p\n", unicode);
923 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
924 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
925 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
926 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
927 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
928 return PyUnicode_DATA(unicode);
929}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200930
931void
932_PyUnicode_Dump(PyObject *op)
933{
934 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200935 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
936 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
937 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200938
Victor Stinnera849a4b2011-10-03 12:12:11 +0200939 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200940 {
941 if (ascii->state.ascii)
942 data = (ascii + 1);
943 else
944 data = (compact + 1);
945 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200946 else
947 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +0200948 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
949
Victor Stinnera849a4b2011-10-03 12:12:11 +0200950 if (ascii->wstr == data)
951 printf("shared ");
952 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +0200953
Victor Stinnera3b334d2011-10-03 13:53:37 +0200954 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200955 printf(" (%zu), ", compact->wstr_length);
956 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
957 printf("shared ");
958 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200959 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200960 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200961}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200962#endif
963
964PyObject *
965PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
966{
967 PyObject *obj;
968 PyCompactUnicodeObject *unicode;
969 void *data;
970 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200971 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200972 Py_ssize_t char_size;
973 Py_ssize_t struct_size;
974
975 /* Optimization for empty strings */
976 if (size == 0 && unicode_empty != NULL) {
977 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200978 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200979 }
980
981#ifdef Py_DEBUG
982 ++unicode_new_new_calls;
983#endif
984
Victor Stinner9e9d6892011-10-04 01:02:02 +0200985 is_ascii = 0;
986 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200987 struct_size = sizeof(PyCompactUnicodeObject);
988 if (maxchar < 128) {
989 kind_state = PyUnicode_1BYTE_KIND;
990 char_size = 1;
991 is_ascii = 1;
992 struct_size = sizeof(PyASCIIObject);
993 }
994 else if (maxchar < 256) {
995 kind_state = PyUnicode_1BYTE_KIND;
996 char_size = 1;
997 }
998 else if (maxchar < 65536) {
999 kind_state = PyUnicode_2BYTE_KIND;
1000 char_size = 2;
1001 if (sizeof(wchar_t) == 2)
1002 is_sharing = 1;
1003 }
1004 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001005 if (maxchar > MAX_UNICODE) {
1006 PyErr_SetString(PyExc_SystemError,
1007 "invalid maximum character passed to PyUnicode_New");
1008 return NULL;
1009 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001010 kind_state = PyUnicode_4BYTE_KIND;
1011 char_size = 4;
1012 if (sizeof(wchar_t) == 4)
1013 is_sharing = 1;
1014 }
1015
1016 /* Ensure we won't overflow the size. */
1017 if (size < 0) {
1018 PyErr_SetString(PyExc_SystemError,
1019 "Negative size passed to PyUnicode_New");
1020 return NULL;
1021 }
1022 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1023 return PyErr_NoMemory();
1024
1025 /* Duplicated allocation code from _PyObject_New() instead of a call to
1026 * PyObject_New() so we are able to allocate space for the object and
1027 * it's data buffer.
1028 */
1029 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1030 if (obj == NULL)
1031 return PyErr_NoMemory();
1032 obj = PyObject_INIT(obj, &PyUnicode_Type);
1033 if (obj == NULL)
1034 return NULL;
1035
1036 unicode = (PyCompactUnicodeObject *)obj;
1037 if (is_ascii)
1038 data = ((PyASCIIObject*)obj) + 1;
1039 else
1040 data = unicode + 1;
1041 _PyUnicode_LENGTH(unicode) = size;
1042 _PyUnicode_HASH(unicode) = -1;
1043 _PyUnicode_STATE(unicode).interned = 0;
1044 _PyUnicode_STATE(unicode).kind = kind_state;
1045 _PyUnicode_STATE(unicode).compact = 1;
1046 _PyUnicode_STATE(unicode).ready = 1;
1047 _PyUnicode_STATE(unicode).ascii = is_ascii;
1048 if (is_ascii) {
1049 ((char*)data)[size] = 0;
1050 _PyUnicode_WSTR(unicode) = NULL;
1051 }
1052 else if (kind_state == PyUnicode_1BYTE_KIND) {
1053 ((char*)data)[size] = 0;
1054 _PyUnicode_WSTR(unicode) = NULL;
1055 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001056 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001057 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001058 }
1059 else {
1060 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001061 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001062 if (kind_state == PyUnicode_2BYTE_KIND)
1063 ((Py_UCS2*)data)[size] = 0;
1064 else /* kind_state == PyUnicode_4BYTE_KIND */
1065 ((Py_UCS4*)data)[size] = 0;
1066 if (is_sharing) {
1067 _PyUnicode_WSTR_LENGTH(unicode) = size;
1068 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1069 }
1070 else {
1071 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1072 _PyUnicode_WSTR(unicode) = NULL;
1073 }
1074 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01001075 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001076 return obj;
1077}
1078
#if SIZEOF_WCHAR_T == 2
/* Helper converting a 16-bit wchar_t buffer [begin, end) to UCS4, written
   into the 4-byte data of `unicode`.  A high surrogate immediately
   followed by a low surrogate is joined into one code point; any other
   unit, including a lone surrogate, is copied unchanged.  The other
   conversions are implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        Py_UCS4 ch;
        Py_ssize_t consumed;

        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src + 1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            ch = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            consumed = 2;
        }
        else {
            ch = *src;
            consumed = 1;
        }
        *dst++ = ch;
        src += consumed;
    }
    /* the output must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1118
Victor Stinnercd9950f2011-10-02 00:34:53 +02001119static int
Victor Stinner488fa492011-12-12 00:01:39 +01001120unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001121{
Victor Stinner488fa492011-12-12 00:01:39 +01001122 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001123 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001124 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001125 return -1;
1126 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001127 return 0;
1128}
1129
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001130static int
1131_copy_characters(PyObject *to, Py_ssize_t to_start,
1132 PyObject *from, Py_ssize_t from_start,
1133 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001134{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001135 unsigned int from_kind, to_kind;
1136 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001137 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001138
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001139 assert(PyUnicode_Check(from));
1140 assert(PyUnicode_Check(to));
1141 assert(PyUnicode_IS_READY(from));
1142 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001143
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001144 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1145 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1146 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001147
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001148 if (how_many == 0)
1149 return 0;
1150
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001151 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001152 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001153 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001154 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001155
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001156#ifdef Py_DEBUG
1157 if (!check_maxchar
1158 && (from_kind > to_kind
1159 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001160 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001161 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1162 Py_UCS4 ch;
1163 Py_ssize_t i;
1164 for (i=0; i < how_many; i++) {
1165 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1166 assert(ch <= to_maxchar);
1167 }
1168 }
1169#endif
1170 fast = (from_kind == to_kind);
1171 if (check_maxchar
1172 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1173 {
1174 /* deny latin1 => ascii */
1175 fast = 0;
1176 }
1177
1178 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001179 Py_MEMCPY((char*)to_data + to_kind * to_start,
1180 (char*)from_data + from_kind * from_start,
1181 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001182 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001183 else if (from_kind == PyUnicode_1BYTE_KIND
1184 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001185 {
1186 _PyUnicode_CONVERT_BYTES(
1187 Py_UCS1, Py_UCS2,
1188 PyUnicode_1BYTE_DATA(from) + from_start,
1189 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1190 PyUnicode_2BYTE_DATA(to) + to_start
1191 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001192 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001193 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001194 && to_kind == PyUnicode_4BYTE_KIND)
1195 {
1196 _PyUnicode_CONVERT_BYTES(
1197 Py_UCS1, Py_UCS4,
1198 PyUnicode_1BYTE_DATA(from) + from_start,
1199 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1200 PyUnicode_4BYTE_DATA(to) + to_start
1201 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001202 }
1203 else if (from_kind == PyUnicode_2BYTE_KIND
1204 && to_kind == PyUnicode_4BYTE_KIND)
1205 {
1206 _PyUnicode_CONVERT_BYTES(
1207 Py_UCS2, Py_UCS4,
1208 PyUnicode_2BYTE_DATA(from) + from_start,
1209 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1210 PyUnicode_4BYTE_DATA(to) + to_start
1211 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001212 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001213 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001214 /* check if max_char(from substring) <= max_char(to) */
1215 if (from_kind > to_kind
1216 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001217 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001218 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001219 /* slow path to check for character overflow */
1220 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001221 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001222 Py_ssize_t i;
1223
Victor Stinner56c161a2011-10-06 02:47:11 +02001224#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001225 for (i=0; i < how_many; i++) {
1226 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001227 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001228 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1229 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001230#else
1231 if (!check_maxchar) {
1232 for (i=0; i < how_many; i++) {
1233 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1234 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1235 }
1236 }
1237 else {
1238 for (i=0; i < how_many; i++) {
1239 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1240 if (ch > to_maxchar)
1241 return 1;
1242 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1243 }
1244 }
1245#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001246 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001247 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001248 assert(0 && "inconsistent state");
1249 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001250 }
1251 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001252 return 0;
1253}
1254
1255static void
1256copy_characters(PyObject *to, Py_ssize_t to_start,
1257 PyObject *from, Py_ssize_t from_start,
1258 Py_ssize_t how_many)
1259{
1260 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1261}
1262
1263Py_ssize_t
1264PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1265 PyObject *from, Py_ssize_t from_start,
1266 Py_ssize_t how_many)
1267{
1268 int err;
1269
1270 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1271 PyErr_BadInternalCall();
1272 return -1;
1273 }
1274
Benjamin Petersonbac79492012-01-14 13:34:47 -05001275 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001276 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001277 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001278 return -1;
1279
1280 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1281 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1282 PyErr_Format(PyExc_SystemError,
1283 "Cannot write %zi characters at %zi "
1284 "in a string of %zi characters",
1285 how_many, to_start, PyUnicode_GET_LENGTH(to));
1286 return -1;
1287 }
1288
1289 if (how_many == 0)
1290 return 0;
1291
Victor Stinner488fa492011-12-12 00:01:39 +01001292 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001293 return -1;
1294
1295 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1296 if (err) {
1297 PyErr_Format(PyExc_SystemError,
1298 "Cannot copy %s characters "
1299 "into a string of %s characters",
1300 unicode_kind_name(from),
1301 unicode_kind_name(to));
1302 return -1;
1303 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001304 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001305}
1306
Victor Stinner17222162011-09-28 22:15:37 +02001307/* Find the maximum code point and count the number of surrogate pairs so a
1308 correct string length can be computed before converting a string to UCS4.
1309 This function counts single surrogates as a character and not as a pair.
1310
1311 Return 0 on success, or -1 on error. */
1312static int
1313find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1314 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001315{
1316 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001317 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001318
Victor Stinnerc53be962011-10-02 21:33:54 +02001319 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320 *num_surrogates = 0;
1321 *maxchar = 0;
1322
1323 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001324#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001325 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1326 && (iter+1) < end
1327 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001328 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001329 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001330 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001331 iter += 2;
1332 }
1333 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001334#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001335 {
1336 ch = *iter;
1337 iter++;
1338 }
1339 if (ch > *maxchar) {
1340 *maxchar = ch;
1341 if (*maxchar > MAX_UNICODE) {
1342 PyErr_Format(PyExc_ValueError,
1343 "character U+%x is not in range [U+0000; U+10ffff]",
1344 ch);
1345 return -1;
1346 }
1347 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001348 }
1349 return 0;
1350}
1351
1352#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001353static int unicode_ready_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001354#endif
1355
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001356int
1357_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001358{
1359 wchar_t *end;
1360 Py_UCS4 maxchar = 0;
1361 Py_ssize_t num_surrogates;
1362#if SIZEOF_WCHAR_T == 2
1363 Py_ssize_t length_wo_surrogates;
1364#endif
1365
Georg Brandl7597add2011-10-05 16:36:47 +02001366 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001367 strings were created using _PyObject_New() and where no canonical
1368 representation (the str field) has been set yet aka strings
1369 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001370 assert(_PyUnicode_CHECK(unicode));
1371 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001372 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001373 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001374 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001375 /* Actually, it should neither be interned nor be anything else: */
1376 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001377
1378#ifdef Py_DEBUG
1379 ++unicode_ready_calls;
1380#endif
1381
1382 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001383 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001384 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001385 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001386
1387 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001388 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1389 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001390 PyErr_NoMemory();
1391 return -1;
1392 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001393 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001394 _PyUnicode_WSTR(unicode), end,
1395 PyUnicode_1BYTE_DATA(unicode));
1396 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1397 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1398 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1399 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001400 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001401 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001402 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001403 }
1404 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001405 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001406 _PyUnicode_UTF8(unicode) = NULL;
1407 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001408 }
1409 PyObject_FREE(_PyUnicode_WSTR(unicode));
1410 _PyUnicode_WSTR(unicode) = NULL;
1411 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1412 }
1413 /* In this case we might have to convert down from 4-byte native
1414 wchar_t to 2-byte unicode. */
1415 else if (maxchar < 65536) {
1416 assert(num_surrogates == 0 &&
1417 "FindMaxCharAndNumSurrogatePairs() messed up");
1418
Victor Stinner506f5922011-09-28 22:34:18 +02001419#if SIZEOF_WCHAR_T == 2
1420 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001421 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001422 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1423 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1424 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001425 _PyUnicode_UTF8(unicode) = NULL;
1426 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001427#else
1428 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001429 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001430 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001431 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001432 PyErr_NoMemory();
1433 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001434 }
Victor Stinner506f5922011-09-28 22:34:18 +02001435 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1436 _PyUnicode_WSTR(unicode), end,
1437 PyUnicode_2BYTE_DATA(unicode));
1438 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1439 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1440 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001441 _PyUnicode_UTF8(unicode) = NULL;
1442 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001443 PyObject_FREE(_PyUnicode_WSTR(unicode));
1444 _PyUnicode_WSTR(unicode) = NULL;
1445 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1446#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001447 }
1448 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1449 else {
1450#if SIZEOF_WCHAR_T == 2
1451 /* in case the native representation is 2-bytes, we need to allocate a
1452 new normalized 4-byte version. */
1453 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001454 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1455 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001456 PyErr_NoMemory();
1457 return -1;
1458 }
1459 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1460 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001461 _PyUnicode_UTF8(unicode) = NULL;
1462 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001463 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1464 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001465 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001466 PyObject_FREE(_PyUnicode_WSTR(unicode));
1467 _PyUnicode_WSTR(unicode) = NULL;
1468 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1469#else
1470 assert(num_surrogates == 0);
1471
Victor Stinnerc3c74152011-10-02 20:39:55 +02001472 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001473 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001474 _PyUnicode_UTF8(unicode) = NULL;
1475 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001476 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1477#endif
1478 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1479 }
1480 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001481 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001482 return 0;
1483}
1484
Alexander Belopolsky40018472011-02-26 01:02:56 +00001485static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001486unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001487{
Walter Dörwald16807132007-05-25 13:52:07 +00001488 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001489 case SSTATE_NOT_INTERNED:
1490 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001491
Benjamin Peterson29060642009-01-31 22:14:21 +00001492 case SSTATE_INTERNED_MORTAL:
1493 /* revive dead object temporarily for DelItem */
1494 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001495 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001496 Py_FatalError(
1497 "deletion of interned string failed");
1498 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001499
Benjamin Peterson29060642009-01-31 22:14:21 +00001500 case SSTATE_INTERNED_IMMORTAL:
1501 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001502
Benjamin Peterson29060642009-01-31 22:14:21 +00001503 default:
1504 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001505 }
1506
Victor Stinner03490912011-10-03 23:45:12 +02001507 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001508 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001509 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001510 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001511 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1512 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001513
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001514 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001515}
1516
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if `unicode` is the shared empty string or
   the cached object stored in unicode_latin1[] for its single character,
   and 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;
    /* Only a string of length 1 whose kind is not WCHAR is looked up. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return ch < 256 && unicode_latin1[ch] == unicode;
}
#endif
1533
Alexander Belopolsky40018472011-02-26 01:02:56 +00001534static int
Victor Stinner488fa492011-12-12 00:01:39 +01001535unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001536{
Victor Stinner488fa492011-12-12 00:01:39 +01001537 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001538 if (Py_REFCNT(unicode) != 1)
1539 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001540 if (_PyUnicode_HASH(unicode) != -1)
1541 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001542 if (PyUnicode_CHECK_INTERNED(unicode))
1543 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001544 if (!PyUnicode_CheckExact(unicode))
1545 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001546#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001547 /* singleton refcount is greater than 1 */
1548 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001549#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001550 return 1;
1551}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001552
Victor Stinnerfe226c02011-10-03 03:52:20 +02001553static int
1554unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1555{
1556 PyObject *unicode;
1557 Py_ssize_t old_length;
1558
1559 assert(p_unicode != NULL);
1560 unicode = *p_unicode;
1561
1562 assert(unicode != NULL);
1563 assert(PyUnicode_Check(unicode));
1564 assert(0 <= length);
1565
Victor Stinner910337b2011-10-03 03:20:16 +02001566 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001567 old_length = PyUnicode_WSTR_LENGTH(unicode);
1568 else
1569 old_length = PyUnicode_GET_LENGTH(unicode);
1570 if (old_length == length)
1571 return 0;
1572
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001573 if (length == 0) {
1574 Py_DECREF(*p_unicode);
1575 *p_unicode = unicode_empty;
1576 Py_INCREF(*p_unicode);
1577 return 0;
1578 }
1579
Victor Stinner488fa492011-12-12 00:01:39 +01001580 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001581 PyObject *copy = resize_copy(unicode, length);
1582 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001583 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001584 Py_DECREF(*p_unicode);
1585 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001586 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001587 }
1588
Victor Stinnerfe226c02011-10-03 03:52:20 +02001589 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001590 PyObject *new_unicode = resize_compact(unicode, length);
1591 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001592 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001593 *p_unicode = new_unicode;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001594 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001595 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001596 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001597 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001598}
1599
Alexander Belopolsky40018472011-02-26 01:02:56 +00001600int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001601PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001602{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001603 PyObject *unicode;
1604 if (p_unicode == NULL) {
1605 PyErr_BadInternalCall();
1606 return -1;
1607 }
1608 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001609 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001610 {
1611 PyErr_BadInternalCall();
1612 return -1;
1613 }
1614 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001615}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001616
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001617static int
Victor Stinner0a045ef2011-11-09 00:02:42 +01001618unicode_widen(PyObject **p_unicode, unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001619{
1620 PyObject *result;
1621 assert(PyUnicode_IS_READY(*p_unicode));
1622 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1623 return 0;
1624 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1625 maxchar);
1626 if (result == NULL)
1627 return -1;
1628 PyUnicode_CopyCharacters(result, 0, *p_unicode, 0,
1629 PyUnicode_GET_LENGTH(*p_unicode));
1630 Py_DECREF(*p_unicode);
1631 *p_unicode = result;
1632 return 0;
1633}
1634
1635static int
1636unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1637 Py_UCS4 ch)
1638{
Victor Stinner15e9ed22012-02-22 13:36:20 +01001639 assert(ch <= MAX_UNICODE);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001640 if (unicode_widen(p_unicode, ch) < 0)
1641 return -1;
1642 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1643 PyUnicode_DATA(*p_unicode),
1644 (*pos)++, ch);
1645 return 0;
1646}
1647
Victor Stinnerc5166102012-02-22 13:55:02 +01001648/* Copy a ASCII or latin1 char* string into a Python Unicode string.
1649 Return the length of the input string.
1650
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001651 WARNING: The function doesn't copy the terminating null character and
1652 doesn't check the maximum character (may write a latin1 character in an
1653 ASCII string). */
Victor Stinnerc5166102012-02-22 13:55:02 +01001654static Py_ssize_t
1655unicode_write_cstr(PyObject *unicode, Py_ssize_t index, const char *str)
1656{
1657 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1658 void *data = PyUnicode_DATA(unicode);
1659
1660 switch (kind) {
1661 case PyUnicode_1BYTE_KIND: {
1662 Py_ssize_t len = strlen(str);
1663 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001664 memcpy((char *) data + index, str, len);
Victor Stinnerc5166102012-02-22 13:55:02 +01001665 return len;
1666 }
1667 case PyUnicode_2BYTE_KIND: {
1668 Py_UCS2 *start = (Py_UCS2 *)data + index;
1669 Py_UCS2 *ucs2 = start;
1670 assert(index <= PyUnicode_GET_LENGTH(unicode));
1671
1672 for (; *str; ++ucs2, ++str)
1673 *ucs2 = (Py_UCS2)*str;
1674
1675 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
1676 return ucs2 - start;
1677 }
1678 default: {
1679 Py_UCS4 *start = (Py_UCS4 *)data + index;
1680 Py_UCS4 *ucs4 = start;
1681 assert(kind == PyUnicode_4BYTE_KIND);
1682 assert(index <= PyUnicode_GET_LENGTH(unicode));
1683
1684 for (; *str; ++ucs4, ++str)
1685 *ucs4 = (Py_UCS4)*str;
1686
1687 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
1688 return ucs4 - start;
1689 }
1690 }
1691}
1692
1693
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001694static PyObject*
1695get_latin1_char(unsigned char ch)
1696{
Victor Stinnera464fc12011-10-02 20:39:30 +02001697 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001698 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001699 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001700 if (!unicode)
1701 return NULL;
1702 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001703 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001704 unicode_latin1[ch] = unicode;
1705 }
1706 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001707 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001708}
1709
Alexander Belopolsky40018472011-02-26 01:02:56 +00001710PyObject *
1711PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001712{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001713 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001714 Py_UCS4 maxchar = 0;
1715 Py_ssize_t num_surrogates;
1716
1717 if (u == NULL)
1718 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001719
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001720 /* If the Unicode data is known at construction time, we can apply
1721 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001722
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001723 /* Optimization for empty strings */
1724 if (size == 0 && unicode_empty != NULL) {
1725 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001726 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001727 }
Tim Petersced69f82003-09-16 20:30:58 +00001728
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001729 /* Single character Unicode objects in the Latin-1 range are
1730 shared when using this constructor */
1731 if (size == 1 && *u < 256)
1732 return get_latin1_char((unsigned char)*u);
1733
1734 /* If not empty and not single character, copy the Unicode data
1735 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001736 if (find_maxchar_surrogates(u, u + size,
1737 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001738 return NULL;
1739
Victor Stinner8faf8212011-12-08 22:14:11 +01001740 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001741 if (!unicode)
1742 return NULL;
1743
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001744 switch (PyUnicode_KIND(unicode)) {
1745 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001746 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001747 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1748 break;
1749 case PyUnicode_2BYTE_KIND:
1750#if Py_UNICODE_SIZE == 2
1751 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1752#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001753 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001754 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1755#endif
1756 break;
1757 case PyUnicode_4BYTE_KIND:
1758#if SIZEOF_WCHAR_T == 2
1759 /* This is the only case which has to process surrogates, thus
1760 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001761 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001762#else
1763 assert(num_surrogates == 0);
1764 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1765#endif
1766 break;
1767 default:
1768 assert(0 && "Impossible state");
1769 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001770
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001771 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001772}
1773
Alexander Belopolsky40018472011-02-26 01:02:56 +00001774PyObject *
1775PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001776{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001777 if (size < 0) {
1778 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001779 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001780 return NULL;
1781 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001782 if (u != NULL)
1783 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1784 else
1785 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001786}
1787
Alexander Belopolsky40018472011-02-26 01:02:56 +00001788PyObject *
1789PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001790{
1791 size_t size = strlen(u);
1792 if (size > PY_SSIZE_T_MAX) {
1793 PyErr_SetString(PyExc_OverflowError, "input too long");
1794 return NULL;
1795 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001796 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001797}
1798
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001799PyObject *
1800_PyUnicode_FromId(_Py_Identifier *id)
1801{
1802 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001803 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1804 strlen(id->string),
1805 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001806 if (!id->object)
1807 return NULL;
1808 PyUnicode_InternInPlace(&id->object);
1809 assert(!id->next);
1810 id->next = static_strings;
1811 static_strings = id;
1812 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001813 return id->object;
1814}
1815
1816void
1817_PyUnicode_ClearStaticStrings()
1818{
1819 _Py_Identifier *i;
1820 for (i = static_strings; i; i = i->next) {
1821 Py_DECREF(i->object);
1822 i->object = NULL;
1823 i->next = NULL;
1824 }
1825}
1826
Benjamin Peterson0df54292012-03-26 14:50:32 -04001827/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001828
Victor Stinnere57b1c02011-09-28 22:20:48 +02001829static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001830unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001831{
Victor Stinner785938e2011-12-11 20:09:03 +01001832 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001833 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001834#ifdef Py_DEBUG
Victor Stinnere6b2d442011-12-11 21:54:30 +01001835 assert(s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001836#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001837 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001838 }
Victor Stinner785938e2011-12-11 20:09:03 +01001839 unicode = PyUnicode_New(size, 127);
1840 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001841 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001842 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1843 assert(_PyUnicode_CheckConsistency(unicode, 1));
1844 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001845}
1846
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001847static Py_UCS4
1848kind_maxchar_limit(unsigned int kind)
1849{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001850 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001851 case PyUnicode_1BYTE_KIND:
1852 return 0x80;
1853 case PyUnicode_2BYTE_KIND:
1854 return 0x100;
1855 case PyUnicode_4BYTE_KIND:
1856 return 0x10000;
1857 default:
1858 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001859 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001860 }
1861}
1862
Victor Stinner702c7342011-10-05 13:50:52 +02001863static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001864_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001865{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001866 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001867 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001868
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001869 if (size == 0) {
1870 Py_INCREF(unicode_empty);
1871 return unicode_empty;
1872 }
1873 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001874 if (size == 1)
1875 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001876
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001877 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001878 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001879 if (!res)
1880 return NULL;
1881 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001882 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001883 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001884}
1885
Victor Stinnere57b1c02011-09-28 22:20:48 +02001886static PyObject*
1887_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001888{
1889 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001890 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001891
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001892 if (size == 0) {
1893 Py_INCREF(unicode_empty);
1894 return unicode_empty;
1895 }
1896 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001897 if (size == 1 && u[0] < 256)
Victor Stinner4e101002011-10-11 23:27:52 +02001898 return get_latin1_char((unsigned char)u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001899
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001900 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001901 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001902 if (!res)
1903 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001904 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001905 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001906 else {
1907 _PyUnicode_CONVERT_BYTES(
1908 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1909 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001910 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001911 return res;
1912}
1913
Victor Stinnere57b1c02011-09-28 22:20:48 +02001914static PyObject*
1915_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001916{
1917 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001918 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001919
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001920 if (size == 0) {
1921 Py_INCREF(unicode_empty);
1922 return unicode_empty;
1923 }
1924 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001925 if (size == 1 && u[0] < 256)
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001926 return get_latin1_char((unsigned char)u[0]);
1927
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001928 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001929 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001930 if (!res)
1931 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001932 if (max_char < 256)
1933 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1934 PyUnicode_1BYTE_DATA(res));
1935 else if (max_char < 0x10000)
1936 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1937 PyUnicode_2BYTE_DATA(res));
1938 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001939 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001940 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001941 return res;
1942}
1943
1944PyObject*
1945PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1946{
Victor Stinnercfed46e2011-11-22 01:29:14 +01001947 if (size < 0) {
1948 PyErr_SetString(PyExc_ValueError, "size must be positive");
1949 return NULL;
1950 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06001951 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001952 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001953 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001954 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001955 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001956 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001957 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001958 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02001959 PyErr_SetString(PyExc_SystemError, "invalid kind");
1960 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001961 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001962}
1963
Victor Stinnerece58de2012-04-23 23:36:38 +02001964Py_UCS4
1965_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
1966{
1967 enum PyUnicode_Kind kind;
1968 void *startptr, *endptr;
1969
1970 assert(PyUnicode_IS_READY(unicode));
1971 assert(0 <= start);
1972 assert(end <= PyUnicode_GET_LENGTH(unicode));
1973 assert(start <= end);
1974
1975 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
1976 return PyUnicode_MAX_CHAR_VALUE(unicode);
1977
1978 if (start == end)
1979 return 127;
1980
1981 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04001982 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04001983 endptr = (char *)startptr + end * kind;
1984 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04001985 switch(kind) {
1986 case PyUnicode_1BYTE_KIND:
1987 return ucs1lib_find_max_char(startptr, endptr);
1988 case PyUnicode_2BYTE_KIND:
1989 return ucs2lib_find_max_char(startptr, endptr);
1990 case PyUnicode_4BYTE_KIND:
1991 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02001992 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04001993 assert(0);
1994 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02001995 }
1996}
1997
Victor Stinner25a4b292011-10-06 12:31:55 +02001998/* Ensure that a string uses the most efficient storage, if it is not the
1999 case: create a new string with of the right kind. Write NULL into *p_unicode
2000 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002001static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002002unicode_adjust_maxchar(PyObject **p_unicode)
2003{
2004 PyObject *unicode, *copy;
2005 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002006 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002007 unsigned int kind;
2008
2009 assert(p_unicode != NULL);
2010 unicode = *p_unicode;
2011 assert(PyUnicode_IS_READY(unicode));
2012 if (PyUnicode_IS_ASCII(unicode))
2013 return;
2014
2015 len = PyUnicode_GET_LENGTH(unicode);
2016 kind = PyUnicode_KIND(unicode);
2017 if (kind == PyUnicode_1BYTE_KIND) {
2018 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002019 max_char = ucs1lib_find_max_char(u, u + len);
2020 if (max_char >= 128)
2021 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002022 }
2023 else if (kind == PyUnicode_2BYTE_KIND) {
2024 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002025 max_char = ucs2lib_find_max_char(u, u + len);
2026 if (max_char >= 256)
2027 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002028 }
2029 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002030 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002031 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002032 max_char = ucs4lib_find_max_char(u, u + len);
2033 if (max_char >= 0x10000)
2034 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002035 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002036 copy = PyUnicode_New(len, max_char);
2037 copy_characters(copy, 0, unicode, 0, len);
2038 Py_DECREF(unicode);
2039 *p_unicode = copy;
2040}
2041
Victor Stinner034f6cf2011-09-30 02:26:44 +02002042PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002043_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002044{
Victor Stinner87af4f22011-11-21 23:03:47 +01002045 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002046 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002047
Victor Stinner034f6cf2011-09-30 02:26:44 +02002048 if (!PyUnicode_Check(unicode)) {
2049 PyErr_BadInternalCall();
2050 return NULL;
2051 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002052 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002053 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002054
Victor Stinner87af4f22011-11-21 23:03:47 +01002055 length = PyUnicode_GET_LENGTH(unicode);
2056 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002057 if (!copy)
2058 return NULL;
2059 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2060
Victor Stinner87af4f22011-11-21 23:03:47 +01002061 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2062 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002063 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002064 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002065}
2066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002067
Victor Stinnerbc603d12011-10-02 01:00:40 +02002068/* Widen Unicode objects to larger buffers. Don't write terminating null
2069 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002070
2071void*
2072_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2073{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002074 Py_ssize_t len;
2075 void *result;
2076 unsigned int skind;
2077
Benjamin Petersonbac79492012-01-14 13:34:47 -05002078 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002079 return NULL;
2080
2081 len = PyUnicode_GET_LENGTH(s);
2082 skind = PyUnicode_KIND(s);
2083 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002084 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002085 return NULL;
2086 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002087 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002088 case PyUnicode_2BYTE_KIND:
2089 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2090 if (!result)
2091 return PyErr_NoMemory();
2092 assert(skind == PyUnicode_1BYTE_KIND);
2093 _PyUnicode_CONVERT_BYTES(
2094 Py_UCS1, Py_UCS2,
2095 PyUnicode_1BYTE_DATA(s),
2096 PyUnicode_1BYTE_DATA(s) + len,
2097 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002098 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002099 case PyUnicode_4BYTE_KIND:
2100 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2101 if (!result)
2102 return PyErr_NoMemory();
2103 if (skind == PyUnicode_2BYTE_KIND) {
2104 _PyUnicode_CONVERT_BYTES(
2105 Py_UCS2, Py_UCS4,
2106 PyUnicode_2BYTE_DATA(s),
2107 PyUnicode_2BYTE_DATA(s) + len,
2108 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002109 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002110 else {
2111 assert(skind == PyUnicode_1BYTE_KIND);
2112 _PyUnicode_CONVERT_BYTES(
2113 Py_UCS1, Py_UCS4,
2114 PyUnicode_1BYTE_DATA(s),
2115 PyUnicode_1BYTE_DATA(s) + len,
2116 result);
2117 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002118 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002119 default:
2120 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002121 }
Victor Stinner01698042011-10-04 00:04:26 +02002122 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002123 return NULL;
2124}
2125
2126static Py_UCS4*
2127as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2128 int copy_null)
2129{
2130 int kind;
2131 void *data;
2132 Py_ssize_t len, targetlen;
2133 if (PyUnicode_READY(string) == -1)
2134 return NULL;
2135 kind = PyUnicode_KIND(string);
2136 data = PyUnicode_DATA(string);
2137 len = PyUnicode_GET_LENGTH(string);
2138 targetlen = len;
2139 if (copy_null)
2140 targetlen++;
2141 if (!target) {
2142 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2143 PyErr_NoMemory();
2144 return NULL;
2145 }
2146 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2147 if (!target) {
2148 PyErr_NoMemory();
2149 return NULL;
2150 }
2151 }
2152 else {
2153 if (targetsize < targetlen) {
2154 PyErr_Format(PyExc_SystemError,
2155 "string is longer than the buffer");
2156 if (copy_null && 0 < targetsize)
2157 target[0] = 0;
2158 return NULL;
2159 }
2160 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002161 if (kind == PyUnicode_1BYTE_KIND) {
2162 Py_UCS1 *start = (Py_UCS1 *) data;
2163 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002164 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002165 else if (kind == PyUnicode_2BYTE_KIND) {
2166 Py_UCS2 *start = (Py_UCS2 *) data;
2167 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2168 }
2169 else {
2170 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002171 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002172 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002173 if (copy_null)
2174 target[len] = 0;
2175 return target;
2176}
2177
2178Py_UCS4*
2179PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2180 int copy_null)
2181{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002182 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002183 PyErr_BadInternalCall();
2184 return NULL;
2185 }
2186 return as_ucs4(string, target, targetsize, copy_null);
2187}
2188
2189Py_UCS4*
2190PyUnicode_AsUCS4Copy(PyObject *string)
2191{
2192 return as_ucs4(string, NULL, 0, 1);
2193}
2194
2195#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002196
Alexander Belopolsky40018472011-02-26 01:02:56 +00002197PyObject *
2198PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002199{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002200 if (w == NULL) {
Victor Stinner382955f2011-12-11 21:44:00 +01002201 if (size == 0) {
2202 Py_INCREF(unicode_empty);
2203 return unicode_empty;
2204 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002205 PyErr_BadInternalCall();
2206 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002207 }
2208
Martin v. Löwis790465f2008-04-05 20:41:37 +00002209 if (size == -1) {
2210 size = wcslen(w);
2211 }
2212
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002213 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002214}
2215
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002216#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002217
Walter Dörwald346737f2007-05-31 10:44:43 +00002218static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002219makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2220 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002221{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002222 *fmt++ = '%';
2223 if (width) {
2224 if (zeropad)
2225 *fmt++ = '0';
2226 fmt += sprintf(fmt, "%d", width);
2227 }
2228 if (precision)
2229 fmt += sprintf(fmt, ".%d", precision);
2230 if (longflag)
2231 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002232 else if (longlongflag) {
2233 /* longlongflag should only ever be nonzero on machines with
2234 HAVE_LONG_LONG defined */
2235#ifdef HAVE_LONG_LONG
2236 char *f = PY_FORMAT_LONG_LONG;
2237 while (*f)
2238 *fmt++ = *f++;
2239#else
2240 /* we shouldn't ever get here */
2241 assert(0);
2242 *fmt++ = 'l';
2243#endif
2244 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002245 else if (size_tflag) {
2246 char *f = PY_FORMAT_SIZE_T;
2247 while (*f)
2248 *fmt++ = *f++;
2249 }
2250 *fmt++ = c;
2251 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002252}
2253
/* Helper for PyUnicode_FromFormatV(): parse the optional "width.precision"
   part and the length modifier of one conversion specification.

   f must point to the '%' that starts the specification.  The return value
   points to the conversion character (e.g. the 'd' of "%5.3ld"), with these
   special cases:
     - "%.3%s": the '%' after the precision is not consumed, the result
       points to the last precision digit;
     - a specification cut short by the end of the string ("%.1") yields a
       pointer to the last character before the terminating '\0'.

   Each of the output pointers may be NULL if the caller is not interested:
     *p_width, *p_precision  -- parsed decimal values, 0 when absent.  Values
                                too large for an int saturate at INT_MAX
                                instead of overflowing.
     *p_longflag             -- 1 for "l"  followed by d, u or i
     *p_longlongflag         -- 1 for "ll" followed by d, u or i
                                (only with HAVE_LONG_LONG)
     *p_size_tflag           -- 1 for "z"  followed by d, u or i */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width, precision, longflag, longlongflag, size_tflag;
    int digit;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5.
       Only the ASCII digits '0'..'9' count (locale independent). */
    f++;
    width = 0;
    while ('0' <= *f && *f <= '9') {
        digit = *f++ - '0';
        /* saturate rather than overflow a signed int */
        if (width > (INT_MAX - digit) / 10)
            width = INT_MAX;
        else
            width = width * 10 + digit;
    }
    precision = 0;
    if (*f == '.') {
        f++;
        while ('0' <= *f && *f <= '9') {
            digit = *f++ - '0';
            if (precision > (INT_MAX - digit) / 10)
                precision = INT_MAX;
            else
                precision = precision * 10 + digit;
        }
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;

    /* Handle %ld, %lu, %lld and %llu. */
    longflag = 0;
    longlongflag = 0;
    size_tflag = 0;

    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
    }
    /* handle the size_t flag. */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        size_tflag = 1;
        ++f;
    }
    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
2318
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002319/* maximum number of characters required for output of %ld. 21 characters
2320 allows for 64-bit integers (in decimal) and an optional sign. */
2321#define MAX_LONG_CHARS 21
2322/* maximum number of characters required for output of %lld.
2323 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2324 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2325#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2326
Walter Dörwaldd2034312007-05-18 16:29:38 +00002327PyObject *
2328PyUnicode_FromFormatV(const char *format, va_list vargs)
2329{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002330 va_list count;
2331 Py_ssize_t callcount = 0;
2332 PyObject **callresults = NULL;
2333 PyObject **callresult = NULL;
2334 Py_ssize_t n = 0;
2335 int width = 0;
2336 int precision = 0;
2337 int zeropad;
2338 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002339 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002340 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002341 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002342 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2343 Py_UCS4 argmaxchar;
2344 Py_ssize_t numbersize = 0;
2345 char *numberresults = NULL;
2346 char *numberresult = NULL;
2347 Py_ssize_t i;
2348 int kind;
2349 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002350
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002351 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002352 /* step 1: count the number of %S/%R/%A/%s format specifications
2353 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2354 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002355 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002356 * also estimate a upper bound for all the number formats in the string,
2357 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002358 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002359 for (f = format; *f; f++) {
2360 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002361 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002362 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2363 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2364 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2365 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002366
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002367 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002368#ifdef HAVE_LONG_LONG
2369 if (longlongflag) {
2370 if (width < MAX_LONG_LONG_CHARS)
2371 width = MAX_LONG_LONG_CHARS;
2372 }
2373 else
2374#endif
2375 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2376 including sign. Decimal takes the most space. This
2377 isn't enough for octal. If a width is specified we
2378 need more (which we allocate later). */
2379 if (width < MAX_LONG_CHARS)
2380 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002381
2382 /* account for the size + '\0' to separate numbers
2383 inside of the numberresults buffer */
2384 numbersize += (width + 1);
2385 }
2386 }
2387 else if ((unsigned char)*f > 127) {
2388 PyErr_Format(PyExc_ValueError,
2389 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2390 "string, got a non-ASCII byte: 0x%02x",
2391 (unsigned char)*f);
2392 return NULL;
2393 }
2394 }
2395 /* step 2: allocate memory for the results of
2396 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2397 if (callcount) {
2398 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2399 if (!callresults) {
2400 PyErr_NoMemory();
2401 return NULL;
2402 }
2403 callresult = callresults;
2404 }
2405 /* step 2.5: allocate memory for the results of formating numbers */
2406 if (numbersize) {
2407 numberresults = PyObject_Malloc(numbersize);
2408 if (!numberresults) {
2409 PyErr_NoMemory();
2410 goto fail;
2411 }
2412 numberresult = numberresults;
2413 }
2414
2415 /* step 3: format numbers and figure out how large a buffer we need */
2416 for (f = format; *f; f++) {
2417 if (*f == '%') {
2418 const char* p;
2419 int longflag;
2420 int longlongflag;
2421 int size_tflag;
2422 int numprinted;
2423
2424 p = f;
2425 zeropad = (f[1] == '0');
2426 f = parse_format_flags(f, &width, &precision,
2427 &longflag, &longlongflag, &size_tflag);
2428 switch (*f) {
2429 case 'c':
2430 {
2431 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002432 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002433 n++;
2434 break;
2435 }
2436 case '%':
2437 n++;
2438 break;
2439 case 'i':
2440 case 'd':
2441 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2442 width, precision, *f);
2443 if (longflag)
2444 numprinted = sprintf(numberresult, fmt,
2445 va_arg(count, long));
2446#ifdef HAVE_LONG_LONG
2447 else if (longlongflag)
2448 numprinted = sprintf(numberresult, fmt,
2449 va_arg(count, PY_LONG_LONG));
2450#endif
2451 else if (size_tflag)
2452 numprinted = sprintf(numberresult, fmt,
2453 va_arg(count, Py_ssize_t));
2454 else
2455 numprinted = sprintf(numberresult, fmt,
2456 va_arg(count, int));
2457 n += numprinted;
2458 /* advance by +1 to skip over the '\0' */
2459 numberresult += (numprinted + 1);
2460 assert(*(numberresult - 1) == '\0');
2461 assert(*(numberresult - 2) != '\0');
2462 assert(numprinted >= 0);
2463 assert(numberresult <= numberresults + numbersize);
2464 break;
2465 case 'u':
2466 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2467 width, precision, 'u');
2468 if (longflag)
2469 numprinted = sprintf(numberresult, fmt,
2470 va_arg(count, unsigned long));
2471#ifdef HAVE_LONG_LONG
2472 else if (longlongflag)
2473 numprinted = sprintf(numberresult, fmt,
2474 va_arg(count, unsigned PY_LONG_LONG));
2475#endif
2476 else if (size_tflag)
2477 numprinted = sprintf(numberresult, fmt,
2478 va_arg(count, size_t));
2479 else
2480 numprinted = sprintf(numberresult, fmt,
2481 va_arg(count, unsigned int));
2482 n += numprinted;
2483 numberresult += (numprinted + 1);
2484 assert(*(numberresult - 1) == '\0');
2485 assert(*(numberresult - 2) != '\0');
2486 assert(numprinted >= 0);
2487 assert(numberresult <= numberresults + numbersize);
2488 break;
2489 case 'x':
2490 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2491 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2492 n += numprinted;
2493 numberresult += (numprinted + 1);
2494 assert(*(numberresult - 1) == '\0');
2495 assert(*(numberresult - 2) != '\0');
2496 assert(numprinted >= 0);
2497 assert(numberresult <= numberresults + numbersize);
2498 break;
2499 case 'p':
2500 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2501 /* %p is ill-defined: ensure leading 0x. */
2502 if (numberresult[1] == 'X')
2503 numberresult[1] = 'x';
2504 else if (numberresult[1] != 'x') {
2505 memmove(numberresult + 2, numberresult,
2506 strlen(numberresult) + 1);
2507 numberresult[0] = '0';
2508 numberresult[1] = 'x';
2509 numprinted += 2;
2510 }
2511 n += numprinted;
2512 numberresult += (numprinted + 1);
2513 assert(*(numberresult - 1) == '\0');
2514 assert(*(numberresult - 2) != '\0');
2515 assert(numprinted >= 0);
2516 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002517 break;
2518 case 's':
2519 {
2520 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002521 const char *s = va_arg(count, const char*);
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002522 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002523 if (!str)
2524 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002525 /* since PyUnicode_DecodeUTF8 returns already flexible
2526 unicode objects, there is no need to call ready on them */
2527 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002528 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002529 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002530 /* Remember the str and switch to the next slot */
2531 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002532 break;
2533 }
2534 case 'U':
2535 {
2536 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002537 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002538 if (PyUnicode_READY(obj) == -1)
2539 goto fail;
2540 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002541 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002542 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002543 break;
2544 }
2545 case 'V':
2546 {
2547 PyObject *obj = va_arg(count, PyObject *);
2548 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002549 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002550 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002551 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002552 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002553 if (PyUnicode_READY(obj) == -1)
2554 goto fail;
2555 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002556 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002557 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002558 *callresult++ = NULL;
2559 }
2560 else {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002561 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002562 if (!str_obj)
2563 goto fail;
Benjamin Petersonbac79492012-01-14 13:34:47 -05002564 if (PyUnicode_READY(str_obj) == -1) {
Victor Stinnere1335c72011-10-04 20:53:03 +02002565 Py_DECREF(str_obj);
2566 goto fail;
2567 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002568 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002569 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002570 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002571 *callresult++ = str_obj;
2572 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002573 break;
2574 }
2575 case 'S':
2576 {
2577 PyObject *obj = va_arg(count, PyObject *);
2578 PyObject *str;
2579 assert(obj);
2580 str = PyObject_Str(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002581 if (!str)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002582 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002583 if (PyUnicode_READY(str) == -1) {
2584 Py_DECREF(str);
2585 goto fail;
2586 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002587 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002588 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002589 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002590 /* Remember the str and switch to the next slot */
2591 *callresult++ = str;
2592 break;
2593 }
2594 case 'R':
2595 {
2596 PyObject *obj = va_arg(count, PyObject *);
2597 PyObject *repr;
2598 assert(obj);
2599 repr = PyObject_Repr(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002600 if (!repr)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002601 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002602 if (PyUnicode_READY(repr) == -1) {
2603 Py_DECREF(repr);
2604 goto fail;
2605 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002606 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002607 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002608 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002609 /* Remember the repr and switch to the next slot */
2610 *callresult++ = repr;
2611 break;
2612 }
2613 case 'A':
2614 {
2615 PyObject *obj = va_arg(count, PyObject *);
2616 PyObject *ascii;
2617 assert(obj);
2618 ascii = PyObject_ASCII(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002619 if (!ascii)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002620 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002621 if (PyUnicode_READY(ascii) == -1) {
2622 Py_DECREF(ascii);
2623 goto fail;
2624 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002625 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002626 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002627 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002628 /* Remember the repr and switch to the next slot */
2629 *callresult++ = ascii;
2630 break;
2631 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002632 default:
2633 /* if we stumble upon an unknown
2634 formatting code, copy the rest of
2635 the format string to the output
2636 string. (we cannot just skip the
2637 code, since there's no way to know
2638 what's in the argument list) */
2639 n += strlen(p);
2640 goto expand;
2641 }
2642 } else
2643 n++;
2644 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002645 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002646 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002647 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002648 we don't have to resize the string.
2649 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002650 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002651 if (!string)
2652 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002653 kind = PyUnicode_KIND(string);
2654 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002655 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002656 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002657
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002658 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002659 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002660 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002661
2662 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002663 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2664 /* checking for == because the last argument could be a empty
2665 string, which causes i to point to end, the assert at the end of
2666 the loop */
2667 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002668
Benjamin Peterson14339b62009-01-31 16:36:08 +00002669 switch (*f) {
2670 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002671 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002672 const int ordinal = va_arg(vargs, int);
2673 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002674 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002675 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002676 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002677 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002678 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002679 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002680 case 'p':
Victor Stinnerc5166102012-02-22 13:55:02 +01002681 {
2682 Py_ssize_t written;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002683 /* unused, since we already have the result */
2684 if (*f == 'p')
2685 (void) va_arg(vargs, void *);
2686 else
2687 (void) va_arg(vargs, int);
2688 /* extract the result from numberresults and append. */
Victor Stinnerc5166102012-02-22 13:55:02 +01002689 written = unicode_write_cstr(string, i, numberresult);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002690 /* skip over the separating '\0' */
Victor Stinnerc5166102012-02-22 13:55:02 +01002691 i += written;
2692 numberresult += written;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002693 assert(*numberresult == '\0');
2694 numberresult++;
2695 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002696 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002697 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002698 case 's':
2699 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002700 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002701 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002702 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002703 size = PyUnicode_GET_LENGTH(*callresult);
2704 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002705 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002706 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002707 /* We're done with the unicode()/repr() => forget it */
2708 Py_DECREF(*callresult);
2709 /* switch to next unicode()/repr() result */
2710 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002711 break;
2712 }
2713 case 'U':
2714 {
2715 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002716 Py_ssize_t size;
2717 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2718 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002719 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002720 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002721 break;
2722 }
2723 case 'V':
2724 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002725 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002726 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002727 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002728 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002729 size = PyUnicode_GET_LENGTH(obj);
2730 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002731 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002732 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002733 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002734 size = PyUnicode_GET_LENGTH(*callresult);
2735 assert(PyUnicode_KIND(*callresult) <=
2736 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002737 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002738 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002739 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002740 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002741 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002742 break;
2743 }
2744 case 'S':
2745 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002746 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002747 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002748 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002749 /* unused, since we already have the result */
2750 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002751 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002752 copy_characters(string, i, *callresult, 0, size);
2753 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002754 /* We're done with the unicode()/repr() => forget it */
2755 Py_DECREF(*callresult);
2756 /* switch to next unicode()/repr() result */
2757 ++callresult;
2758 break;
2759 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002760 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002761 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002762 break;
2763 default:
Victor Stinnerc5166102012-02-22 13:55:02 +01002764 i += unicode_write_cstr(string, i, p);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002765 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002766 goto end;
2767 }
Victor Stinner1205f272010-09-11 00:54:47 +00002768 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002769 else {
2770 assert(i < PyUnicode_GET_LENGTH(string));
2771 PyUnicode_WRITE(kind, data, i++, *f);
2772 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002773 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002774 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002775
Benjamin Peterson29060642009-01-31 22:14:21 +00002776 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002777 if (callresults)
2778 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002779 if (numberresults)
2780 PyObject_Free(numberresults);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002781 return unicode_result(string);
Benjamin Peterson29060642009-01-31 22:14:21 +00002782 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002783 if (callresults) {
2784 PyObject **callresult2 = callresults;
2785 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002786 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002787 ++callresult2;
2788 }
2789 PyObject_Free(callresults);
2790 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002791 if (numberresults)
2792 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002793 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002794}
2795
Walter Dörwaldd2034312007-05-18 16:29:38 +00002796PyObject *
2797PyUnicode_FromFormat(const char *format, ...)
2798{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002799 PyObject* ret;
2800 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002801
2802#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002803 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002804#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002805 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002806#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002807 ret = PyUnicode_FromFormatV(format, vargs);
2808 va_end(vargs);
2809 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002810}
2811
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002812#ifdef HAVE_WCHAR_H
2813
Victor Stinner5593d8a2010-10-02 11:11:27 +00002814/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2815 convert a Unicode object to a wide character string.
2816
Victor Stinnerd88d9832011-09-06 02:00:05 +02002817 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002818 character) required to convert the unicode object. Ignore size argument.
2819
Victor Stinnerd88d9832011-09-06 02:00:05 +02002820 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002821 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002822 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002823static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002824unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002825 wchar_t *w,
2826 Py_ssize_t size)
2827{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002828 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002829 const wchar_t *wstr;
2830
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002831 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002832 if (wstr == NULL)
2833 return -1;
2834
Victor Stinner5593d8a2010-10-02 11:11:27 +00002835 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002836 if (size > res)
2837 size = res + 1;
2838 else
2839 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002840 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002841 return res;
2842 }
2843 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002844 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002845}
2846
2847Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002848PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002849 wchar_t *w,
2850 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002851{
2852 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002853 PyErr_BadInternalCall();
2854 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002855 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002856 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002857}
2858
Victor Stinner137c34c2010-09-29 10:25:54 +00002859wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002860PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002861 Py_ssize_t *size)
2862{
2863 wchar_t* buffer;
2864 Py_ssize_t buflen;
2865
2866 if (unicode == NULL) {
2867 PyErr_BadInternalCall();
2868 return NULL;
2869 }
2870
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002871 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002872 if (buflen == -1)
2873 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002874 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002875 PyErr_NoMemory();
2876 return NULL;
2877 }
2878
Victor Stinner137c34c2010-09-29 10:25:54 +00002879 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2880 if (buffer == NULL) {
2881 PyErr_NoMemory();
2882 return NULL;
2883 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002884 buflen = unicode_aswidechar(unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002885 if (buflen == -1)
2886 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002887 if (size != NULL)
2888 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002889 return buffer;
2890}
2891
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002892#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002893
Alexander Belopolsky40018472011-02-26 01:02:56 +00002894PyObject *
2895PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002896{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002897 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002898 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002899 PyErr_SetString(PyExc_ValueError,
2900 "chr() arg not in range(0x110000)");
2901 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002902 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002903
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002904 if (ordinal < 256)
2905 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002906
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002907 v = PyUnicode_New(1, ordinal);
2908 if (v == NULL)
2909 return NULL;
2910 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002911 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002912 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002913}
2914
Alexander Belopolsky40018472011-02-26 01:02:56 +00002915PyObject *
2916PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002917{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002918 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002919 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002920 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002921 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002922 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002923 Py_INCREF(obj);
2924 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002925 }
2926 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002927 /* For a Unicode subtype that's not a Unicode object,
2928 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002929 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002930 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002931 PyErr_Format(PyExc_TypeError,
2932 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002933 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002934 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002935}
2936
Alexander Belopolsky40018472011-02-26 01:02:56 +00002937PyObject *
2938PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002939 const char *encoding,
2940 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002941{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002942 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002943 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002944
Guido van Rossumd57fd912000-03-10 22:53:23 +00002945 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002946 PyErr_BadInternalCall();
2947 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002948 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002949
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002950 /* Decoding bytes objects is the most common case and should be fast */
2951 if (PyBytes_Check(obj)) {
2952 if (PyBytes_GET_SIZE(obj) == 0) {
2953 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002954 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002955 }
2956 else {
2957 v = PyUnicode_Decode(
2958 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2959 encoding, errors);
2960 }
2961 return v;
2962 }
2963
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002964 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002965 PyErr_SetString(PyExc_TypeError,
2966 "decoding str is not supported");
2967 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002968 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002969
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002970 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2971 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2972 PyErr_Format(PyExc_TypeError,
2973 "coercing to str: need bytes, bytearray "
2974 "or buffer-like object, %.80s found",
2975 Py_TYPE(obj)->tp_name);
2976 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002977 }
Tim Petersced69f82003-09-16 20:30:58 +00002978
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002979 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002980 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002981 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002982 }
Tim Petersced69f82003-09-16 20:30:58 +00002983 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002984 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002985
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002986 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002987 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002988}
2989
/* Write a normalized copy of an encoding name into lower: upper case
   letters become lower case and '_' becomes '-', so that e.g. UTF_8 is
   recognized.  A NULL encoding is normalized to "utf-8".

   lower_len is the capacity of lower, terminating null included.
   Return 1 on success, 0 if the name does not fit in lower_len-1
   characters. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst;
    char *last;

    if (encoding == NULL) {
        strcpy(lower, "utf-8");
        return 1;
    }

    /* last is the slot reserved for the terminating null. */
    last = &lower[lower_len - 1];
    dst = lower;
    for (src = encoding; *src != '\0'; src++) {
        if (dst == last)
            return 0;
        if (Py_ISUPPER(*src))
            *dst = Py_TOLOWER(*src);
        else if (*src == '_')
            *dst = '-';
        else
            *dst = *src;
        dst++;
    }
    *dst = '\0';
    return 1;
}
3026
Alexander Belopolsky40018472011-02-26 01:02:56 +00003027PyObject *
3028PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003029 Py_ssize_t size,
3030 const char *encoding,
3031 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003032{
3033 PyObject *buffer = NULL, *unicode;
3034 Py_buffer info;
3035 char lower[11]; /* Enough for any encoding shortcut */
3036
Fred Drakee4315f52000-05-09 19:53:39 +00003037 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003038 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003039 if ((strcmp(lower, "utf-8") == 0) ||
3040 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003041 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00003042 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003043 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003044 (strcmp(lower, "iso-8859-1") == 0))
3045 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003046#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003047 else if (strcmp(lower, "mbcs") == 0)
3048 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003049#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003050 else if (strcmp(lower, "ascii") == 0)
3051 return PyUnicode_DecodeASCII(s, size, errors);
3052 else if (strcmp(lower, "utf-16") == 0)
3053 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3054 else if (strcmp(lower, "utf-32") == 0)
3055 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3056 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003057
3058 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003059 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003060 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003061 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003062 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003063 if (buffer == NULL)
3064 goto onError;
3065 unicode = PyCodec_Decode(buffer, encoding, errors);
3066 if (unicode == NULL)
3067 goto onError;
3068 if (!PyUnicode_Check(unicode)) {
3069 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003070 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00003071 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003072 Py_DECREF(unicode);
3073 goto onError;
3074 }
3075 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003076 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003077
Benjamin Peterson29060642009-01-31 22:14:21 +00003078 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003079 Py_XDECREF(buffer);
3080 return NULL;
3081}
3082
Alexander Belopolsky40018472011-02-26 01:02:56 +00003083PyObject *
3084PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003085 const char *encoding,
3086 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003087{
3088 PyObject *v;
3089
3090 if (!PyUnicode_Check(unicode)) {
3091 PyErr_BadArgument();
3092 goto onError;
3093 }
3094
3095 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003096 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003097
3098 /* Decode via the codec registry */
3099 v = PyCodec_Decode(unicode, encoding, errors);
3100 if (v == NULL)
3101 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003102 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003103
Benjamin Peterson29060642009-01-31 22:14:21 +00003104 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003105 return NULL;
3106}
3107
Alexander Belopolsky40018472011-02-26 01:02:56 +00003108PyObject *
3109PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003110 const char *encoding,
3111 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003112{
3113 PyObject *v;
3114
3115 if (!PyUnicode_Check(unicode)) {
3116 PyErr_BadArgument();
3117 goto onError;
3118 }
3119
3120 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003121 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003122
3123 /* Decode via the codec registry */
3124 v = PyCodec_Decode(unicode, encoding, errors);
3125 if (v == NULL)
3126 goto onError;
3127 if (!PyUnicode_Check(v)) {
3128 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003129 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003130 Py_TYPE(v)->tp_name);
3131 Py_DECREF(v);
3132 goto onError;
3133 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003134 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003135
Benjamin Peterson29060642009-01-31 22:14:21 +00003136 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003137 return NULL;
3138}
3139
Alexander Belopolsky40018472011-02-26 01:02:56 +00003140PyObject *
3141PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003142 Py_ssize_t size,
3143 const char *encoding,
3144 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003145{
3146 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003147
Guido van Rossumd57fd912000-03-10 22:53:23 +00003148 unicode = PyUnicode_FromUnicode(s, size);
3149 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003150 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003151 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3152 Py_DECREF(unicode);
3153 return v;
3154}
3155
Alexander Belopolsky40018472011-02-26 01:02:56 +00003156PyObject *
3157PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003158 const char *encoding,
3159 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003160{
3161 PyObject *v;
3162
3163 if (!PyUnicode_Check(unicode)) {
3164 PyErr_BadArgument();
3165 goto onError;
3166 }
3167
3168 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003169 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003170
3171 /* Encode via the codec registry */
3172 v = PyCodec_Encode(unicode, encoding, errors);
3173 if (v == NULL)
3174 goto onError;
3175 return v;
3176
Benjamin Peterson29060642009-01-31 22:14:21 +00003177 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003178 return NULL;
3179}
3180
/* Locate the first wide character of wstr that wcstombs() cannot convert.

   The string is fed to wcstombs() one character at a time (one surrogate
   pair at a time when wchar_t is 16 bits wide).  Returns the index, in
   wchar_t units, of the first character that fails; returns 0 when no
   failing character is found. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3];
#else
    wchar_t unit[2];
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

    /* The last slot always holds the terminator. */
#if SIZEOF_WCHAR_T == 2
    unit[2] = 0;
#else
    unit[1] = 0;
#endif

    while (*cursor != L'\0') {
        const wchar_t *char_start = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            /* Keep a surrogate pair together. */
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = *cursor;
            unit[1] = 0;
            cursor++;
        }
#else
        unit[0] = *cursor;
        cursor++;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1)
            return char_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3227
Victor Stinner1b579672011-12-17 05:47:23 +01003228static int
3229locale_error_handler(const char *errors, int *surrogateescape)
3230{
3231 if (errors == NULL) {
3232 *surrogateescape = 0;
3233 return 0;
3234 }
3235
3236 if (strcmp(errors, "strict") == 0) {
3237 *surrogateescape = 0;
3238 return 0;
3239 }
3240 if (strcmp(errors, "surrogateescape") == 0) {
3241 *surrogateescape = 1;
3242 return 0;
3243 }
3244 PyErr_Format(PyExc_ValueError,
3245 "only 'strict' and 'surrogateescape' error handlers "
3246 "are supported, not '%s'",
3247 errors);
3248 return -1;
3249}
3250
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003251PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003252PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003253{
3254 Py_ssize_t wlen, wlen2;
3255 wchar_t *wstr;
3256 PyObject *bytes = NULL;
3257 char *errmsg;
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003258 PyObject *reason;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003259 PyObject *exc;
3260 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003261 int surrogateescape;
3262
3263 if (locale_error_handler(errors, &surrogateescape) < 0)
3264 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003265
3266 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3267 if (wstr == NULL)
3268 return NULL;
3269
3270 wlen2 = wcslen(wstr);
3271 if (wlen2 != wlen) {
3272 PyMem_Free(wstr);
3273 PyErr_SetString(PyExc_TypeError, "embedded null character");
3274 return NULL;
3275 }
3276
3277 if (surrogateescape) {
3278 /* locale encoding with surrogateescape */
3279 char *str;
3280
3281 str = _Py_wchar2char(wstr, &error_pos);
3282 if (str == NULL) {
3283 if (error_pos == (size_t)-1) {
3284 PyErr_NoMemory();
3285 PyMem_Free(wstr);
3286 return NULL;
3287 }
3288 else {
3289 goto encode_error;
3290 }
3291 }
3292 PyMem_Free(wstr);
3293
3294 bytes = PyBytes_FromString(str);
3295 PyMem_Free(str);
3296 }
3297 else {
3298 size_t len, len2;
3299
3300 len = wcstombs(NULL, wstr, 0);
3301 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003302 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003303 goto encode_error;
3304 }
3305
3306 bytes = PyBytes_FromStringAndSize(NULL, len);
3307 if (bytes == NULL) {
3308 PyMem_Free(wstr);
3309 return NULL;
3310 }
3311
3312 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3313 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003314 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003315 goto encode_error;
3316 }
3317 PyMem_Free(wstr);
3318 }
3319 return bytes;
3320
3321encode_error:
3322 errmsg = strerror(errno);
3323 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003324
3325 if (error_pos == (size_t)-1)
3326 error_pos = wcstombs_errorpos(wstr);
3327
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003328 PyMem_Free(wstr);
3329 Py_XDECREF(bytes);
3330
Victor Stinner2f197072011-12-17 07:08:30 +01003331 if (errmsg != NULL) {
3332 size_t errlen;
3333 wstr = _Py_char2wchar(errmsg, &errlen);
3334 if (wstr != NULL) {
3335 reason = PyUnicode_FromWideChar(wstr, errlen);
3336 PyMem_Free(wstr);
3337 } else
3338 errmsg = NULL;
3339 }
3340 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003341 reason = PyUnicode_FromString(
3342 "wcstombs() encountered an unencodable "
3343 "wide character");
3344 if (reason == NULL)
3345 return NULL;
3346
3347 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3348 "locale", unicode,
3349 (Py_ssize_t)error_pos,
3350 (Py_ssize_t)(error_pos+1),
3351 reason);
3352 Py_DECREF(reason);
3353 if (exc != NULL) {
3354 PyCodec_StrictErrors(exc);
3355 Py_XDECREF(exc);
3356 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003357 return NULL;
3358}
3359
Victor Stinnerad158722010-10-27 00:25:46 +00003360PyObject *
3361PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003362{
Victor Stinner99b95382011-07-04 14:23:54 +02003363#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003364 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003365#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003366 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003367#else
Victor Stinner793b5312011-04-27 00:24:21 +02003368 PyInterpreterState *interp = PyThreadState_GET()->interp;
3369 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3370 cannot use it to encode and decode filenames before it is loaded. Load
3371 the Python codec requires to encode at least its own filename. Use the C
3372 version of the locale codec until the codec registry is initialized and
3373 the Python codec is loaded.
3374
3375 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3376 cannot only rely on it: check also interp->fscodec_initialized for
3377 subinterpreters. */
3378 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003379 return PyUnicode_AsEncodedString(unicode,
3380 Py_FileSystemDefaultEncoding,
3381 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003382 }
3383 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003384 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003385 }
Victor Stinnerad158722010-10-27 00:25:46 +00003386#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003387}
3388
Alexander Belopolsky40018472011-02-26 01:02:56 +00003389PyObject *
3390PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003391 const char *encoding,
3392 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003393{
3394 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003395 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003396
Guido van Rossumd57fd912000-03-10 22:53:23 +00003397 if (!PyUnicode_Check(unicode)) {
3398 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003399 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003400 }
Fred Drakee4315f52000-05-09 19:53:39 +00003401
Fred Drakee4315f52000-05-09 19:53:39 +00003402 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003403 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003404 if ((strcmp(lower, "utf-8") == 0) ||
3405 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003406 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003407 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003408 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003409 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003410 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003411 }
Victor Stinner37296e82010-06-10 13:36:23 +00003412 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003413 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003414 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003415 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003416#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003417 else if (strcmp(lower, "mbcs") == 0)
3418 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003419#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003420 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003421 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003422 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003423
3424 /* Encode via the codec registry */
3425 v = PyCodec_Encode(unicode, encoding, errors);
3426 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003427 return NULL;
3428
3429 /* The normal path */
3430 if (PyBytes_Check(v))
3431 return v;
3432
3433 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003434 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003435 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003436 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003437
3438 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3439 "encoder %s returned bytearray instead of bytes",
3440 encoding);
3441 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003442 Py_DECREF(v);
3443 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003444 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003445
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003446 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3447 Py_DECREF(v);
3448 return b;
3449 }
3450
3451 PyErr_Format(PyExc_TypeError,
3452 "encoder did not return a bytes object (type=%.400s)",
3453 Py_TYPE(v)->tp_name);
3454 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003455 return NULL;
3456}
3457
Alexander Belopolsky40018472011-02-26 01:02:56 +00003458PyObject *
3459PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003460 const char *encoding,
3461 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003462{
3463 PyObject *v;
3464
3465 if (!PyUnicode_Check(unicode)) {
3466 PyErr_BadArgument();
3467 goto onError;
3468 }
3469
3470 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003471 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003472
3473 /* Encode via the codec registry */
3474 v = PyCodec_Encode(unicode, encoding, errors);
3475 if (v == NULL)
3476 goto onError;
3477 if (!PyUnicode_Check(v)) {
3478 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003479 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003480 Py_TYPE(v)->tp_name);
3481 Py_DECREF(v);
3482 goto onError;
3483 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003484 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003485
Benjamin Peterson29060642009-01-31 22:14:21 +00003486 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003487 return NULL;
3488}
3489
/* Locate the first byte sequence in str (len bytes) that mbrtowc() cannot
   decode.

   Returns the byte offset of the invalid or incomplete sequence, or 0
   when none is found.  Without HAVE_MBRTOWC the search is not possible
   and 0 is always returned. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t ch;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t consumed = mbrtowc(&ch, (char*)cursor, remaining, &state);

        if (consumed == 0)
            /* Reached end of string */
            break;
        if (consumed == (size_t)-1 || consumed == (size_t)-2)
            /* Conversion error or incomplete character */
            return cursor - str;

        cursor += consumed;
        remaining -= consumed;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3520
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003521PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003522PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003523 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003524{
3525 wchar_t smallbuf[256];
3526 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3527 wchar_t *wstr;
3528 size_t wlen, wlen2;
3529 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003530 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003531 size_t error_pos;
3532 char *errmsg;
3533 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003534
3535 if (locale_error_handler(errors, &surrogateescape) < 0)
3536 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003537
3538 if (str[len] != '\0' || len != strlen(str)) {
3539 PyErr_SetString(PyExc_TypeError, "embedded null character");
3540 return NULL;
3541 }
3542
3543 if (surrogateescape)
3544 {
3545 wstr = _Py_char2wchar(str, &wlen);
3546 if (wstr == NULL) {
3547 if (wlen == (size_t)-1)
3548 PyErr_NoMemory();
3549 else
3550 PyErr_SetFromErrno(PyExc_OSError);
3551 return NULL;
3552 }
3553
3554 unicode = PyUnicode_FromWideChar(wstr, wlen);
3555 PyMem_Free(wstr);
3556 }
3557 else {
3558#ifndef HAVE_BROKEN_MBSTOWCS
3559 wlen = mbstowcs(NULL, str, 0);
3560#else
3561 wlen = len;
3562#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003563 if (wlen == (size_t)-1)
3564 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003565 if (wlen+1 <= smallbuf_len) {
3566 wstr = smallbuf;
3567 }
3568 else {
3569 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3570 return PyErr_NoMemory();
3571
3572 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3573 if (!wstr)
3574 return PyErr_NoMemory();
3575 }
3576
3577 /* This shouldn't fail now */
3578 wlen2 = mbstowcs(wstr, str, wlen+1);
3579 if (wlen2 == (size_t)-1) {
3580 if (wstr != smallbuf)
3581 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003582 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003583 }
3584#ifdef HAVE_BROKEN_MBSTOWCS
3585 assert(wlen2 == wlen);
3586#endif
3587 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3588 if (wstr != smallbuf)
3589 PyMem_Free(wstr);
3590 }
3591 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003592
3593decode_error:
3594 errmsg = strerror(errno);
3595 assert(errmsg != NULL);
3596
3597 error_pos = mbstowcs_errorpos(str, len);
3598 if (errmsg != NULL) {
3599 size_t errlen;
3600 wstr = _Py_char2wchar(errmsg, &errlen);
3601 if (wstr != NULL) {
3602 reason = PyUnicode_FromWideChar(wstr, errlen);
3603 PyMem_Free(wstr);
3604 } else
3605 errmsg = NULL;
3606 }
3607 if (errmsg == NULL)
3608 reason = PyUnicode_FromString(
3609 "mbstowcs() encountered an invalid multibyte sequence");
3610 if (reason == NULL)
3611 return NULL;
3612
3613 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3614 "locale", str, len,
3615 (Py_ssize_t)error_pos,
3616 (Py_ssize_t)(error_pos+1),
3617 reason);
3618 Py_DECREF(reason);
3619 if (exc != NULL) {
3620 PyCodec_StrictErrors(exc);
3621 Py_XDECREF(exc);
3622 }
3623 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003624}
3625
3626PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003627PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003628{
3629 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003630 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003631}
3632
3633
3634PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003635PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003636 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003637 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3638}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003639
Christian Heimes5894ba72007-11-04 11:43:14 +00003640PyObject*
3641PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3642{
Victor Stinner99b95382011-07-04 14:23:54 +02003643#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003644 return PyUnicode_DecodeMBCS(s, size, NULL);
3645#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003646 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003647#else
Victor Stinner793b5312011-04-27 00:24:21 +02003648 PyInterpreterState *interp = PyThreadState_GET()->interp;
3649 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3650 cannot use it to encode and decode filenames before it is loaded. Load
3651 the Python codec requires to encode at least its own filename. Use the C
3652 version of the locale codec until the codec registry is initialized and
3653 the Python codec is loaded.
3654
3655 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3656 cannot only rely on it: check also interp->fscodec_initialized for
3657 subinterpreters. */
3658 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003659 return PyUnicode_Decode(s, size,
3660 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003661 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003662 }
3663 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003664 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003665 }
Victor Stinnerad158722010-10-27 00:25:46 +00003666#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003667}
3668
Martin v. Löwis011e8422009-05-05 04:43:17 +00003669
3670int
Antoine Pitrou13348842012-01-29 18:36:34 +01003671_PyUnicode_HasNULChars(PyObject* s)
3672{
3673 static PyObject *nul = NULL;
3674
3675 if (nul == NULL)
3676 nul = PyUnicode_FromStringAndSize("\0", 1);
3677 if (nul == NULL)
3678 return -1;
3679 return PyUnicode_Contains(s, nul);
3680}
3681
3682
3683int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003684PyUnicode_FSConverter(PyObject* arg, void* addr)
3685{
3686 PyObject *output = NULL;
3687 Py_ssize_t size;
3688 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003689 if (arg == NULL) {
3690 Py_DECREF(*(PyObject**)addr);
3691 return 1;
3692 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003693 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003694 output = arg;
3695 Py_INCREF(output);
3696 }
3697 else {
3698 arg = PyUnicode_FromObject(arg);
3699 if (!arg)
3700 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003701 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003702 Py_DECREF(arg);
3703 if (!output)
3704 return 0;
3705 if (!PyBytes_Check(output)) {
3706 Py_DECREF(output);
3707 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3708 return 0;
3709 }
3710 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003711 size = PyBytes_GET_SIZE(output);
3712 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003713 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003714 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003715 Py_DECREF(output);
3716 return 0;
3717 }
3718 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003719 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003720}
3721
3722
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003723int
3724PyUnicode_FSDecoder(PyObject* arg, void* addr)
3725{
3726 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003727 if (arg == NULL) {
3728 Py_DECREF(*(PyObject**)addr);
3729 return 1;
3730 }
3731 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003732 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003733 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003734 output = arg;
3735 Py_INCREF(output);
3736 }
3737 else {
3738 arg = PyBytes_FromObject(arg);
3739 if (!arg)
3740 return 0;
3741 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3742 PyBytes_GET_SIZE(arg));
3743 Py_DECREF(arg);
3744 if (!output)
3745 return 0;
3746 if (!PyUnicode_Check(output)) {
3747 Py_DECREF(output);
3748 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3749 return 0;
3750 }
3751 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003752 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003753 Py_DECREF(output);
3754 return 0;
3755 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003756 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003757 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003758 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3759 Py_DECREF(output);
3760 return 0;
3761 }
3762 *(PyObject**)addr = output;
3763 return Py_CLEANUP_SUPPORTED;
3764}
3765
3766
Martin v. Löwis5b222132007-06-10 09:51:05 +00003767char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003768PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003769{
Christian Heimesf3863112007-11-22 07:46:41 +00003770 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003771
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003772 if (!PyUnicode_Check(unicode)) {
3773 PyErr_BadArgument();
3774 return NULL;
3775 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003776 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003777 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003778
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003779 if (PyUnicode_UTF8(unicode) == NULL) {
3780 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003781 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3782 if (bytes == NULL)
3783 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003784 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3785 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003786 Py_DECREF(bytes);
3787 return NULL;
3788 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003789 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3790 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3791 PyBytes_AS_STRING(bytes),
3792 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003793 Py_DECREF(bytes);
3794 }
3795
3796 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003797 *psize = PyUnicode_UTF8_LENGTH(unicode);
3798 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003799}
3800
3801char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003802PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003803{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003804 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3805}
3806
3807#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02003808static int unicode_as_unicode_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003809#endif
3810
3811
3812Py_UNICODE *
3813PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3814{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003815 const unsigned char *one_byte;
3816#if SIZEOF_WCHAR_T == 4
3817 const Py_UCS2 *two_bytes;
3818#else
3819 const Py_UCS4 *four_bytes;
3820 const Py_UCS4 *ucs4_end;
3821 Py_ssize_t num_surrogates;
3822#endif
3823 wchar_t *w;
3824 wchar_t *wchar_end;
3825
3826 if (!PyUnicode_Check(unicode)) {
3827 PyErr_BadArgument();
3828 return NULL;
3829 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003830 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003831 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003832 assert(_PyUnicode_KIND(unicode) != 0);
3833 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003834
3835#ifdef Py_DEBUG
3836 ++unicode_as_unicode_calls;
3837#endif
3838
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003839 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003840#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003841 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3842 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003843 num_surrogates = 0;
3844
3845 for (; four_bytes < ucs4_end; ++four_bytes) {
3846 if (*four_bytes > 0xFFFF)
3847 ++num_surrogates;
3848 }
3849
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003850 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3851 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3852 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003853 PyErr_NoMemory();
3854 return NULL;
3855 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003856 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003857
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003858 w = _PyUnicode_WSTR(unicode);
3859 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3860 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003861 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3862 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003863 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003864 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003865 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3866 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003867 }
3868 else
3869 *w = *four_bytes;
3870
3871 if (w > wchar_end) {
3872 assert(0 && "Miscalculated string end");
3873 }
3874 }
3875 *w = 0;
3876#else
3877 /* sizeof(wchar_t) == 4 */
3878 Py_FatalError("Impossible unicode object state, wstr and str "
3879 "should share memory already.");
3880 return NULL;
3881#endif
3882 }
3883 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003884 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3885 (_PyUnicode_LENGTH(unicode) + 1));
3886 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003887 PyErr_NoMemory();
3888 return NULL;
3889 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003890 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3891 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3892 w = _PyUnicode_WSTR(unicode);
3893 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003894
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003895 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3896 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003897 for (; w < wchar_end; ++one_byte, ++w)
3898 *w = *one_byte;
3899 /* null-terminate the wstr */
3900 *w = 0;
3901 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003902 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003903#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003904 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003905 for (; w < wchar_end; ++two_bytes, ++w)
3906 *w = *two_bytes;
3907 /* null-terminate the wstr */
3908 *w = 0;
3909#else
3910 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003911 PyObject_FREE(_PyUnicode_WSTR(unicode));
3912 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003913 Py_FatalError("Impossible unicode object state, wstr "
3914 "and str should share memory already.");
3915 return NULL;
3916#endif
3917 }
3918 else {
3919 assert(0 && "This should never happen.");
3920 }
3921 }
3922 }
3923 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003924 *size = PyUnicode_WSTR_LENGTH(unicode);
3925 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003926}
3927
Alexander Belopolsky40018472011-02-26 01:02:56 +00003928Py_UNICODE *
3929PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003930{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003931 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003932}
3933
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003934
Alexander Belopolsky40018472011-02-26 01:02:56 +00003935Py_ssize_t
3936PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003937{
3938 if (!PyUnicode_Check(unicode)) {
3939 PyErr_BadArgument();
3940 goto onError;
3941 }
3942 return PyUnicode_GET_SIZE(unicode);
3943
Benjamin Peterson29060642009-01-31 22:14:21 +00003944 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003945 return -1;
3946}
3947
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003948Py_ssize_t
3949PyUnicode_GetLength(PyObject *unicode)
3950{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003951 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003952 PyErr_BadArgument();
3953 return -1;
3954 }
3955
3956 return PyUnicode_GET_LENGTH(unicode);
3957}
3958
3959Py_UCS4
3960PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3961{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003962 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3963 PyErr_BadArgument();
3964 return (Py_UCS4)-1;
3965 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003966 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003967 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003968 return (Py_UCS4)-1;
3969 }
3970 return PyUnicode_READ_CHAR(unicode, index);
3971}
3972
3973int
3974PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3975{
3976 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003977 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003978 return -1;
3979 }
Victor Stinner488fa492011-12-12 00:01:39 +01003980 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003981 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003982 PyErr_SetString(PyExc_IndexError, "string index out of range");
3983 return -1;
3984 }
Victor Stinner488fa492011-12-12 00:01:39 +01003985 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003986 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01003987 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3988 PyErr_SetString(PyExc_ValueError, "character out of range");
3989 return -1;
3990 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003991 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3992 index, ch);
3993 return 0;
3994}
3995
/* Return the name of the default encoding, which is always the constant
   string "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4001
Victor Stinner554f3f02010-06-16 23:33:54 +00004002/* create or adjust a UnicodeDecodeError */
4003static void
4004make_decode_exception(PyObject **exceptionObject,
4005 const char *encoding,
4006 const char *input, Py_ssize_t length,
4007 Py_ssize_t startpos, Py_ssize_t endpos,
4008 const char *reason)
4009{
4010 if (*exceptionObject == NULL) {
4011 *exceptionObject = PyUnicodeDecodeError_Create(
4012 encoding, input, length, startpos, endpos, reason);
4013 }
4014 else {
4015 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4016 goto onError;
4017 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4018 goto onError;
4019 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4020 goto onError;
4021 }
4022 return;
4023
4024onError:
4025 Py_DECREF(*exceptionObject);
4026 *exceptionObject = NULL;
4027}
4028
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004029/* error handling callback helper:
4030 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004031 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004032 and adjust various state variables.
4033 return 0 on success, -1 on error
4034*/
4035
Alexander Belopolsky40018472011-02-26 01:02:56 +00004036static int
4037unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004038 const char *encoding, const char *reason,
4039 const char **input, const char **inend, Py_ssize_t *startinpos,
4040 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004041 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004042{
Benjamin Peterson142957c2008-07-04 19:55:29 +00004043 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004044
4045 PyObject *restuple = NULL;
4046 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004047 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004048 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004049 Py_ssize_t requiredsize;
4050 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004051 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004052 int res = -1;
4053
Victor Stinner596a6c42011-11-09 00:02:18 +01004054 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
4055 outsize = PyUnicode_GET_LENGTH(*output);
4056 else
4057 outsize = _PyUnicode_WSTR_LENGTH(*output);
4058
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004059 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004060 *errorHandler = PyCodec_LookupError(errors);
4061 if (*errorHandler == NULL)
4062 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004063 }
4064
Victor Stinner554f3f02010-06-16 23:33:54 +00004065 make_decode_exception(exceptionObject,
4066 encoding,
4067 *input, *inend - *input,
4068 *startinpos, *endinpos,
4069 reason);
4070 if (*exceptionObject == NULL)
4071 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004072
4073 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4074 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004075 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004076 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00004077 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004078 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004079 }
4080 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004081 goto onError;
Benjamin Petersonbac79492012-01-14 13:34:47 -05004082 if (PyUnicode_READY(repunicode) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004083 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004084
4085 /* Copy back the bytes variables, which might have been modified by the
4086 callback */
4087 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4088 if (!inputobj)
4089 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004090 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004091 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004092 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004093 *input = PyBytes_AS_STRING(inputobj);
4094 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004095 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004096 /* we can DECREF safely, as the exception has another reference,
4097 so the object won't go away. */
4098 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004099
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004100 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004101 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004102 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004103 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4104 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004105 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004106
Victor Stinner596a6c42011-11-09 00:02:18 +01004107 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
4108 /* need more space? (at least enough for what we
4109 have+the replacement+the rest of the string (starting
4110 at the new input position), so we won't have to check space
4111 when there are no errors in the rest of the string) */
4112 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
4113 requiredsize = *outpos + replen + insize-newpos;
4114 if (requiredsize > outsize) {
4115 if (requiredsize<2*outsize)
4116 requiredsize = 2*outsize;
4117 if (unicode_resize(output, requiredsize) < 0)
4118 goto onError;
4119 }
4120 if (unicode_widen(output, PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004121 goto onError;
Victor Stinner596a6c42011-11-09 00:02:18 +01004122 copy_characters(*output, *outpos, repunicode, 0, replen);
4123 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004124 }
Victor Stinner596a6c42011-11-09 00:02:18 +01004125 else {
4126 wchar_t *repwstr;
4127 Py_ssize_t repwlen;
4128 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4129 if (repwstr == NULL)
4130 goto onError;
4131 /* need more space? (at least enough for what we
4132 have+the replacement+the rest of the string (starting
4133 at the new input position), so we won't have to check space
4134 when there are no errors in the rest of the string) */
4135 requiredsize = *outpos + repwlen + insize-newpos;
4136 if (requiredsize > outsize) {
4137 if (requiredsize < 2*outsize)
4138 requiredsize = 2*outsize;
4139 if (unicode_resize(output, requiredsize) < 0)
4140 goto onError;
4141 }
4142 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4143 *outpos += repwlen;
4144 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004145 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004146 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004147
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004148 /* we made it! */
4149 res = 0;
4150
Benjamin Peterson29060642009-01-31 22:14:21 +00004151 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004152 Py_XDECREF(restuple);
4153 return res;
4154}
4155
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004156/* --- UTF-7 Codec -------------------------------------------------------- */
4157
Antoine Pitrou244651a2009-05-04 18:56:13 +00004158/* See RFC2152 for details. We encode conservatively and decode liberally. */
4159
/* Base-64 helper macros.  Each macro may evaluate its argument more than
   once, so arguments must be free of side effects. */

/* IS_BASE64: non-zero if c is one of the 64 base-64 alphabet characters
   (A-Z, a-z, 0-9, '+', '/'). */

#define IS_BASE64(c)                        \
    (((c) >= 'A' && (c) <= 'Z') ||          \
     ((c) >= 'a' && (c) <= 'z') ||          \
     ((c) >= '0' && (c) <= '9') ||          \
     (c) == '+' || (c) == '/')

/* FROM_BASE64: the 6-bit value (0..63) of base-64 character c.  The result
   is only meaningful when IS_BASE64(c) holds; any character that matches
   none of the explicit cases maps to 63. */

#define FROM_BASE64(c)                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)

/* TO_BASE64: the base-64 character for the low 6 bits of n; higher bits
   of n are masked off. */

#define TO_BASE64(n)                                                        \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  Decoding is permissive; the only ASCII byte that
 * does not decode to itself is the '+' which begins a base64 string. */

#define DECODE_DIRECT(c)                    \
    ((c) <= 127 && (c) != '+')
4190
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127).
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        : character code; only 1..127 can ever be direct.
 * directO  : non-zero to encode "Set O" characters (category 1) directly.
 * directWS : non-zero to encode whitespace (category 2) directly.
 *
 * All three arguments are parenthesized in the expansion so that
 * expression arguments (e.g. "a || b") cannot change the result through
 * operator precedence.  c is evaluated several times and must be free of
 * side effects. */

#define ENCODE_DIRECT(c, directO, directWS)                 \
    ((c) < 128 && (c) > 0 &&                                \
     ((utf7_category[(c)] == 0) ||                          \
      ((directWS) && (utf7_category[(c)] == 2)) ||          \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004236
Alexander Belopolsky40018472011-02-26 01:02:56 +00004237PyObject *
4238PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004239 Py_ssize_t size,
4240 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004241{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004242 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4243}
4244
Antoine Pitrou244651a2009-05-04 18:56:13 +00004245/* The decoder. The only state we preserve is our read position,
4246 * i.e. how many characters we have consumed. So if we end in the
4247 * middle of a shift sequence we have to back off the read position
4248 * and the output to the beginning of the sequence, otherwise we lose
4249 * all the shift state (seen bits, number of bits seen, high
4250 * surrogate). */
4251
Alexander Belopolsky40018472011-02-26 01:02:56 +00004252PyObject *
4253PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004254 Py_ssize_t size,
4255 const char *errors,
4256 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004257{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004258 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004259 Py_ssize_t startinpos;
4260 Py_ssize_t endinpos;
4261 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004262 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004263 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004264 const char *errmsg = "";
4265 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004266 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004267 unsigned int base64bits = 0;
4268 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004269 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004270 PyObject *errorHandler = NULL;
4271 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004272
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004273 /* Start off assuming it's all ASCII. Widen later as necessary. */
4274 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004275 if (!unicode)
4276 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004277 if (size == 0) {
4278 if (consumed)
4279 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004280 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004281 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004282
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004283 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004284 e = s + size;
4285
4286 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004287 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004288 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004289 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004290
Antoine Pitrou244651a2009-05-04 18:56:13 +00004291 if (inShift) { /* in a base-64 section */
4292 if (IS_BASE64(ch)) { /* consume a base-64 character */
4293 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4294 base64bits += 6;
4295 s++;
4296 if (base64bits >= 16) {
4297 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004298 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004299 base64bits -= 16;
4300 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4301 if (surrogate) {
4302 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004303 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4304 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004305 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
4306 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004307 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004308 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004309 }
4310 else {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004311 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4312 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004313 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004314 }
4315 }
Victor Stinner551ac952011-11-29 22:58:13 +01004316 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004317 /* first surrogate */
4318 surrogate = outCh;
4319 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004320 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004321 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
4322 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004323 }
4324 }
4325 }
4326 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004327 inShift = 0;
4328 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004329 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004330 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4331 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004332 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004333 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004334 if (base64bits > 0) { /* left-over bits */
4335 if (base64bits >= 6) {
4336 /* We've seen at least one base-64 character */
4337 errmsg = "partial character in shift sequence";
4338 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004339 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004340 else {
4341 /* Some bits remain; they should be zero */
4342 if (base64buffer != 0) {
4343 errmsg = "non-zero padding bits in shift sequence";
4344 goto utf7Error;
4345 }
4346 }
4347 }
4348 if (ch != '-') {
4349 /* '-' is absorbed; other terminating
4350 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004351 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4352 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004353 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004354 }
4355 }
4356 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004357 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004358 s++; /* consume '+' */
4359 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004360 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004361 if (unicode_putchar(&unicode, &outpos, '+') < 0)
4362 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004363 }
4364 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004365 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004366 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004367 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004368 }
4369 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004370 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004371 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4372 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004373 s++;
4374 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004375 else {
4376 startinpos = s-starts;
4377 s++;
4378 errmsg = "unexpected special character";
4379 goto utf7Error;
4380 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004381 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004382utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004383 endinpos = s-starts;
4384 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00004385 errors, &errorHandler,
4386 "utf7", errmsg,
4387 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004388 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004389 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004390 }
4391
Antoine Pitrou244651a2009-05-04 18:56:13 +00004392 /* end of string */
4393
4394 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4395 /* if we're in an inconsistent state, that's an error */
4396 if (surrogate ||
4397 (base64bits >= 6) ||
4398 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004399 endinpos = size;
4400 if (unicode_decode_call_errorhandler(
4401 errors, &errorHandler,
4402 "utf7", "unterminated shift sequence",
4403 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004404 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004405 goto onError;
4406 if (s < e)
4407 goto restart;
4408 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004409 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004410
4411 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004412 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004413 if (inShift) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004414 outpos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004415 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004416 }
4417 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004418 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004419 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004420 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004421
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004422 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004423 goto onError;
4424
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004425 Py_XDECREF(errorHandler);
4426 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004427 return unicode_result(unicode);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004428
Benjamin Peterson29060642009-01-31 22:14:21 +00004429 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004430 Py_XDECREF(errorHandler);
4431 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004432 Py_DECREF(unicode);
4433 return NULL;
4434}
4435
4436
Alexander Belopolsky40018472011-02-26 01:02:56 +00004437PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004438_PyUnicode_EncodeUTF7(PyObject *str,
4439 int base64SetO,
4440 int base64WhiteSpace,
4441 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004442{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004443 int kind;
4444 void *data;
4445 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004446 PyObject *v;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004447 Py_ssize_t allocated;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004448 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004449 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004450 unsigned int base64bits = 0;
4451 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004452 char * out;
4453 char * start;
4454
Benjamin Petersonbac79492012-01-14 13:34:47 -05004455 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004456 return NULL;
4457 kind = PyUnicode_KIND(str);
4458 data = PyUnicode_DATA(str);
4459 len = PyUnicode_GET_LENGTH(str);
4460
4461 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004462 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004463
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004464 /* It might be possible to tighten this worst case */
4465 allocated = 8 * len;
4466 if (allocated / 8 != len)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004467 return PyErr_NoMemory();
4468
Antoine Pitrou244651a2009-05-04 18:56:13 +00004469 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004470 if (v == NULL)
4471 return NULL;
4472
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004473 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004474 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004475 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004476
Antoine Pitrou244651a2009-05-04 18:56:13 +00004477 if (inShift) {
4478 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4479 /* shifting out */
4480 if (base64bits) { /* output remaining bits */
4481 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4482 base64buffer = 0;
4483 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004484 }
4485 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004486 /* Characters not in the BASE64 set implicitly unshift the sequence
4487 so no '-' is required, except if the character is itself a '-' */
4488 if (IS_BASE64(ch) || ch == '-') {
4489 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004490 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004491 *out++ = (char) ch;
4492 }
4493 else {
4494 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004495 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004496 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004497 else { /* not in a shift sequence */
4498 if (ch == '+') {
4499 *out++ = '+';
4500 *out++ = '-';
4501 }
4502 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4503 *out++ = (char) ch;
4504 }
4505 else {
4506 *out++ = '+';
4507 inShift = 1;
4508 goto encode_char;
4509 }
4510 }
4511 continue;
4512encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004513 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004514 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004515
Antoine Pitrou244651a2009-05-04 18:56:13 +00004516 /* code first surrogate */
4517 base64bits += 16;
4518 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4519 while (base64bits >= 6) {
4520 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4521 base64bits -= 6;
4522 }
4523 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004524 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004525 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004526 base64bits += 16;
4527 base64buffer = (base64buffer << 16) | ch;
4528 while (base64bits >= 6) {
4529 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4530 base64bits -= 6;
4531 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004532 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004533 if (base64bits)
4534 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4535 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004536 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004537 if (_PyBytes_Resize(&v, out - start) < 0)
4538 return NULL;
4539 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004540}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004541PyObject *
4542PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4543 Py_ssize_t size,
4544 int base64SetO,
4545 int base64WhiteSpace,
4546 const char *errors)
4547{
4548 PyObject *result;
4549 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4550 if (tmp == NULL)
4551 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004552 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004553 base64WhiteSpace, errors);
4554 Py_DECREF(tmp);
4555 return result;
4556}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004557
Antoine Pitrou244651a2009-05-04 18:56:13 +00004558#undef IS_BASE64
4559#undef FROM_BASE64
4560#undef TO_BASE64
4561#undef DECODE_DIRECT
4562#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004563
Guido van Rossumd57fd912000-03-10 22:53:23 +00004564/* --- UTF-8 Codec -------------------------------------------------------- */
4565
/* Map UTF-8 encoded prefix byte to sequence length.  Zero means
   illegal prefix.  See RFC 3629 for details.
   The table is only ever read, so it is declared const. */
static const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4587
Alexander Belopolsky40018472011-02-26 01:02:56 +00004588PyObject *
4589PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004590 Py_ssize_t size,
4591 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004592{
Walter Dörwald69652032004-09-07 20:24:22 +00004593 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4594}
4595
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004596#include "stringlib/ucs1lib.h"
4597#include "stringlib/codecs.h"
4598#include "stringlib/undef.h"
4599
4600#include "stringlib/ucs2lib.h"
4601#include "stringlib/codecs.h"
4602#include "stringlib/undef.h"
4603
4604#include "stringlib/ucs4lib.h"
4605#include "stringlib/codecs.h"
4606#include "stringlib/undef.h"
4607
Antoine Pitrouab868312009-01-10 15:40:25 +00004608/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4609#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4610
4611/* Mask to quickly check whether a C 'long' contains a
4612 non-ASCII, UTF8-encoded char. */
4613#if (SIZEOF_LONG == 8)
4614# define ASCII_CHAR_MASK 0x8080808080808080L
4615#elif (SIZEOF_LONG == 4)
4616# define ASCII_CHAR_MASK 0x80808080L
4617#else
4618# error C 'long' size should be either 4 or 8!
4619#endif
4620
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004621/* Scans a UTF-8 string and returns the maximum character to be expected
4622 and the size of the decoded unicode string.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004623
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004624 This function doesn't check for errors, these checks are performed in
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004625 PyUnicode_DecodeUTF8Stateful.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004626 */
4627static Py_UCS4
Victor Stinnera1d12bb2011-12-11 21:53:09 +01004628utf8_scanner(const unsigned char *p, Py_ssize_t string_size, Py_ssize_t *unicode_size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004629{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004630 Py_ssize_t char_count = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004631 const unsigned char *end = p + string_size;
4632 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004633
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004634 assert(unicode_size != NULL);
4635
4636 /* By having a cascade of independent loops which fallback onto each
4637 other, we minimize the amount of work done in the average loop
4638 iteration, and we also maximize the CPU's ability to predict
4639 branches correctly (because a given condition will have always the
4640 same boolean outcome except perhaps in the last iteration of the
4641 corresponding loop).
4642 In the general case this brings us rather close to decoding
4643 performance pre-PEP 393, despite the two-pass decoding.
4644
4645 Note that the pure ASCII loop is not duplicated once a non-ASCII
4646 character has been encountered. It is actually a pessimization (by
4647 a significant factor) to use this loop on text with many non-ASCII
4648 characters, and it is important to avoid bad performance on valid
4649 utf-8 data (invalid utf-8 being a different can of worms).
4650 */
4651
4652 /* ASCII */
4653 for (; p < end; ++p) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004654 /* Only check value if it's not a ASCII char... */
4655 if (*p < 0x80) {
4656 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4657 an explanation. */
4658 if (!((size_t) p & LONG_PTR_MASK)) {
4659 /* Help register allocation */
4660 register const unsigned char *_p = p;
4661 while (_p < aligned_end) {
4662 unsigned long value = *(unsigned long *) _p;
4663 if (value & ASCII_CHAR_MASK)
4664 break;
4665 _p += SIZEOF_LONG;
4666 char_count += SIZEOF_LONG;
4667 }
4668 p = _p;
4669 if (p == end)
4670 break;
4671 }
4672 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004673 if (*p < 0x80)
4674 ++char_count;
4675 else
4676 goto _ucs1loop;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004677 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004678 *unicode_size = char_count;
4679 return 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004680
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004681_ucs1loop:
4682 for (; p < end; ++p) {
4683 if (*p < 0xc4)
4684 char_count += ((*p & 0xc0) != 0x80);
4685 else
4686 goto _ucs2loop;
4687 }
4688 *unicode_size = char_count;
4689 return 255;
4690
4691_ucs2loop:
4692 for (; p < end; ++p) {
4693 if (*p < 0xf0)
4694 char_count += ((*p & 0xc0) != 0x80);
4695 else
4696 goto _ucs4loop;
4697 }
4698 *unicode_size = char_count;
4699 return 65535;
4700
4701_ucs4loop:
4702 for (; p < end; ++p) {
4703 char_count += ((*p & 0xc0) != 0x80);
4704 }
4705 *unicode_size = char_count;
4706 return 65537;
4707}
4708
/* Similar to PyUnicode_WRITE but may resize the string first and jumps to
   onError on failure.  Implicit parameters: unicode, onError.
   `index` is evaluated exactly once, so passing `i++` is safe.
   When the index lies beyond the current length the string is grown to
   index + index/8, i.e. it is overallocated, so the result needs to
   shrink at the end.
*/
#define WRITE_MAYBE_FAIL(index, value)                                  \
    do {                                                                \
        Py_ssize_t write_pos = (index);                                 \
        if (write_pos > PyUnicode_GET_LENGTH(unicode) &&                \
            unicode_resize(&unicode, write_pos + write_pos/8) < 0)      \
            goto onError;                                               \
        if (unicode_putchar(&unicode, &write_pos, (value)) < 0)         \
            goto onError;                                               \
    } while (0)
4722
Victor Stinnerbf6e5602011-12-12 01:53:47 +01004723static PyObject *
Victor Stinner785938e2011-12-11 20:09:03 +01004724decode_utf8_errors(const char *starts,
4725 Py_ssize_t size,
4726 const char *errors,
4727 Py_ssize_t *consumed,
4728 const char *s,
4729 PyObject *unicode,
4730 Py_ssize_t i)
Walter Dörwald69652032004-09-07 20:24:22 +00004731{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004732 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004733 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004734 Py_ssize_t startinpos;
4735 Py_ssize_t endinpos;
Victor Stinner785938e2011-12-11 20:09:03 +01004736 const char *e = starts + size;
4737 const char *aligned_end;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004738 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004739 PyObject *errorHandler = NULL;
4740 PyObject *exc = NULL;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004741
Antoine Pitrouab868312009-01-10 15:40:25 +00004742 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004743
4744 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004745 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004746
4747 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004748 /* Fast path for runs of ASCII characters. Given that common UTF-8
4749 input will consist of an overwhelming majority of ASCII
4750 characters, we try to optimize for this case by checking
4751 as many characters as a C 'long' can contain.
4752 First, check if we can do an aligned read, as most CPUs have
4753 a penalty for unaligned reads.
4754 */
4755 if (!((size_t) s & LONG_PTR_MASK)) {
4756 /* Help register allocation */
4757 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004758 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004759 while (_s < aligned_end) {
4760 /* Read a whole long at a time (either 4 or 8 bytes),
4761 and do a fast unrolled copy if it only contains ASCII
4762 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004763 unsigned long value = *(unsigned long *) _s;
4764 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004765 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004766 WRITE_MAYBE_FAIL(_i+0, _s[0]);
4767 WRITE_MAYBE_FAIL(_i+1, _s[1]);
4768 WRITE_MAYBE_FAIL(_i+2, _s[2]);
4769 WRITE_MAYBE_FAIL(_i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004770#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004771 WRITE_MAYBE_FAIL(_i+4, _s[4]);
4772 WRITE_MAYBE_FAIL(_i+5, _s[5]);
4773 WRITE_MAYBE_FAIL(_i+6, _s[6]);
4774 WRITE_MAYBE_FAIL(_i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004775#endif
4776 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004777 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004778 }
4779 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004780 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004781 if (s == e)
4782 break;
4783 ch = (unsigned char)*s;
4784 }
4785 }
4786
4787 if (ch < 0x80) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004788 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004789 s++;
4790 continue;
4791 }
4792
4793 n = utf8_code_length[ch];
4794
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004795 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004796 if (consumed)
4797 break;
4798 else {
4799 errmsg = "unexpected end of data";
4800 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004801 endinpos = startinpos+1;
4802 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4803 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004804 goto utf8Error;
4805 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004806 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004807
4808 switch (n) {
4809
4810 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004811 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004812 startinpos = s-starts;
4813 endinpos = startinpos+1;
4814 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004815
4816 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004817 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004818 startinpos = s-starts;
4819 endinpos = startinpos+1;
4820 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004821
4822 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004823 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004824 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004825 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004826 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004827 goto utf8Error;
4828 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004829 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004830 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004831 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004832 break;
4833
4834 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004835 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4836 will result in surrogates in range d800-dfff. Surrogates are
4837 not valid UTF-8 so they are rejected.
4838 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4839 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004840 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004841 (s[2] & 0xc0) != 0x80 ||
4842 ((unsigned char)s[0] == 0xE0 &&
4843 (unsigned char)s[1] < 0xA0) ||
4844 ((unsigned char)s[0] == 0xED &&
4845 (unsigned char)s[1] > 0x9F)) {
4846 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004847 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004848 endinpos = startinpos + 1;
4849
4850 /* if s[1] first two bits are 1 and 0, then the invalid
4851 continuation byte is s[2], so increment endinpos by 1,
4852 if not, s[1] is invalid and endinpos doesn't need to
4853 be incremented. */
4854 if ((s[1] & 0xC0) == 0x80)
4855 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004856 goto utf8Error;
4857 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004858 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004859 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004860 WRITE_MAYBE_FAIL(i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004861 break;
4862
4863 case 4:
4864 if ((s[1] & 0xc0) != 0x80 ||
4865 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004866 (s[3] & 0xc0) != 0x80 ||
4867 ((unsigned char)s[0] == 0xF0 &&
4868 (unsigned char)s[1] < 0x90) ||
4869 ((unsigned char)s[0] == 0xF4 &&
4870 (unsigned char)s[1] > 0x8F)) {
4871 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004872 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004873 endinpos = startinpos + 1;
4874 if ((s[1] & 0xC0) == 0x80) {
4875 endinpos++;
4876 if ((s[2] & 0xC0) == 0x80)
4877 endinpos++;
4878 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004879 goto utf8Error;
4880 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004881 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004882 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Victor Stinner8faf8212011-12-08 22:14:11 +01004883 assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
Ezio Melotti57221d02010-07-01 07:32:02 +00004884
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004885 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004886 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004887 }
4888 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004889 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004890
Benjamin Peterson29060642009-01-31 22:14:21 +00004891 utf8Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00004892 if (unicode_decode_call_errorhandler(
4893 errors, &errorHandler,
Victor Stinnercbe01342012-02-14 01:17:45 +01004894 "utf-8", errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00004895 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004896 &unicode, &i))
Benjamin Peterson29060642009-01-31 22:14:21 +00004897 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004898 /* Update data because unicode_decode_call_errorhandler might have
4899 re-created or resized the unicode object. */
Benjamin Peterson29060642009-01-31 22:14:21 +00004900 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004901 }
Walter Dörwald69652032004-09-07 20:24:22 +00004902 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004903 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004904
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004905 /* Adjust length and ready string when it contained errors and
4906 is of the old resizable kind. */
Victor Stinner785938e2011-12-11 20:09:03 +01004907 if (unicode_resize(&unicode, i) < 0)
4908 goto onError;
4909 unicode_adjust_maxchar(&unicode);
4910 if (unicode == NULL)
4911 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004912
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004913 Py_XDECREF(errorHandler);
4914 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004915 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01004916 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004917
Benjamin Peterson29060642009-01-31 22:14:21 +00004918 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004919 Py_XDECREF(errorHandler);
4920 Py_XDECREF(exc);
Victor Stinner785938e2011-12-11 20:09:03 +01004921 Py_XDECREF(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004922 return NULL;
4923}
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004924#undef WRITE_MAYBE_FAIL
Antoine Pitrouab868312009-01-10 15:40:25 +00004925
Victor Stinner785938e2011-12-11 20:09:03 +01004926PyObject *
4927PyUnicode_DecodeUTF8Stateful(const char *s,
4928 Py_ssize_t size,
4929 const char *errors,
4930 Py_ssize_t *consumed)
4931{
4932 Py_UCS4 maxchar = 0;
4933 Py_ssize_t unicode_size;
4934 int has_errors = 0;
4935 PyObject *unicode;
4936 int kind;
4937 void *data;
4938 const char *starts = s;
4939 const char *e;
4940 Py_ssize_t i;
4941
4942 if (size == 0) {
4943 if (consumed)
4944 *consumed = 0;
Victor Stinner382955f2011-12-11 21:44:00 +01004945 Py_INCREF(unicode_empty);
4946 return unicode_empty;
Victor Stinner785938e2011-12-11 20:09:03 +01004947 }
4948
Victor Stinnera1d12bb2011-12-11 21:53:09 +01004949 maxchar = utf8_scanner((const unsigned char *)s, size, &unicode_size);
Victor Stinner785938e2011-12-11 20:09:03 +01004950
4951 /* When the string is ASCII only, just use memcpy and return.
4952 unicode_size may be != size if there is an incomplete UTF-8
4953 sequence at the end of the ASCII block. */
4954 if (maxchar < 128 && size == unicode_size) {
4955 if (consumed)
4956 *consumed = size;
Victor Stinnerab870212011-12-17 22:39:43 +01004957 return unicode_fromascii((const unsigned char *)s, size);
Victor Stinner785938e2011-12-11 20:09:03 +01004958 }
4959
4960 unicode = PyUnicode_New(unicode_size, maxchar);
4961 if (!unicode)
4962 return NULL;
4963 kind = PyUnicode_KIND(unicode);
4964 data = PyUnicode_DATA(unicode);
4965
4966 /* Unpack UTF-8 encoded data */
4967 i = 0;
4968 e = starts + size;
4969 switch (kind) {
4970 case PyUnicode_1BYTE_KIND:
4971 has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
4972 break;
4973 case PyUnicode_2BYTE_KIND:
4974 has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
4975 break;
4976 case PyUnicode_4BYTE_KIND:
4977 has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
4978 break;
4979 }
4980 if (!has_errors) {
4981 /* Ensure the unicode size calculation was correct */
4982 assert(i == unicode_size);
4983 assert(s == e);
4984 if (consumed)
4985 *consumed = size;
4986 return unicode;
4987 }
4988
4989 /* In case of errors, maxchar and size computation might be incorrect;
4990 code below refits and resizes as necessary. */
4991 return decode_utf8_errors(starts, size, errors, consumed, s, unicode, i);
4992}
4993
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Returns a NUL-terminated wchar_t buffer obtained from PyMem_Malloc, or
   NULL on failure.  Every byte that does not start a well-formed
   sequence is emitted as 0xDC00 + byte and decoding resumes at the
   following byte. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *stop;
    wchar_t *buffer, *out;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    buffer = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!buffer)
        return NULL;

    /* Unpack UTF-8 encoded data */
    out = buffer;
    stop = s + size;
    while (s < stop) {
        Py_UCS4 ch = (unsigned char)*s;   /* lead byte until decoded */
        int seqlen;
        int well_formed;

        if (ch < 0x80) {
            /* ASCII is copied through unchanged */
            *out++ = (wchar_t)ch;
            s++;
            continue;
        }

        seqlen = utf8_code_length[ch];
        /* A usable sequence has a legal multi-byte prefix and fits in
           the remaining input. */
        well_formed = (seqlen >= 2) && !(s + seqlen > stop);

        if (well_formed) {
            switch (seqlen) {
            case 2:
                well_formed = ((s[1] & 0xc0) == 0x80);
                break;

            case 3:
                /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
                   will result in surrogates in range d800-dfff. Surrogates are
                   not valid UTF-8 so they are rejected.
                   See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
                   (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
                well_formed =
                    (s[1] & 0xc0) == 0x80 &&
                    (s[2] & 0xc0) == 0x80 &&
                    !((unsigned char)s[0] == 0xE0 &&
                      (unsigned char)s[1] < 0xA0) &&
                    !((unsigned char)s[0] == 0xED &&
                      (unsigned char)s[1] > 0x9F);
                break;

            case 4:
                well_formed =
                    (s[1] & 0xc0) == 0x80 &&
                    (s[2] & 0xc0) == 0x80 &&
                    (s[3] & 0xc0) == 0x80 &&
                    !((unsigned char)s[0] == 0xF0 &&
                      (unsigned char)s[1] < 0x90) &&
                    !((unsigned char)s[0] == 0xF4 &&
                      (unsigned char)s[1] > 0x8F);
                break;
            }
        }

        if (!well_formed) {
            /* surrogateescape: ch still holds the offending lead byte */
            *out++ = 0xDC00 + ch;
            s++;
            continue;
        }

        switch (seqlen) {
        case 2:
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *out++ = (wchar_t)ch;
            break;

        case 3:
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            *out++ = (wchar_t)ch;
            break;

        case 4:
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));

#if SIZEOF_WCHAR_T == 4
            *out++ = (wchar_t)ch;
#else
            /*  compute and append the two surrogates: */
            *out++ = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            *out++ = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
            break;
        }
        s += seqlen;
    }
    *out = L'\0';
    return buffer;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00005101
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005102/* Primary internal function which creates utf8 encoded bytes objects.
5103
5104 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005105 and allocate exactly as much space needed at the end. Else allocate the
5106 maximum possible needed (4 result bytes per Unicode character), and return
5107 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005108*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005109PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005110_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005111{
Victor Stinner6099a032011-12-18 14:22:26 +01005112 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005113 void *data;
5114 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005115
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005116 if (!PyUnicode_Check(unicode)) {
5117 PyErr_BadArgument();
5118 return NULL;
5119 }
5120
5121 if (PyUnicode_READY(unicode) == -1)
5122 return NULL;
5123
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005124 if (PyUnicode_UTF8(unicode))
5125 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5126 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005127
5128 kind = PyUnicode_KIND(unicode);
5129 data = PyUnicode_DATA(unicode);
5130 size = PyUnicode_GET_LENGTH(unicode);
5131
Benjamin Petersonead6b532011-12-20 17:23:42 -06005132 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005133 default:
5134 assert(0);
5135 case PyUnicode_1BYTE_KIND:
5136 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5137 assert(!PyUnicode_IS_ASCII(unicode));
5138 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5139 case PyUnicode_2BYTE_KIND:
5140 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5141 case PyUnicode_4BYTE_KIND:
5142 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005143 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005144}
5145
Alexander Belopolsky40018472011-02-26 01:02:56 +00005146PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005147PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5148 Py_ssize_t size,
5149 const char *errors)
5150{
5151 PyObject *v, *unicode;
5152
5153 unicode = PyUnicode_FromUnicode(s, size);
5154 if (unicode == NULL)
5155 return NULL;
5156 v = _PyUnicode_AsUTF8String(unicode, errors);
5157 Py_DECREF(unicode);
5158 return v;
5159}
5160
5161PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005162PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005163{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005164 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005165}
5166
Walter Dörwald41980ca2007-08-16 21:55:45 +00005167/* --- UTF-32 Codec ------------------------------------------------------- */
5168
5169PyObject *
5170PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005171 Py_ssize_t size,
5172 const char *errors,
5173 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005174{
5175 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5176}
5177
5178PyObject *
5179PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005180 Py_ssize_t size,
5181 const char *errors,
5182 int *byteorder,
5183 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005184{
5185 const char *starts = s;
5186 Py_ssize_t startinpos;
5187 Py_ssize_t endinpos;
5188 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005189 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005190 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005191 int bo = 0; /* assume native ordering by default */
5192 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005193 /* Offsets from q for retrieving bytes in the right order. */
5194#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5195 int iorder[] = {0, 1, 2, 3};
5196#else
5197 int iorder[] = {3, 2, 1, 0};
5198#endif
5199 PyObject *errorHandler = NULL;
5200 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005201
Walter Dörwald41980ca2007-08-16 21:55:45 +00005202 q = (unsigned char *)s;
5203 e = q + size;
5204
5205 if (byteorder)
5206 bo = *byteorder;
5207
5208 /* Check for BOM marks (U+FEFF) in the input and adjust current
5209 byte order setting accordingly. In native mode, the leading BOM
5210 mark is skipped, in all other modes, it is copied to the output
5211 stream as-is (giving a ZWNBSP character). */
5212 if (bo == 0) {
5213 if (size >= 4) {
5214 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00005215 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005216#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005217 if (bom == 0x0000FEFF) {
5218 q += 4;
5219 bo = -1;
5220 }
5221 else if (bom == 0xFFFE0000) {
5222 q += 4;
5223 bo = 1;
5224 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005225#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005226 if (bom == 0x0000FEFF) {
5227 q += 4;
5228 bo = 1;
5229 }
5230 else if (bom == 0xFFFE0000) {
5231 q += 4;
5232 bo = -1;
5233 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005234#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005235 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005236 }
5237
5238 if (bo == -1) {
5239 /* force LE */
5240 iorder[0] = 0;
5241 iorder[1] = 1;
5242 iorder[2] = 2;
5243 iorder[3] = 3;
5244 }
5245 else if (bo == 1) {
5246 /* force BE */
5247 iorder[0] = 3;
5248 iorder[1] = 2;
5249 iorder[2] = 1;
5250 iorder[3] = 0;
5251 }
5252
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005253 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005254 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005255 if (!unicode)
5256 return NULL;
5257 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005258 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005259 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005260
Walter Dörwald41980ca2007-08-16 21:55:45 +00005261 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005262 Py_UCS4 ch;
5263 /* remaining bytes at the end? (size should be divisible by 4) */
5264 if (e-q<4) {
5265 if (consumed)
5266 break;
5267 errmsg = "truncated data";
5268 startinpos = ((const char *)q)-starts;
5269 endinpos = ((const char *)e)-starts;
5270 goto utf32Error;
5271 /* The remaining input chars are ignored if the callback
5272 chooses to skip the input */
5273 }
5274 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5275 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005276
Benjamin Peterson29060642009-01-31 22:14:21 +00005277 if (ch >= 0x110000)
5278 {
5279 errmsg = "codepoint not in range(0x110000)";
5280 startinpos = ((const char *)q)-starts;
5281 endinpos = startinpos+4;
5282 goto utf32Error;
5283 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005284 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5285 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005286 q += 4;
5287 continue;
5288 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005289 if (unicode_decode_call_errorhandler(
5290 errors, &errorHandler,
5291 "utf32", errmsg,
5292 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005293 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005294 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005295 }
5296
5297 if (byteorder)
5298 *byteorder = bo;
5299
5300 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005301 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005302
5303 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005304 if (unicode_resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005305 goto onError;
5306
5307 Py_XDECREF(errorHandler);
5308 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005309 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005310
Benjamin Peterson29060642009-01-31 22:14:21 +00005311 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005312 Py_DECREF(unicode);
5313 Py_XDECREF(errorHandler);
5314 Py_XDECREF(exc);
5315 return NULL;
5316}
5317
5318PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005319_PyUnicode_EncodeUTF32(PyObject *str,
5320 const char *errors,
5321 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005322{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005323 int kind;
5324 void *data;
5325 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005326 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005327 unsigned char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005328 Py_ssize_t nsize, bytesize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005329 /* Offsets from p for storing byte pairs in the right order. */
5330#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5331 int iorder[] = {0, 1, 2, 3};
5332#else
5333 int iorder[] = {3, 2, 1, 0};
5334#endif
5335
Benjamin Peterson29060642009-01-31 22:14:21 +00005336#define STORECHAR(CH) \
5337 do { \
5338 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5339 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5340 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5341 p[iorder[0]] = (CH) & 0xff; \
5342 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005343 } while(0)
5344
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005345 if (!PyUnicode_Check(str)) {
5346 PyErr_BadArgument();
5347 return NULL;
5348 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005349 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005350 return NULL;
5351 kind = PyUnicode_KIND(str);
5352 data = PyUnicode_DATA(str);
5353 len = PyUnicode_GET_LENGTH(str);
5354
5355 nsize = len + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005356 bytesize = nsize * 4;
5357 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005358 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005359 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005360 if (v == NULL)
5361 return NULL;
5362
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005363 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005364 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005365 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005366 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005367 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005368
5369 if (byteorder == -1) {
5370 /* force LE */
5371 iorder[0] = 0;
5372 iorder[1] = 1;
5373 iorder[2] = 2;
5374 iorder[3] = 3;
5375 }
5376 else if (byteorder == 1) {
5377 /* force BE */
5378 iorder[0] = 3;
5379 iorder[1] = 2;
5380 iorder[2] = 1;
5381 iorder[3] = 0;
5382 }
5383
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005384 for (i = 0; i < len; i++)
5385 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005386
5387 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005388 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005389#undef STORECHAR
5390}
5391
Alexander Belopolsky40018472011-02-26 01:02:56 +00005392PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005393PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5394 Py_ssize_t size,
5395 const char *errors,
5396 int byteorder)
5397{
5398 PyObject *result;
5399 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5400 if (tmp == NULL)
5401 return NULL;
5402 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5403 Py_DECREF(tmp);
5404 return result;
5405}
5406
5407PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005408PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005409{
Victor Stinnerb960b342011-11-20 19:12:52 +01005410 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005411}
5412
Guido van Rossumd57fd912000-03-10 22:53:23 +00005413/* --- UTF-16 Codec ------------------------------------------------------- */
5414
Tim Peters772747b2001-08-09 22:21:55 +00005415PyObject *
5416PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005417 Py_ssize_t size,
5418 const char *errors,
5419 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005420{
Walter Dörwald69652032004-09-07 20:24:22 +00005421 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5422}
5423
/* Masks for fast checking of whether a C 'long' may contain
   UTF16-encoded surrogate characters. This is an efficient heuristic,
   assuming that non-surrogate characters with a code point >= 0x8000 are
   rare in most input.
   FAST_CHAR_MASK is used when the input is in native byte ordering,
   SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
   STRIPPED_MASK selects the low byte of every 16-bit unit; the decoder
   uses it to byte-swap a whole 'long' of byteswapped input at once.
*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005431#if (SIZEOF_LONG == 8)
5432# define FAST_CHAR_MASK 0x8000800080008000L
5433# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
Victor Stinnerafb52052012-04-05 22:54:49 +02005434# define STRIPPED_MASK 0x00FF00FF00FF00FFL
Antoine Pitrouab868312009-01-10 15:40:25 +00005435#elif (SIZEOF_LONG == 4)
5436# define FAST_CHAR_MASK 0x80008000L
5437# define SWAPPED_FAST_CHAR_MASK 0x00800080L
Victor Stinnerafb52052012-04-05 22:54:49 +02005438# define STRIPPED_MASK 0x00FF00FFL
Antoine Pitrouab868312009-01-10 15:40:25 +00005439#else
5440# error C 'long' size should be either 4 or 8!
5441#endif
5442
Walter Dörwald69652032004-09-07 20:24:22 +00005443PyObject *
5444PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005445 Py_ssize_t size,
5446 const char *errors,
5447 int *byteorder,
5448 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005449{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005450 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005451 Py_ssize_t startinpos;
5452 Py_ssize_t endinpos;
5453 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005454 PyObject *unicode;
Antoine Pitrouab868312009-01-10 15:40:25 +00005455 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005456 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005457 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005458 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005459 /* Offsets from q for retrieving byte pairs in the right order. */
5460#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5461 int ihi = 1, ilo = 0;
5462#else
5463 int ihi = 0, ilo = 1;
5464#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005465 PyObject *errorHandler = NULL;
5466 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005467
5468 /* Note: size will always be longer than the resulting Unicode
5469 character count */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005470 unicode = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005471 if (!unicode)
5472 return NULL;
5473 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005474 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005475 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005476
Tim Peters772747b2001-08-09 22:21:55 +00005477 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005478 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005479
5480 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005481 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005482
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005483 /* Check for BOM marks (U+FEFF) in the input and adjust current
5484 byte order setting accordingly. In native mode, the leading BOM
5485 mark is skipped, in all other modes, it is copied to the output
5486 stream as-is (giving a ZWNBSP character). */
5487 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005488 if (size >= 2) {
Victor Stinner24729f32011-11-10 20:31:37 +01005489 const Py_UCS4 bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005490#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005491 if (bom == 0xFEFF) {
5492 q += 2;
5493 bo = -1;
5494 }
5495 else if (bom == 0xFFFE) {
5496 q += 2;
5497 bo = 1;
5498 }
Tim Petersced69f82003-09-16 20:30:58 +00005499#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005500 if (bom == 0xFEFF) {
5501 q += 2;
5502 bo = 1;
5503 }
5504 else if (bom == 0xFFFE) {
5505 q += 2;
5506 bo = -1;
5507 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005508#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005509 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005510 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005511
Tim Peters772747b2001-08-09 22:21:55 +00005512 if (bo == -1) {
5513 /* force LE */
5514 ihi = 1;
5515 ilo = 0;
5516 }
5517 else if (bo == 1) {
5518 /* force BE */
5519 ihi = 0;
5520 ilo = 1;
5521 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005522#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5523 native_ordering = ilo < ihi;
5524#else
5525 native_ordering = ilo > ihi;
5526#endif
Tim Peters772747b2001-08-09 22:21:55 +00005527
Antoine Pitrouab868312009-01-10 15:40:25 +00005528 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005529 while (q < e) {
Victor Stinner24729f32011-11-10 20:31:37 +01005530 Py_UCS4 ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005531 /* First check for possible aligned read of a C 'long'. Unaligned
5532 reads are more expensive, better to defer to another iteration. */
5533 if (!((size_t) q & LONG_PTR_MASK)) {
5534 /* Fast path for runs of non-surrogate chars. */
5535 register const unsigned char *_q = q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005536 int kind = PyUnicode_KIND(unicode);
5537 void *data = PyUnicode_DATA(unicode);
5538 while (_q < aligned_end) {
Victor Stinnerafb52052012-04-05 22:54:49 +02005539 unsigned long block = * (unsigned long *) _q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005540 Py_UCS4 maxch;
5541 if (native_ordering) {
5542 /* Can use buffer directly */
Victor Stinnerafb52052012-04-05 22:54:49 +02005543 if (block & FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005544 break;
Antoine Pitrouab868312009-01-10 15:40:25 +00005545 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005546 else {
5547 /* Need to byte-swap */
Victor Stinnerafb52052012-04-05 22:54:49 +02005548 if (block & SWAPPED_FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005549 break;
Victor Stinnerafb52052012-04-05 22:54:49 +02005550 block = ((block >> 8) & STRIPPED_MASK) |
5551 ((block & STRIPPED_MASK) << 8);
Antoine Pitrouab868312009-01-10 15:40:25 +00005552 }
Victor Stinnerafb52052012-04-05 22:54:49 +02005553 maxch = (Py_UCS2)(block & 0xFFFF);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005554#if SIZEOF_LONG == 8
Victor Stinnerafb52052012-04-05 22:54:49 +02005555 ch = (Py_UCS2)((block >> 16) & 0xFFFF);
5556 maxch = Py_MAX(maxch, ch);
5557 ch = (Py_UCS2)((block >> 32) & 0xFFFF);
5558 maxch = Py_MAX(maxch, ch);
5559 ch = (Py_UCS2)(block >> 48);
5560 maxch = Py_MAX(maxch, ch);
5561#else
5562 ch = (Py_UCS2)(block >> 16);
5563 maxch = Py_MAX(maxch, ch);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005564#endif
5565 if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
5566 if (unicode_widen(&unicode, maxch) < 0)
5567 goto onError;
5568 kind = PyUnicode_KIND(unicode);
5569 data = PyUnicode_DATA(unicode);
5570 }
Victor Stinnerafb52052012-04-05 22:54:49 +02005571#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5572 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block & 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005573#if SIZEOF_LONG == 8
Victor Stinnerafb52052012-04-05 22:54:49 +02005574 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 16) & 0xFFFF));
5575 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 32) & 0xFFFF));
5576 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 48)));
5577#else
5578 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block >> 16));
5579#endif
5580#else
5581#if SIZEOF_LONG == 8
5582 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 48)));
5583 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 32) & 0xFFFF));
5584 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 16) & 0xFFFF));
5585#else
5586 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block >> 16));
5587#endif
5588 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block & 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005589#endif
5590 _q += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00005591 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005592 q = _q;
5593 if (q >= e)
5594 break;
5595 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005596 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005597
Benjamin Peterson14339b62009-01-31 16:36:08 +00005598 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005599
Victor Stinner551ac952011-11-29 22:58:13 +01005600 if (!Py_UNICODE_IS_SURROGATE(ch)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005601 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5602 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005603 continue;
5604 }
5605
5606 /* UTF-16 code pair: */
5607 if (q > e) {
5608 errmsg = "unexpected end of data";
5609 startinpos = (((const char *)q) - 2) - starts;
5610 endinpos = ((const char *)e) + 1 - starts;
5611 goto utf16Error;
5612 }
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005613 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
5614 Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
Benjamin Peterson29060642009-01-31 22:14:21 +00005615 q += 2;
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005616 if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
Victor Stinner62aa4d02011-11-09 00:03:45 +01005617 if (unicode_putchar(&unicode, &outpos,
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005618 Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005619 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005620 continue;
5621 }
5622 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005623 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005624 startinpos = (((const char *)q)-4)-starts;
5625 endinpos = startinpos+2;
5626 goto utf16Error;
5627 }
5628
Benjamin Peterson14339b62009-01-31 16:36:08 +00005629 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005630 errmsg = "illegal encoding";
5631 startinpos = (((const char *)q)-2)-starts;
5632 endinpos = startinpos+2;
5633 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005634
Benjamin Peterson29060642009-01-31 22:14:21 +00005635 utf16Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005636 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005637 errors,
5638 &errorHandler,
5639 "utf16", errmsg,
5640 &starts,
5641 (const char **)&e,
5642 &startinpos,
5643 &endinpos,
5644 &exc,
5645 (const char **)&q,
5646 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005647 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005648 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005649 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005650 /* remaining byte at the end? (size should be even) */
5651 if (e == q) {
5652 if (!consumed) {
5653 errmsg = "truncated data";
5654 startinpos = ((const char *)q) - starts;
5655 endinpos = ((const char *)e) + 1 - starts;
Antoine Pitrouab868312009-01-10 15:40:25 +00005656 if (unicode_decode_call_errorhandler(
5657 errors,
5658 &errorHandler,
5659 "utf16", errmsg,
5660 &starts,
5661 (const char **)&e,
5662 &startinpos,
5663 &endinpos,
5664 &exc,
5665 (const char **)&q,
5666 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005667 &outpos))
Antoine Pitrouab868312009-01-10 15:40:25 +00005668 goto onError;
5669 /* The remaining input chars are ignored if the callback
5670 chooses to skip the input */
5671 }
5672 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005673
5674 if (byteorder)
5675 *byteorder = bo;
5676
Walter Dörwald69652032004-09-07 20:24:22 +00005677 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005678 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005679
Guido van Rossumd57fd912000-03-10 22:53:23 +00005680 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005681 if (unicode_resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005682 goto onError;
5683
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005684 Py_XDECREF(errorHandler);
5685 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005686 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005687
Benjamin Peterson29060642009-01-31 22:14:21 +00005688 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005689 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005690 Py_XDECREF(errorHandler);
5691 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005692 return NULL;
5693}
5694
Antoine Pitrouab868312009-01-10 15:40:25 +00005695#undef FAST_CHAR_MASK
5696#undef SWAPPED_FAST_CHAR_MASK
5697
Tim Peters772747b2001-08-09 22:21:55 +00005698PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005699_PyUnicode_EncodeUTF16(PyObject *str,
5700 const char *errors,
5701 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005702{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005703 int kind;
5704 void *data;
5705 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005706 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005707 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005708 Py_ssize_t nsize, bytesize;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005709 Py_ssize_t i, pairs;
Tim Peters772747b2001-08-09 22:21:55 +00005710 /* Offsets from p for storing byte pairs in the right order. */
5711#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5712 int ihi = 1, ilo = 0;
5713#else
5714 int ihi = 0, ilo = 1;
5715#endif
5716
Benjamin Peterson29060642009-01-31 22:14:21 +00005717#define STORECHAR(CH) \
5718 do { \
5719 p[ihi] = ((CH) >> 8) & 0xff; \
5720 p[ilo] = (CH) & 0xff; \
5721 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005722 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005723
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005724 if (!PyUnicode_Check(str)) {
5725 PyErr_BadArgument();
5726 return NULL;
5727 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005728 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005729 return NULL;
5730 kind = PyUnicode_KIND(str);
5731 data = PyUnicode_DATA(str);
5732 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005733
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005734 pairs = 0;
5735 if (kind == PyUnicode_4BYTE_KIND)
5736 for (i = 0; i < len; i++)
5737 if (PyUnicode_READ(kind, data, i) >= 0x10000)
5738 pairs++;
5739 /* 2 * (len + pairs + (byteorder == 0)) */
5740 if (len > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005741 return PyErr_NoMemory();
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005742 nsize = len + pairs + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005743 bytesize = nsize * 2;
5744 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005745 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005746 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005747 if (v == NULL)
5748 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005749
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005750 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005751 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005752 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005753 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005754 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005755
5756 if (byteorder == -1) {
5757 /* force LE */
5758 ihi = 1;
5759 ilo = 0;
5760 }
5761 else if (byteorder == 1) {
5762 /* force BE */
5763 ihi = 0;
5764 ilo = 1;
5765 }
5766
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005767 for (i = 0; i < len; i++) {
5768 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5769 Py_UCS4 ch2 = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +00005770 if (ch >= 0x10000) {
Victor Stinner551ac952011-11-29 22:58:13 +01005771 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
5772 ch = Py_UNICODE_HIGH_SURROGATE(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00005773 }
Tim Peters772747b2001-08-09 22:21:55 +00005774 STORECHAR(ch);
5775 if (ch2)
5776 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005777 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005778
5779 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005780 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005781#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005782}
5783
Alexander Belopolsky40018472011-02-26 01:02:56 +00005784PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005785PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5786 Py_ssize_t size,
5787 const char *errors,
5788 int byteorder)
5789{
5790 PyObject *result;
5791 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5792 if (tmp == NULL)
5793 return NULL;
5794 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5795 Py_DECREF(tmp);
5796 return result;
5797}
5798
5799PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005800PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005801{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005802 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005803}
5804
5805/* --- Unicode Escape Codec ----------------------------------------------- */
5806
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005807/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5808 if all the escapes in the string make it still a valid ASCII string.
5809 Returns -1 if any escapes were found which cause the string to
5810 pop out of ASCII range. Otherwise returns the length of the
5811 required buffer to hold the string.
5812 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005813static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005814length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5815{
5816 const unsigned char *p = (const unsigned char *)s;
5817 const unsigned char *end = p + size;
5818 Py_ssize_t length = 0;
5819
5820 if (size < 0)
5821 return -1;
5822
5823 for (; p < end; ++p) {
5824 if (*p > 127) {
5825 /* Non-ASCII */
5826 return -1;
5827 }
5828 else if (*p != '\\') {
5829 /* Normal character */
5830 ++length;
5831 }
5832 else {
5833 /* Backslash-escape, check next char */
5834 ++p;
5835 /* Escape sequence reaches till end of string or
5836 non-ASCII follow-up. */
5837 if (p >= end || *p > 127)
5838 return -1;
5839 switch (*p) {
5840 case '\n':
5841 /* backslash + \n result in zero characters */
5842 break;
5843 case '\\': case '\'': case '\"':
5844 case 'b': case 'f': case 't':
5845 case 'n': case 'r': case 'v': case 'a':
5846 ++length;
5847 break;
5848 case '0': case '1': case '2': case '3':
5849 case '4': case '5': case '6': case '7':
5850 case 'x': case 'u': case 'U': case 'N':
5851 /* these do not guarantee ASCII characters */
5852 return -1;
5853 default:
5854 /* count the backslash + the other character */
5855 length += 2;
5856 }
5857 }
5858 }
5859 return length;
5860}
5861
Fredrik Lundh06d12682001-01-24 07:59:11 +00005862static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005863
Alexander Belopolsky40018472011-02-26 01:02:56 +00005864PyObject *
5865PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005866 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005867 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005868{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005869 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005870 Py_ssize_t startinpos;
5871 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005872 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005873 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005874 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005875 char* message;
5876 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005877 PyObject *errorHandler = NULL;
5878 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005879 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005880 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005881
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005882 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005883
5884 /* After length_of_escaped_ascii_string() there are two alternatives,
5885 either the string is pure ASCII with named escapes like \n, etc.
5886 and we determined it's exact size (common case)
5887 or it contains \x, \u, ... escape sequences. then we create a
5888 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005889 if (len >= 0) {
5890 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005891 if (!v)
5892 goto onError;
5893 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005894 }
5895 else {
5896 /* Escaped strings will always be longer than the resulting
5897 Unicode string, so we start with size here and then reduce the
5898 length after conversion to the true value.
5899 (but if the error callback returns a long replacement string
5900 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005901 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005902 if (!v)
5903 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005904 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005905 }
5906
Guido van Rossumd57fd912000-03-10 22:53:23 +00005907 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005908 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005909 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005910 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005911
Guido van Rossumd57fd912000-03-10 22:53:23 +00005912 while (s < end) {
5913 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005914 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005915 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005917 /* The only case in which i == ascii_length is a backslash
5918 followed by a newline. */
5919 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005920
Guido van Rossumd57fd912000-03-10 22:53:23 +00005921 /* Non-escape characters are interpreted as Unicode ordinals */
5922 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005923 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5924 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005925 continue;
5926 }
5927
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005928 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005929 /* \ - Escapes */
5930 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005931 c = *s++;
5932 if (s > end)
5933 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005934
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005935 /* The only case in which i == ascii_length is a backslash
5936 followed by a newline. */
5937 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005938
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005939 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005940
Benjamin Peterson29060642009-01-31 22:14:21 +00005941 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005942#define WRITECHAR(ch) \
5943 do { \
5944 if (unicode_putchar(&v, &i, ch) < 0) \
5945 goto onError; \
5946 }while(0)
5947
Guido van Rossumd57fd912000-03-10 22:53:23 +00005948 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005949 case '\\': WRITECHAR('\\'); break;
5950 case '\'': WRITECHAR('\''); break;
5951 case '\"': WRITECHAR('\"'); break;
5952 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005953 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005954 case 'f': WRITECHAR('\014'); break;
5955 case 't': WRITECHAR('\t'); break;
5956 case 'n': WRITECHAR('\n'); break;
5957 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005958 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005959 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005960 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005961 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005962
Benjamin Peterson29060642009-01-31 22:14:21 +00005963 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005964 case '0': case '1': case '2': case '3':
5965 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005966 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005967 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005968 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005969 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005970 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005971 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005972 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005973 break;
5974
Benjamin Peterson29060642009-01-31 22:14:21 +00005975 /* hex escapes */
5976 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005977 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005978 digits = 2;
5979 message = "truncated \\xXX escape";
5980 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005981
Benjamin Peterson29060642009-01-31 22:14:21 +00005982 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005983 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005984 digits = 4;
5985 message = "truncated \\uXXXX escape";
5986 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987
Benjamin Peterson29060642009-01-31 22:14:21 +00005988 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005989 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005990 digits = 8;
5991 message = "truncated \\UXXXXXXXX escape";
5992 hexescape:
5993 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005994 if (s+digits>end) {
5995 endinpos = size;
5996 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005997 errors, &errorHandler,
5998 "unicodeescape", "end of string in escape sequence",
5999 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006000 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006001 goto onError;
6002 goto nextByte;
6003 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006004 for (j = 0; j < digits; ++j) {
6005 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00006006 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006007 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006008 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006009 errors, &errorHandler,
6010 "unicodeescape", message,
6011 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006012 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00006013 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006014 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006015 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00006016 }
6017 chr = (chr<<4) & ~0xF;
6018 if (c >= '0' && c <= '9')
6019 chr += c - '0';
6020 else if (c >= 'a' && c <= 'f')
6021 chr += 10 + c - 'a';
6022 else
6023 chr += 10 + c - 'A';
6024 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006025 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00006026 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006027 /* _decoding_error will have already written into the
6028 target buffer. */
6029 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006030 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00006031 /* when we get here, chr is a 32-bit unicode character */
Victor Stinner8faf8212011-12-08 22:14:11 +01006032 if (chr <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006033 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00006034 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006035 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006036 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006037 errors, &errorHandler,
6038 "unicodeescape", "illegal Unicode character",
6039 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006040 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00006041 goto onError;
6042 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006043 break;
6044
Benjamin Peterson29060642009-01-31 22:14:21 +00006045 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006046 case 'N':
6047 message = "malformed \\N character escape";
6048 if (ucnhash_CAPI == NULL) {
6049 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006050 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6051 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00006052 if (ucnhash_CAPI == NULL)
6053 goto ucnhashError;
6054 }
6055 if (*s == '{') {
6056 const char *start = s+1;
6057 /* look for the closing brace */
6058 while (*s != '}' && s < end)
6059 s++;
6060 if (s > start && s < end && *s == '}') {
6061 /* found a name. look it up in the unicode database */
6062 message = "unknown Unicode character name";
6063 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006064 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03006065 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00006066 goto store;
6067 }
6068 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006069 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006070 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006071 errors, &errorHandler,
6072 "unicodeescape", message,
6073 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006074 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00006075 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006076 break;
6077
6078 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00006079 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006080 message = "\\ at end of string";
6081 s--;
6082 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006083 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006084 errors, &errorHandler,
6085 "unicodeescape", message,
6086 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006087 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00006088 goto onError;
6089 }
6090 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006091 WRITECHAR('\\');
6092 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00006093 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006094 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006095 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006096 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006097 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006098 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006099#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006100
Victor Stinner16e6a802011-12-12 13:24:15 +01006101 if (unicode_resize(&v, i) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006102 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006103 Py_XDECREF(errorHandler);
6104 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006105 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00006106
Benjamin Peterson29060642009-01-31 22:14:21 +00006107 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00006108 PyErr_SetString(
6109 PyExc_UnicodeError,
6110 "\\N escapes not supported (can't load unicodedata module)"
6111 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006112 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006113 Py_XDECREF(errorHandler);
6114 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00006115 return NULL;
6116
Benjamin Peterson29060642009-01-31 22:14:21 +00006117 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006118 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006119 Py_XDECREF(errorHandler);
6120 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006121 return NULL;
6122}
6123
6124/* Return a Unicode-Escape string version of the Unicode object.
6125
6126 If quotes is true, the string is enclosed in u"" or u'' quotes as
6127 appropriate.
6128
6129*/
6130
Alexander Belopolsky40018472011-02-26 01:02:56 +00006131PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006132PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006133{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006134 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006135 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006136 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006137 int kind;
6138 void *data;
6139 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006140
Thomas Wouters89f507f2006-12-13 04:49:30 +00006141 /* Initial allocation is based on the longest-possible unichr
6142 escape.
6143
6144 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
6145 unichr, so in this case it's the longest unichr escape. In
6146 narrow (UTF-16) builds this is five chars per source unichr
6147 since there are two unichrs in the surrogate pair, so in narrow
6148 (UTF-16) builds it's not the longest unichr escape.
6149
6150 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
6151 so in the narrow (UTF-16) build case it's the longest unichr
6152 escape.
6153 */
6154
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006155 if (!PyUnicode_Check(unicode)) {
6156 PyErr_BadArgument();
6157 return NULL;
6158 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006159 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006160 return NULL;
6161 len = PyUnicode_GET_LENGTH(unicode);
6162 kind = PyUnicode_KIND(unicode);
6163 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06006164 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006165 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
6166 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
6167 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
6168 }
6169
6170 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006171 return PyBytes_FromStringAndSize(NULL, 0);
6172
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006173 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006174 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006175
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006176 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00006177 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006178 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00006179 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006180 if (repr == NULL)
6181 return NULL;
6182
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006183 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006185 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006186 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006187
Walter Dörwald79e913e2007-05-12 11:08:06 +00006188 /* Escape backslashes */
6189 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006190 *p++ = '\\';
6191 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00006192 continue;
Tim Petersced69f82003-09-16 20:30:58 +00006193 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006194
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006195 /* Map 21-bit characters to '\U00xxxxxx' */
6196 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006197 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006198 *p++ = '\\';
6199 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006200 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
6201 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
6202 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6203 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6204 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6205 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6206 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6207 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00006208 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006209 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006210
Guido van Rossumd57fd912000-03-10 22:53:23 +00006211 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006212 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213 *p++ = '\\';
6214 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006215 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6216 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6217 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6218 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006219 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006220
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006221 /* Map special whitespace to '\t', \n', '\r' */
6222 else if (ch == '\t') {
6223 *p++ = '\\';
6224 *p++ = 't';
6225 }
6226 else if (ch == '\n') {
6227 *p++ = '\\';
6228 *p++ = 'n';
6229 }
6230 else if (ch == '\r') {
6231 *p++ = '\\';
6232 *p++ = 'r';
6233 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006234
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006235 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006236 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006237 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006238 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006239 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6240 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006241 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006242
Guido van Rossumd57fd912000-03-10 22:53:23 +00006243 /* Copy everything else as-is */
6244 else
6245 *p++ = (char) ch;
6246 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006247
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006248 assert(p - PyBytes_AS_STRING(repr) > 0);
6249 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6250 return NULL;
6251 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006252}
6253
Alexander Belopolsky40018472011-02-26 01:02:56 +00006254PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006255PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6256 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006257{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006258 PyObject *result;
6259 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6260 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006261 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006262 result = PyUnicode_AsUnicodeEscapeString(tmp);
6263 Py_DECREF(tmp);
6264 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006265}
6266
6267/* --- Raw Unicode Escape Codec ------------------------------------------- */
6268
Alexander Belopolsky40018472011-02-26 01:02:56 +00006269PyObject *
6270PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006271 Py_ssize_t size,
6272 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006273{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006274 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006275 Py_ssize_t startinpos;
6276 Py_ssize_t endinpos;
6277 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006278 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006279 const char *end;
6280 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006281 PyObject *errorHandler = NULL;
6282 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006283
Guido van Rossumd57fd912000-03-10 22:53:23 +00006284 /* Escaped strings will always be longer than the resulting
6285 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006286 length after conversion to the true value. (But decoding error
6287 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006288 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006289 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006290 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006291 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006292 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006293 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006294 end = s + size;
6295 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006296 unsigned char c;
6297 Py_UCS4 x;
6298 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006299 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006300
Benjamin Peterson29060642009-01-31 22:14:21 +00006301 /* Non-escape characters are interpreted as Unicode ordinals */
6302 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006303 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6304 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006305 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006306 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006307 startinpos = s-starts;
6308
6309 /* \u-escapes are only interpreted iff the number of leading
6310 backslashes if odd */
6311 bs = s;
6312 for (;s < end;) {
6313 if (*s != '\\')
6314 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006315 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6316 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006317 }
6318 if (((s - bs) & 1) == 0 ||
6319 s >= end ||
6320 (*s != 'u' && *s != 'U')) {
6321 continue;
6322 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006323 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006324 count = *s=='u' ? 4 : 8;
6325 s++;
6326
6327 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006328 for (x = 0, i = 0; i < count; ++i, ++s) {
6329 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006330 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006331 endinpos = s-starts;
6332 if (unicode_decode_call_errorhandler(
6333 errors, &errorHandler,
6334 "rawunicodeescape", "truncated \\uXXXX",
6335 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006336 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006337 goto onError;
6338 goto nextByte;
6339 }
6340 x = (x<<4) & ~0xF;
6341 if (c >= '0' && c <= '9')
6342 x += c - '0';
6343 else if (c >= 'a' && c <= 'f')
6344 x += 10 + c - 'a';
6345 else
6346 x += 10 + c - 'A';
6347 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006348 if (x <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006349 if (unicode_putchar(&v, &outpos, x) < 0)
6350 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006351 } else {
6352 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006353 if (unicode_decode_call_errorhandler(
6354 errors, &errorHandler,
6355 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006356 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006357 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006358 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006359 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006360 nextByte:
6361 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006362 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006363 if (unicode_resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006364 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006365 Py_XDECREF(errorHandler);
6366 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006367 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00006368
Benjamin Peterson29060642009-01-31 22:14:21 +00006369 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006370 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006371 Py_XDECREF(errorHandler);
6372 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006373 return NULL;
6374}
6375
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006376
Alexander Belopolsky40018472011-02-26 01:02:56 +00006377PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006378PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006379{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006380 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006381 char *p;
6382 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006383 Py_ssize_t expandsize, pos;
6384 int kind;
6385 void *data;
6386 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006387
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006388 if (!PyUnicode_Check(unicode)) {
6389 PyErr_BadArgument();
6390 return NULL;
6391 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006392 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006393 return NULL;
6394 kind = PyUnicode_KIND(unicode);
6395 data = PyUnicode_DATA(unicode);
6396 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006397 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6398 bytes, and 1 byte characters 4. */
6399 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006400
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006401 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006402 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006403
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006404 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006405 if (repr == NULL)
6406 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006407 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006408 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006409
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006410 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006411 for (pos = 0; pos < len; pos++) {
6412 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006413 /* Map 32-bit characters to '\Uxxxxxxxx' */
6414 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006415 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006416 *p++ = '\\';
6417 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006418 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6419 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6420 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6421 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6422 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6423 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6424 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6425 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006426 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006427 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006428 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006429 *p++ = '\\';
6430 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006431 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6432 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6433 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6434 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006435 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006436 /* Copy everything else as-is */
6437 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006438 *p++ = (char) ch;
6439 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006440
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006441 assert(p > q);
6442 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006443 return NULL;
6444 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006445}
6446
Alexander Belopolsky40018472011-02-26 01:02:56 +00006447PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006448PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6449 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006450{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006451 PyObject *result;
6452 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6453 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006454 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006455 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6456 Py_DECREF(tmp);
6457 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006458}
6459
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006460/* --- Unicode Internal Codec ------------------------------------------- */
6461
Alexander Belopolsky40018472011-02-26 01:02:56 +00006462PyObject *
6463_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006464 Py_ssize_t size,
6465 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006466{
6467 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006468 Py_ssize_t startinpos;
6469 Py_ssize_t endinpos;
6470 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006471 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006472 const char *end;
6473 const char *reason;
6474 PyObject *errorHandler = NULL;
6475 PyObject *exc = NULL;
6476
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006477 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006478 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006479 1))
6480 return NULL;
6481
Thomas Wouters89f507f2006-12-13 04:49:30 +00006482 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006483 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006484 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006485 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006486 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006487 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006488 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006489 end = s + size;
6490
6491 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006492 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006493 Py_UCS4 ch;
6494 /* We copy the raw representation one byte at a time because the
6495 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006496 ((char *) &uch)[0] = s[0];
6497 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006498#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006499 ((char *) &uch)[2] = s[2];
6500 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006501#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006502 ch = uch;
6503
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006504 /* We have to sanity check the raw data, otherwise doom looms for
6505 some malformed UCS-4 data. */
6506 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006507#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006508 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006509#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006510 end-s < Py_UNICODE_SIZE
6511 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006512 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006513 startinpos = s - starts;
6514 if (end-s < Py_UNICODE_SIZE) {
6515 endinpos = end-starts;
6516 reason = "truncated input";
6517 }
6518 else {
6519 endinpos = s - starts + Py_UNICODE_SIZE;
6520 reason = "illegal code point (> 0x10FFFF)";
6521 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006522 if (unicode_decode_call_errorhandler(
6523 errors, &errorHandler,
6524 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006525 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006526 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006527 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006528 continue;
6529 }
6530
6531 s += Py_UNICODE_SIZE;
6532#ifndef Py_UNICODE_WIDE
Victor Stinner551ac952011-11-29 22:58:13 +01006533 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006534 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006535 Py_UNICODE uch2;
6536 ((char *) &uch2)[0] = s[0];
6537 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006538 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006539 {
Victor Stinner551ac952011-11-29 22:58:13 +01006540 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006541 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006542 }
6543 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006544#endif
6545
6546 if (unicode_putchar(&v, &outpos, ch) < 0)
6547 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006548 }
6549
Victor Stinner16e6a802011-12-12 13:24:15 +01006550 if (unicode_resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006551 goto onError;
6552 Py_XDECREF(errorHandler);
6553 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006554 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006555
Benjamin Peterson29060642009-01-31 22:14:21 +00006556 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006557 Py_XDECREF(v);
6558 Py_XDECREF(errorHandler);
6559 Py_XDECREF(exc);
6560 return NULL;
6561}
6562
Guido van Rossumd57fd912000-03-10 22:53:23 +00006563/* --- Latin-1 Codec ------------------------------------------------------ */
6564
Alexander Belopolsky40018472011-02-26 01:02:56 +00006565PyObject *
6566PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006567 Py_ssize_t size,
6568 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006569{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006570 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006571 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006572}
6573
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006574/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006575static void
6576make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006577 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006578 PyObject *unicode,
6579 Py_ssize_t startpos, Py_ssize_t endpos,
6580 const char *reason)
6581{
6582 if (*exceptionObject == NULL) {
6583 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006584 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006585 encoding, unicode, startpos, endpos, reason);
6586 }
6587 else {
6588 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6589 goto onError;
6590 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6591 goto onError;
6592 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6593 goto onError;
6594 return;
6595 onError:
6596 Py_DECREF(*exceptionObject);
6597 *exceptionObject = NULL;
6598 }
6599}
6600
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006601/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006602static void
6603raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006604 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006605 PyObject *unicode,
6606 Py_ssize_t startpos, Py_ssize_t endpos,
6607 const char *reason)
6608{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006609 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006610 encoding, unicode, startpos, endpos, reason);
6611 if (*exceptionObject != NULL)
6612 PyCodec_StrictErrors(*exceptionObject);
6613}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006614
6615/* error handling callback helper:
6616 build arguments, call the callback and check the arguments,
6617 put the result into newpos and return the replacement string, which
6618 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006619static PyObject *
6620unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006621 PyObject **errorHandler,
6622 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006623 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006624 Py_ssize_t startpos, Py_ssize_t endpos,
6625 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006626{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006627 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006628 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006629 PyObject *restuple;
6630 PyObject *resunicode;
6631
6632 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006633 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006634 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006635 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006636 }
6637
Benjamin Petersonbac79492012-01-14 13:34:47 -05006638 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006639 return NULL;
6640 len = PyUnicode_GET_LENGTH(unicode);
6641
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006642 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006643 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006644 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006645 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006646
6647 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006648 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006649 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006650 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006651 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006652 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006653 Py_DECREF(restuple);
6654 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006655 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006656 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006657 &resunicode, newpos)) {
6658 Py_DECREF(restuple);
6659 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006660 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006661 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6662 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6663 Py_DECREF(restuple);
6664 return NULL;
6665 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006666 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006667 *newpos = len + *newpos;
6668 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006669 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6670 Py_DECREF(restuple);
6671 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006672 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006673 Py_INCREF(resunicode);
6674 Py_DECREF(restuple);
6675 return resunicode;
6676}
6677
Alexander Belopolsky40018472011-02-26 01:02:56 +00006678static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006679unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006680 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006681 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006682{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006683 /* input state */
6684 Py_ssize_t pos=0, size;
6685 int kind;
6686 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006687 /* output object */
6688 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006689 /* pointer into the output */
6690 char *str;
6691 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006692 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006693 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6694 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006695 PyObject *errorHandler = NULL;
6696 PyObject *exc = NULL;
6697 /* the following variable is used for caching string comparisons
6698 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6699 int known_errorHandler = -1;
6700
Benjamin Petersonbac79492012-01-14 13:34:47 -05006701 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006702 return NULL;
6703 size = PyUnicode_GET_LENGTH(unicode);
6704 kind = PyUnicode_KIND(unicode);
6705 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006706 /* allocate enough for a simple encoding without
6707 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006708 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006709 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006710 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006711 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006712 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006713 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006714 ressize = size;
6715
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006716 while (pos < size) {
6717 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006718
Benjamin Peterson29060642009-01-31 22:14:21 +00006719 /* can we encode this? */
6720 if (c<limit) {
6721 /* no overflow check, because we know that the space is enough */
6722 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006723 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006724 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006725 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006726 Py_ssize_t requiredsize;
6727 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006728 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006729 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006730 Py_ssize_t collstart = pos;
6731 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006732 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006733 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006734 ++collend;
6735 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6736 if (known_errorHandler==-1) {
6737 if ((errors==NULL) || (!strcmp(errors, "strict")))
6738 known_errorHandler = 1;
6739 else if (!strcmp(errors, "replace"))
6740 known_errorHandler = 2;
6741 else if (!strcmp(errors, "ignore"))
6742 known_errorHandler = 3;
6743 else if (!strcmp(errors, "xmlcharrefreplace"))
6744 known_errorHandler = 4;
6745 else
6746 known_errorHandler = 0;
6747 }
6748 switch (known_errorHandler) {
6749 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006750 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006751 goto onError;
6752 case 2: /* replace */
6753 while (collstart++<collend)
6754 *str++ = '?'; /* fall through */
6755 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006756 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006757 break;
6758 case 4: /* xmlcharrefreplace */
6759 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006760 /* determine replacement size */
6761 for (i = collstart, repsize = 0; i < collend; ++i) {
6762 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6763 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006764 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006765 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006766 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006767 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006768 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006769 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006770 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006771 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006772 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006773 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006774 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006775 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006776 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006777 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006778 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006779 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006780 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006781 if (requiredsize > ressize) {
6782 if (requiredsize<2*ressize)
6783 requiredsize = 2*ressize;
6784 if (_PyBytes_Resize(&res, requiredsize))
6785 goto onError;
6786 str = PyBytes_AS_STRING(res) + respos;
6787 ressize = requiredsize;
6788 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006789 /* generate replacement */
6790 for (i = collstart; i < collend; ++i) {
6791 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006792 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006793 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006794 break;
6795 default:
6796 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006797 encoding, reason, unicode, &exc,
6798 collstart, collend, &newpos);
6799 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006800 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006801 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006802 if (PyBytes_Check(repunicode)) {
6803 /* Directly copy bytes result to output. */
6804 repsize = PyBytes_Size(repunicode);
6805 if (repsize > 1) {
6806 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006807 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006808 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6809 Py_DECREF(repunicode);
6810 goto onError;
6811 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006812 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006813 ressize += repsize-1;
6814 }
6815 memcpy(str, PyBytes_AsString(repunicode), repsize);
6816 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006817 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006818 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006819 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006820 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006821 /* need more space? (at least enough for what we
6822 have+the replacement+the rest of the string, so
6823 we won't have to check space for encodable characters) */
6824 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006825 repsize = PyUnicode_GET_LENGTH(repunicode);
6826 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006827 if (requiredsize > ressize) {
6828 if (requiredsize<2*ressize)
6829 requiredsize = 2*ressize;
6830 if (_PyBytes_Resize(&res, requiredsize)) {
6831 Py_DECREF(repunicode);
6832 goto onError;
6833 }
6834 str = PyBytes_AS_STRING(res) + respos;
6835 ressize = requiredsize;
6836 }
6837 /* check if there is anything unencodable in the replacement
6838 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006839 for (i = 0; repsize-->0; ++i, ++str) {
6840 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006841 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006842 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006843 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006844 Py_DECREF(repunicode);
6845 goto onError;
6846 }
6847 *str = (char)c;
6848 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006849 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006850 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006851 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006852 }
6853 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006854 /* Resize if we allocated to much */
6855 size = str - PyBytes_AS_STRING(res);
6856 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006857 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006858 if (_PyBytes_Resize(&res, size) < 0)
6859 goto onError;
6860 }
6861
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006862 Py_XDECREF(errorHandler);
6863 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006864 return res;
6865
6866 onError:
6867 Py_XDECREF(res);
6868 Py_XDECREF(errorHandler);
6869 Py_XDECREF(exc);
6870 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006871}
6872
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006873/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006874PyObject *
6875PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006876 Py_ssize_t size,
6877 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006878{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006879 PyObject *result;
6880 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6881 if (unicode == NULL)
6882 return NULL;
6883 result = unicode_encode_ucs1(unicode, errors, 256);
6884 Py_DECREF(unicode);
6885 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006886}
6887
Alexander Belopolsky40018472011-02-26 01:02:56 +00006888PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006889_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006890{
6891 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006892 PyErr_BadArgument();
6893 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006894 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006895 if (PyUnicode_READY(unicode) == -1)
6896 return NULL;
6897 /* Fast path: if it is a one-byte string, construct
6898 bytes object directly. */
6899 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6900 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6901 PyUnicode_GET_LENGTH(unicode));
6902 /* Non-Latin-1 characters present. Defer to above function to
6903 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006904 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006905}
6906
6907PyObject*
6908PyUnicode_AsLatin1String(PyObject *unicode)
6909{
6910 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006911}
6912
6913/* --- 7-bit ASCII Codec -------------------------------------------------- */
6914
Alexander Belopolsky40018472011-02-26 01:02:56 +00006915PyObject *
6916PyUnicode_DecodeASCII(const char *s,
6917 Py_ssize_t size,
6918 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006919{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006920 const char *starts = s;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006921 PyObject *v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006922 int kind;
6923 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006924 Py_ssize_t startinpos;
6925 Py_ssize_t endinpos;
6926 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006927 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006928 int has_error;
6929 const unsigned char *p = (const unsigned char *)s;
6930 const unsigned char *end = p + size;
6931 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006932 PyObject *errorHandler = NULL;
6933 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006934
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006935 if (size == 0) {
6936 Py_INCREF(unicode_empty);
6937 return unicode_empty;
6938 }
6939
Guido van Rossumd57fd912000-03-10 22:53:23 +00006940 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006941 if (size == 1 && (unsigned char)s[0] < 128)
6942 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006943
Victor Stinner702c7342011-10-05 13:50:52 +02006944 has_error = 0;
6945 while (p < end && !has_error) {
6946 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6947 an explanation. */
6948 if (!((size_t) p & LONG_PTR_MASK)) {
6949 /* Help register allocation */
6950 register const unsigned char *_p = p;
6951 while (_p < aligned_end) {
6952 unsigned long value = *(unsigned long *) _p;
6953 if (value & ASCII_CHAR_MASK) {
6954 has_error = 1;
6955 break;
6956 }
6957 _p += SIZEOF_LONG;
6958 }
6959 if (_p == end)
6960 break;
6961 if (has_error)
6962 break;
6963 p = _p;
6964 }
6965 if (*p & 0x80) {
6966 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006967 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006968 }
6969 else {
6970 ++p;
6971 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006972 }
Victor Stinner702c7342011-10-05 13:50:52 +02006973 if (!has_error)
6974 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006975
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006976 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006977 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006978 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006979 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006980 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006981 kind = PyUnicode_KIND(v);
6982 data = PyUnicode_DATA(v);
6983 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006984 e = s + size;
6985 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006986 register unsigned char c = (unsigned char)*s;
6987 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006988 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006989 ++s;
6990 }
6991 else {
6992 startinpos = s-starts;
6993 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006994 if (unicode_decode_call_errorhandler(
6995 errors, &errorHandler,
6996 "ascii", "ordinal not in range(128)",
6997 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006998 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006999 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007000 kind = PyUnicode_KIND(v);
7001 data = PyUnicode_DATA(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00007002 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007003 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007004 if (unicode_resize(&v, outpos) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007005 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007006 Py_XDECREF(errorHandler);
7007 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007008 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01007009 return v;
Tim Petersced69f82003-09-16 20:30:58 +00007010
Benjamin Peterson29060642009-01-31 22:14:21 +00007011 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007012 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007013 Py_XDECREF(errorHandler);
7014 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007015 return NULL;
7016}
7017
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007018/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007019PyObject *
7020PyUnicode_EncodeASCII(const Py_UNICODE *p,
7021 Py_ssize_t size,
7022 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007023{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007024 PyObject *result;
7025 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7026 if (unicode == NULL)
7027 return NULL;
7028 result = unicode_encode_ucs1(unicode, errors, 128);
7029 Py_DECREF(unicode);
7030 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007031}
7032
Alexander Belopolsky40018472011-02-26 01:02:56 +00007033PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007034_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007035{
7036 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007037 PyErr_BadArgument();
7038 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007039 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007040 if (PyUnicode_READY(unicode) == -1)
7041 return NULL;
7042 /* Fast path: if it is an ASCII-only string, construct bytes object
7043 directly. Else defer to above function to raise the exception. */
7044 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
7045 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7046 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007047 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007048}
7049
7050PyObject *
7051PyUnicode_AsASCIIString(PyObject *unicode)
7052{
7053 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007054}
7055
Victor Stinner99b95382011-07-04 14:23:54 +02007056#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007057
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007058/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007059
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007060#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007061#define NEED_RETRY
7062#endif
7063
Victor Stinner3a50e702011-10-18 21:21:00 +02007064#ifndef WC_ERR_INVALID_CHARS
7065# define WC_ERR_INVALID_CHARS 0x0080
7066#endif
7067
7068static char*
7069code_page_name(UINT code_page, PyObject **obj)
7070{
7071 *obj = NULL;
7072 if (code_page == CP_ACP)
7073 return "mbcs";
7074 if (code_page == CP_UTF7)
7075 return "CP_UTF7";
7076 if (code_page == CP_UTF8)
7077 return "CP_UTF8";
7078
7079 *obj = PyBytes_FromFormat("cp%u", code_page);
7080 if (*obj == NULL)
7081 return NULL;
7082 return PyBytes_AS_STRING(*obj);
7083}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007084
Alexander Belopolsky40018472011-02-26 01:02:56 +00007085static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007086is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007087{
7088 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02007089 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007090
Victor Stinner3a50e702011-10-18 21:21:00 +02007091 if (!IsDBCSLeadByteEx(code_page, *curr))
7092 return 0;
7093
7094 prev = CharPrevExA(code_page, s, curr, 0);
7095 if (prev == curr)
7096 return 1;
7097 /* FIXME: This code is limited to "true" double-byte encodings,
7098 as it assumes an incomplete character consists of a single
7099 byte. */
7100 if (curr - prev == 2)
7101 return 1;
7102 if (!IsDBCSLeadByteEx(code_page, *prev))
7103 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007104 return 0;
7105}
7106
Victor Stinner3a50e702011-10-18 21:21:00 +02007107static DWORD
7108decode_code_page_flags(UINT code_page)
7109{
7110 if (code_page == CP_UTF7) {
7111 /* The CP_UTF7 decoder only supports flags=0 */
7112 return 0;
7113 }
7114 else
7115 return MB_ERR_INVALID_CHARS;
7116}
7117
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007118/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007119 * Decode a byte string from a Windows code page into unicode object in strict
7120 * mode.
7121 *
7122 * Returns consumed size if succeed, returns -2 on decode error, or raise a
7123 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007124 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007125static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007126decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007127 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007128 const char *in,
7129 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007130{
Victor Stinner3a50e702011-10-18 21:21:00 +02007131 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007132 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007133 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007134
7135 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007136 assert(insize > 0);
7137 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7138 if (outsize <= 0)
7139 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007140
7141 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007142 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007143 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007144 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007145 if (*v == NULL)
7146 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007147 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007148 }
7149 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007150 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007151 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007152 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007153 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007154 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007155 }
7156
7157 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007158 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7159 if (outsize <= 0)
7160 goto error;
7161 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007162
Victor Stinner3a50e702011-10-18 21:21:00 +02007163error:
7164 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7165 return -2;
7166 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007167 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007168}
7169
Victor Stinner3a50e702011-10-18 21:21:00 +02007170/*
7171 * Decode a byte string from a code page into unicode object with an error
7172 * handler.
7173 *
7174 * Returns consumed size if succeed, or raise a WindowsError or
7175 * UnicodeDecodeError exception and returns -1 on error.
7176 */
7177static int
7178decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007179 PyObject **v,
7180 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007181 const char *errors)
7182{
7183 const char *startin = in;
7184 const char *endin = in + size;
7185 const DWORD flags = decode_code_page_flags(code_page);
7186 /* Ideally, we should get reason from FormatMessage. This is the Windows
7187 2000 English version of the message. */
7188 const char *reason = "No mapping for the Unicode character exists "
7189 "in the target code page.";
7190 /* each step cannot decode more than 1 character, but a character can be
7191 represented as a surrogate pair */
7192 wchar_t buffer[2], *startout, *out;
7193 int insize, outsize;
7194 PyObject *errorHandler = NULL;
7195 PyObject *exc = NULL;
7196 PyObject *encoding_obj = NULL;
7197 char *encoding;
7198 DWORD err;
7199 int ret = -1;
7200
7201 assert(size > 0);
7202
7203 encoding = code_page_name(code_page, &encoding_obj);
7204 if (encoding == NULL)
7205 return -1;
7206
7207 if (errors == NULL || strcmp(errors, "strict") == 0) {
7208 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7209 UnicodeDecodeError. */
7210 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7211 if (exc != NULL) {
7212 PyCodec_StrictErrors(exc);
7213 Py_CLEAR(exc);
7214 }
7215 goto error;
7216 }
7217
7218 if (*v == NULL) {
7219 /* Create unicode object */
7220 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7221 PyErr_NoMemory();
7222 goto error;
7223 }
Victor Stinnerab595942011-12-17 04:59:06 +01007224 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007225 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007226 if (*v == NULL)
7227 goto error;
7228 startout = PyUnicode_AS_UNICODE(*v);
7229 }
7230 else {
7231 /* Extend unicode object */
7232 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7233 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7234 PyErr_NoMemory();
7235 goto error;
7236 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007237 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007238 goto error;
7239 startout = PyUnicode_AS_UNICODE(*v) + n;
7240 }
7241
7242 /* Decode the byte string character per character */
7243 out = startout;
7244 while (in < endin)
7245 {
7246 /* Decode a character */
7247 insize = 1;
7248 do
7249 {
7250 outsize = MultiByteToWideChar(code_page, flags,
7251 in, insize,
7252 buffer, Py_ARRAY_LENGTH(buffer));
7253 if (outsize > 0)
7254 break;
7255 err = GetLastError();
7256 if (err != ERROR_NO_UNICODE_TRANSLATION
7257 && err != ERROR_INSUFFICIENT_BUFFER)
7258 {
7259 PyErr_SetFromWindowsErr(0);
7260 goto error;
7261 }
7262 insize++;
7263 }
7264 /* 4=maximum length of a UTF-8 sequence */
7265 while (insize <= 4 && (in + insize) <= endin);
7266
7267 if (outsize <= 0) {
7268 Py_ssize_t startinpos, endinpos, outpos;
7269
7270 startinpos = in - startin;
7271 endinpos = startinpos + 1;
7272 outpos = out - PyUnicode_AS_UNICODE(*v);
7273 if (unicode_decode_call_errorhandler(
7274 errors, &errorHandler,
7275 encoding, reason,
7276 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007277 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007278 {
7279 goto error;
7280 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007281 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007282 }
7283 else {
7284 in += insize;
7285 memcpy(out, buffer, outsize * sizeof(wchar_t));
7286 out += outsize;
7287 }
7288 }
7289
7290 /* write a NUL character at the end */
7291 *out = 0;
7292
7293 /* Extend unicode object */
7294 outsize = out - startout;
7295 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007296 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007297 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01007298 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007299
7300error:
7301 Py_XDECREF(encoding_obj);
7302 Py_XDECREF(errorHandler);
7303 Py_XDECREF(exc);
7304 return ret;
7305}
7306
Victor Stinner3a50e702011-10-18 21:21:00 +02007307static PyObject *
7308decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007309 const char *s, Py_ssize_t size,
7310 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007311{
Victor Stinner76a31a62011-11-04 00:05:13 +01007312 PyObject *v = NULL;
7313 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007314
Victor Stinner3a50e702011-10-18 21:21:00 +02007315 if (code_page < 0) {
7316 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7317 return NULL;
7318 }
7319
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007320 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007321 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007322
Victor Stinner76a31a62011-11-04 00:05:13 +01007323 do
7324 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007325#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007326 if (size > INT_MAX) {
7327 chunk_size = INT_MAX;
7328 final = 0;
7329 done = 0;
7330 }
7331 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007332#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007333 {
7334 chunk_size = (int)size;
7335 final = (consumed == NULL);
7336 done = 1;
7337 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007338
Victor Stinner76a31a62011-11-04 00:05:13 +01007339 /* Skip trailing lead-byte unless 'final' is set */
7340 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7341 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007342
Victor Stinner76a31a62011-11-04 00:05:13 +01007343 if (chunk_size == 0 && done) {
7344 if (v != NULL)
7345 break;
7346 Py_INCREF(unicode_empty);
7347 return unicode_empty;
7348 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007349
Victor Stinner76a31a62011-11-04 00:05:13 +01007350
7351 converted = decode_code_page_strict(code_page, &v,
7352 s, chunk_size);
7353 if (converted == -2)
7354 converted = decode_code_page_errors(code_page, &v,
7355 s, chunk_size,
7356 errors);
7357 assert(converted != 0);
7358
7359 if (converted < 0) {
7360 Py_XDECREF(v);
7361 return NULL;
7362 }
7363
7364 if (consumed)
7365 *consumed += converted;
7366
7367 s += converted;
7368 size -= converted;
7369 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007370
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007371 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007372}
7373
Alexander Belopolsky40018472011-02-26 01:02:56 +00007374PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007375PyUnicode_DecodeCodePageStateful(int code_page,
7376 const char *s,
7377 Py_ssize_t size,
7378 const char *errors,
7379 Py_ssize_t *consumed)
7380{
7381 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7382}
7383
7384PyObject *
7385PyUnicode_DecodeMBCSStateful(const char *s,
7386 Py_ssize_t size,
7387 const char *errors,
7388 Py_ssize_t *consumed)
7389{
7390 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7391}
7392
7393PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007394PyUnicode_DecodeMBCS(const char *s,
7395 Py_ssize_t size,
7396 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007397{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007398 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7399}
7400
Victor Stinner3a50e702011-10-18 21:21:00 +02007401static DWORD
7402encode_code_page_flags(UINT code_page, const char *errors)
7403{
7404 if (code_page == CP_UTF8) {
7405 if (winver.dwMajorVersion >= 6)
7406 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7407 and later */
7408 return WC_ERR_INVALID_CHARS;
7409 else
7410 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7411 return 0;
7412 }
7413 else if (code_page == CP_UTF7) {
7414 /* CP_UTF7 only supports flags=0 */
7415 return 0;
7416 }
7417 else {
7418 if (errors != NULL && strcmp(errors, "replace") == 0)
7419 return 0;
7420 else
7421 return WC_NO_BEST_FIT_CHARS;
7422 }
7423}
7424
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007425/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007426 * Encode a Unicode string to a Windows code page into a byte string in strict
7427 * mode.
7428 *
7429 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7430 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007431 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007432static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007433encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007434 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007435 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007436{
Victor Stinner554f3f02010-06-16 23:33:54 +00007437 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007438 BOOL *pusedDefaultChar = &usedDefaultChar;
7439 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007440 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007441 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007442 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007443 const DWORD flags = encode_code_page_flags(code_page, NULL);
7444 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007445 /* Create a substring so that we can get the UTF-16 representation
7446 of just the slice under consideration. */
7447 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007448
Martin v. Löwis3d325192011-11-04 18:23:06 +01007449 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007450
Victor Stinner3a50e702011-10-18 21:21:00 +02007451 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007452 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007453 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007454 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007455
Victor Stinner2fc507f2011-11-04 20:06:39 +01007456 substring = PyUnicode_Substring(unicode, offset, offset+len);
7457 if (substring == NULL)
7458 return -1;
7459 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7460 if (p == NULL) {
7461 Py_DECREF(substring);
7462 return -1;
7463 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007464
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007465 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007466 outsize = WideCharToMultiByte(code_page, flags,
7467 p, size,
7468 NULL, 0,
7469 NULL, pusedDefaultChar);
7470 if (outsize <= 0)
7471 goto error;
7472 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007473 if (pusedDefaultChar && *pusedDefaultChar) {
7474 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007475 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007476 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007477
Victor Stinner3a50e702011-10-18 21:21:00 +02007478 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007479 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007480 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007481 if (*outbytes == NULL) {
7482 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007483 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007484 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007485 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007486 }
7487 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007488 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007489 const Py_ssize_t n = PyBytes_Size(*outbytes);
7490 if (outsize > PY_SSIZE_T_MAX - n) {
7491 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007492 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007493 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007494 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007495 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7496 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007497 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007498 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007499 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007500 }
7501
7502 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007503 outsize = WideCharToMultiByte(code_page, flags,
7504 p, size,
7505 out, outsize,
7506 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007507 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007508 if (outsize <= 0)
7509 goto error;
7510 if (pusedDefaultChar && *pusedDefaultChar)
7511 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007512 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007513
Victor Stinner3a50e702011-10-18 21:21:00 +02007514error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007515 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007516 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7517 return -2;
7518 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007519 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007520}
7521
Victor Stinner3a50e702011-10-18 21:21:00 +02007522/*
7523 * Encode a Unicode string to a Windows code page into a byte string using a
7524 * error handler.
7525 *
7526 * Returns consumed characters if succeed, or raise a WindowsError and returns
7527 * -1 on other error.
7528 */
7529static int
7530encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007531 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007532 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007533{
Victor Stinner3a50e702011-10-18 21:21:00 +02007534 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007535 Py_ssize_t pos = unicode_offset;
7536 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007537 /* Ideally, we should get reason from FormatMessage. This is the Windows
7538 2000 English version of the message. */
7539 const char *reason = "invalid character";
7540 /* 4=maximum length of a UTF-8 sequence */
7541 char buffer[4];
7542 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7543 Py_ssize_t outsize;
7544 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007545 PyObject *errorHandler = NULL;
7546 PyObject *exc = NULL;
7547 PyObject *encoding_obj = NULL;
7548 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007549 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007550 PyObject *rep;
7551 int ret = -1;
7552
7553 assert(insize > 0);
7554
7555 encoding = code_page_name(code_page, &encoding_obj);
7556 if (encoding == NULL)
7557 return -1;
7558
7559 if (errors == NULL || strcmp(errors, "strict") == 0) {
7560 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7561 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007562 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007563 if (exc != NULL) {
7564 PyCodec_StrictErrors(exc);
7565 Py_DECREF(exc);
7566 }
7567 Py_XDECREF(encoding_obj);
7568 return -1;
7569 }
7570
7571 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7572 pusedDefaultChar = &usedDefaultChar;
7573 else
7574 pusedDefaultChar = NULL;
7575
7576 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7577 PyErr_NoMemory();
7578 goto error;
7579 }
7580 outsize = insize * Py_ARRAY_LENGTH(buffer);
7581
7582 if (*outbytes == NULL) {
7583 /* Create string object */
7584 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7585 if (*outbytes == NULL)
7586 goto error;
7587 out = PyBytes_AS_STRING(*outbytes);
7588 }
7589 else {
7590 /* Extend string object */
7591 Py_ssize_t n = PyBytes_Size(*outbytes);
7592 if (n > PY_SSIZE_T_MAX - outsize) {
7593 PyErr_NoMemory();
7594 goto error;
7595 }
7596 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7597 goto error;
7598 out = PyBytes_AS_STRING(*outbytes) + n;
7599 }
7600
7601 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007602 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007603 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007604 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7605 wchar_t chars[2];
7606 int charsize;
7607 if (ch < 0x10000) {
7608 chars[0] = (wchar_t)ch;
7609 charsize = 1;
7610 }
7611 else {
7612 ch -= 0x10000;
7613 chars[0] = 0xd800 + (ch >> 10);
7614 chars[1] = 0xdc00 + (ch & 0x3ff);
7615 charsize = 2;
7616 }
7617
Victor Stinner3a50e702011-10-18 21:21:00 +02007618 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007619 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007620 buffer, Py_ARRAY_LENGTH(buffer),
7621 NULL, pusedDefaultChar);
7622 if (outsize > 0) {
7623 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7624 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007625 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007626 memcpy(out, buffer, outsize);
7627 out += outsize;
7628 continue;
7629 }
7630 }
7631 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7632 PyErr_SetFromWindowsErr(0);
7633 goto error;
7634 }
7635
Victor Stinner3a50e702011-10-18 21:21:00 +02007636 rep = unicode_encode_call_errorhandler(
7637 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007638 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007639 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007640 if (rep == NULL)
7641 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007642 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007643
7644 if (PyBytes_Check(rep)) {
7645 outsize = PyBytes_GET_SIZE(rep);
7646 if (outsize != 1) {
7647 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7648 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7649 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7650 Py_DECREF(rep);
7651 goto error;
7652 }
7653 out = PyBytes_AS_STRING(*outbytes) + offset;
7654 }
7655 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7656 out += outsize;
7657 }
7658 else {
7659 Py_ssize_t i;
7660 enum PyUnicode_Kind kind;
7661 void *data;
7662
Benjamin Petersonbac79492012-01-14 13:34:47 -05007663 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007664 Py_DECREF(rep);
7665 goto error;
7666 }
7667
7668 outsize = PyUnicode_GET_LENGTH(rep);
7669 if (outsize != 1) {
7670 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7671 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7672 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7673 Py_DECREF(rep);
7674 goto error;
7675 }
7676 out = PyBytes_AS_STRING(*outbytes) + offset;
7677 }
7678 kind = PyUnicode_KIND(rep);
7679 data = PyUnicode_DATA(rep);
7680 for (i=0; i < outsize; i++) {
7681 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7682 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007683 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007684 encoding, unicode,
7685 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007686 "unable to encode error handler result to ASCII");
7687 Py_DECREF(rep);
7688 goto error;
7689 }
7690 *out = (unsigned char)ch;
7691 out++;
7692 }
7693 }
7694 Py_DECREF(rep);
7695 }
7696 /* write a NUL byte */
7697 *out = 0;
7698 outsize = out - PyBytes_AS_STRING(*outbytes);
7699 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7700 if (_PyBytes_Resize(outbytes, outsize) < 0)
7701 goto error;
7702 ret = 0;
7703
7704error:
7705 Py_XDECREF(encoding_obj);
7706 Py_XDECREF(errorHandler);
7707 Py_XDECREF(exc);
7708 return ret;
7709}
7710
Victor Stinner3a50e702011-10-18 21:21:00 +02007711static PyObject *
7712encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007713 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007714 const char *errors)
7715{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007716 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007717 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007718 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007719 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007720
Benjamin Petersonbac79492012-01-14 13:34:47 -05007721 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007722 return NULL;
7723 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007724
Victor Stinner3a50e702011-10-18 21:21:00 +02007725 if (code_page < 0) {
7726 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7727 return NULL;
7728 }
7729
Martin v. Löwis3d325192011-11-04 18:23:06 +01007730 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007731 return PyBytes_FromStringAndSize(NULL, 0);
7732
Victor Stinner7581cef2011-11-03 22:32:33 +01007733 offset = 0;
7734 do
7735 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007736#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007737 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007738 chunks. */
7739 if (len > INT_MAX/2) {
7740 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007741 done = 0;
7742 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007743 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007744#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007745 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007746 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007747 done = 1;
7748 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007749
Victor Stinner76a31a62011-11-04 00:05:13 +01007750 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007751 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007752 errors);
7753 if (ret == -2)
7754 ret = encode_code_page_errors(code_page, &outbytes,
7755 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007756 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007757 if (ret < 0) {
7758 Py_XDECREF(outbytes);
7759 return NULL;
7760 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007761
Victor Stinner7581cef2011-11-03 22:32:33 +01007762 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007763 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007764 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007765
Victor Stinner3a50e702011-10-18 21:21:00 +02007766 return outbytes;
7767}
7768
7769PyObject *
7770PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7771 Py_ssize_t size,
7772 const char *errors)
7773{
Victor Stinner7581cef2011-11-03 22:32:33 +01007774 PyObject *unicode, *res;
7775 unicode = PyUnicode_FromUnicode(p, size);
7776 if (unicode == NULL)
7777 return NULL;
7778 res = encode_code_page(CP_ACP, unicode, errors);
7779 Py_DECREF(unicode);
7780 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007781}
7782
7783PyObject *
7784PyUnicode_EncodeCodePage(int code_page,
7785 PyObject *unicode,
7786 const char *errors)
7787{
Victor Stinner7581cef2011-11-03 22:32:33 +01007788 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007789}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007790
Alexander Belopolsky40018472011-02-26 01:02:56 +00007791PyObject *
7792PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007793{
7794 if (!PyUnicode_Check(unicode)) {
7795 PyErr_BadArgument();
7796 return NULL;
7797 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007798 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007799}
7800
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007801#undef NEED_RETRY
7802
Victor Stinner99b95382011-07-04 14:23:54 +02007803#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007804
Guido van Rossumd57fd912000-03-10 22:53:23 +00007805/* --- Character Mapping Codec -------------------------------------------- */
7806
Alexander Belopolsky40018472011-02-26 01:02:56 +00007807PyObject *
7808PyUnicode_DecodeCharmap(const char *s,
7809 Py_ssize_t size,
7810 PyObject *mapping,
7811 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007812{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007813 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007814 Py_ssize_t startinpos;
7815 Py_ssize_t endinpos;
7816 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007817 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007818 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007819 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007820 PyObject *errorHandler = NULL;
7821 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007822
Guido van Rossumd57fd912000-03-10 22:53:23 +00007823 /* Default to Latin-1 */
7824 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007825 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007826
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007827 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007828 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007829 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007830 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007831 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007832 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007833 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007834 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007835 Py_ssize_t maplen;
7836 enum PyUnicode_Kind kind;
7837 void *data;
7838 Py_UCS4 x;
7839
Benjamin Petersonbac79492012-01-14 13:34:47 -05007840 if (PyUnicode_READY(mapping) == -1)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007841 return NULL;
7842
7843 maplen = PyUnicode_GET_LENGTH(mapping);
7844 data = PyUnicode_DATA(mapping);
7845 kind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007846 while (s < e) {
7847 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007848
Benjamin Peterson29060642009-01-31 22:14:21 +00007849 if (ch < maplen)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007850 x = PyUnicode_READ(kind, data, ch);
7851 else
7852 x = 0xfffe; /* invalid value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007853
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007854 if (x == 0xfffe)
7855 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007856 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007857 startinpos = s-starts;
7858 endinpos = startinpos+1;
7859 if (unicode_decode_call_errorhandler(
7860 errors, &errorHandler,
7861 "charmap", "character maps to <undefined>",
7862 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007863 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007864 goto onError;
7865 }
7866 continue;
7867 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007868
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007869 if (unicode_putchar(&v, &outpos, x) < 0)
7870 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007871 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007872 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007873 }
7874 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007875 while (s < e) {
7876 unsigned char ch = *s;
7877 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007878
Benjamin Peterson29060642009-01-31 22:14:21 +00007879 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7880 w = PyLong_FromLong((long)ch);
7881 if (w == NULL)
7882 goto onError;
7883 x = PyObject_GetItem(mapping, w);
7884 Py_DECREF(w);
7885 if (x == NULL) {
7886 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7887 /* No mapping found means: mapping is undefined. */
7888 PyErr_Clear();
7889 x = Py_None;
7890 Py_INCREF(x);
7891 } else
7892 goto onError;
7893 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007894
Benjamin Peterson29060642009-01-31 22:14:21 +00007895 /* Apply mapping */
7896 if (PyLong_Check(x)) {
7897 long value = PyLong_AS_LONG(x);
7898 if (value < 0 || value > 65535) {
7899 PyErr_SetString(PyExc_TypeError,
7900 "character mapping must be in range(65536)");
7901 Py_DECREF(x);
7902 goto onError;
7903 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007904 if (unicode_putchar(&v, &outpos, value) < 0)
7905 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007906 }
7907 else if (x == Py_None) {
7908 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007909 startinpos = s-starts;
7910 endinpos = startinpos+1;
7911 if (unicode_decode_call_errorhandler(
7912 errors, &errorHandler,
7913 "charmap", "character maps to <undefined>",
7914 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007915 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007916 Py_DECREF(x);
7917 goto onError;
7918 }
7919 Py_DECREF(x);
7920 continue;
7921 }
7922 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007923 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007924
Benjamin Petersonbac79492012-01-14 13:34:47 -05007925 if (PyUnicode_READY(x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007926 goto onError;
7927 targetsize = PyUnicode_GET_LENGTH(x);
7928
7929 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007930 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007931 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007932 PyUnicode_READ_CHAR(x, 0)) < 0)
7933 goto onError;
7934 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007935 else if (targetsize > 1) {
7936 /* 1-n mapping */
7937 if (targetsize > extrachars) {
7938 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007939 Py_ssize_t needed = (targetsize - extrachars) + \
7940 (targetsize << 2);
7941 extrachars += needed;
7942 /* XXX overflow detection missing */
Victor Stinner16e6a802011-12-12 13:24:15 +01007943 if (unicode_resize(&v,
7944 PyUnicode_GET_LENGTH(v) + needed) < 0)
7945 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007946 Py_DECREF(x);
7947 goto onError;
7948 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007949 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007950 if (unicode_widen(&v, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
7951 goto onError;
7952 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7953 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007954 extrachars -= targetsize;
7955 }
7956 /* 1-0 mapping: skip the character */
7957 }
7958 else {
7959 /* wrong return value */
7960 PyErr_SetString(PyExc_TypeError,
7961 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007962 Py_DECREF(x);
7963 goto onError;
7964 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007965 Py_DECREF(x);
7966 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007967 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007968 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007969 if (unicode_resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007970 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007971 Py_XDECREF(errorHandler);
7972 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007973 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00007974
Benjamin Peterson29060642009-01-31 22:14:21 +00007975 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007976 Py_XDECREF(errorHandler);
7977 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007978 Py_XDECREF(v);
7979 return NULL;
7980}
7981
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007982/* Charmap encoding: the lookup table */
7983
Alexander Belopolsky40018472011-02-26 01:02:56 +00007984struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007985 PyObject_HEAD
7986 unsigned char level1[32];
7987 int count2, count3;
7988 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007989};
7990
7991static PyObject*
7992encoding_map_size(PyObject *obj, PyObject* args)
7993{
7994 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007995 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007996 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007997}
7998
7999static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008000 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008001 PyDoc_STR("Return the size (in bytes) of this object") },
8002 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008003};
8004
8005static void
8006encoding_map_dealloc(PyObject* o)
8007{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008008 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008009}
8010
8011static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008012 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008013 "EncodingMap", /*tp_name*/
8014 sizeof(struct encoding_map), /*tp_basicsize*/
8015 0, /*tp_itemsize*/
8016 /* methods */
8017 encoding_map_dealloc, /*tp_dealloc*/
8018 0, /*tp_print*/
8019 0, /*tp_getattr*/
8020 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00008021 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008022 0, /*tp_repr*/
8023 0, /*tp_as_number*/
8024 0, /*tp_as_sequence*/
8025 0, /*tp_as_mapping*/
8026 0, /*tp_hash*/
8027 0, /*tp_call*/
8028 0, /*tp_str*/
8029 0, /*tp_getattro*/
8030 0, /*tp_setattro*/
8031 0, /*tp_as_buffer*/
8032 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8033 0, /*tp_doc*/
8034 0, /*tp_traverse*/
8035 0, /*tp_clear*/
8036 0, /*tp_richcompare*/
8037 0, /*tp_weaklistoffset*/
8038 0, /*tp_iter*/
8039 0, /*tp_iternext*/
8040 encoding_map_methods, /*tp_methods*/
8041 0, /*tp_members*/
8042 0, /*tp_getset*/
8043 0, /*tp_base*/
8044 0, /*tp_dict*/
8045 0, /*tp_descr_get*/
8046 0, /*tp_descr_set*/
8047 0, /*tp_dictoffset*/
8048 0, /*tp_init*/
8049 0, /*tp_alloc*/
8050 0, /*tp_new*/
8051 0, /*tp_free*/
8052 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008053};
8054
8055PyObject*
8056PyUnicode_BuildEncodingMap(PyObject* string)
8057{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008058 PyObject *result;
8059 struct encoding_map *mresult;
8060 int i;
8061 int need_dict = 0;
8062 unsigned char level1[32];
8063 unsigned char level2[512];
8064 unsigned char *mlevel1, *mlevel2, *mlevel3;
8065 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008066 int kind;
8067 void *data;
8068 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008069
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008070 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008071 PyErr_BadArgument();
8072 return NULL;
8073 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008074 kind = PyUnicode_KIND(string);
8075 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008076 memset(level1, 0xFF, sizeof level1);
8077 memset(level2, 0xFF, sizeof level2);
8078
8079 /* If there isn't a one-to-one mapping of NULL to \0,
8080 or if there are non-BMP characters, we need to use
8081 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008082 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008083 need_dict = 1;
8084 for (i = 1; i < 256; i++) {
8085 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008086 ch = PyUnicode_READ(kind, data, i);
8087 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008088 need_dict = 1;
8089 break;
8090 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008091 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008092 /* unmapped character */
8093 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008094 l1 = ch >> 11;
8095 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008096 if (level1[l1] == 0xFF)
8097 level1[l1] = count2++;
8098 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008099 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008100 }
8101
8102 if (count2 >= 0xFF || count3 >= 0xFF)
8103 need_dict = 1;
8104
8105 if (need_dict) {
8106 PyObject *result = PyDict_New();
8107 PyObject *key, *value;
8108 if (!result)
8109 return NULL;
8110 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008111 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008112 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008113 if (!key || !value)
8114 goto failed1;
8115 if (PyDict_SetItem(result, key, value) == -1)
8116 goto failed1;
8117 Py_DECREF(key);
8118 Py_DECREF(value);
8119 }
8120 return result;
8121 failed1:
8122 Py_XDECREF(key);
8123 Py_XDECREF(value);
8124 Py_DECREF(result);
8125 return NULL;
8126 }
8127
8128 /* Create a three-level trie */
8129 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8130 16*count2 + 128*count3 - 1);
8131 if (!result)
8132 return PyErr_NoMemory();
8133 PyObject_Init(result, &EncodingMapType);
8134 mresult = (struct encoding_map*)result;
8135 mresult->count2 = count2;
8136 mresult->count3 = count3;
8137 mlevel1 = mresult->level1;
8138 mlevel2 = mresult->level23;
8139 mlevel3 = mresult->level23 + 16*count2;
8140 memcpy(mlevel1, level1, 32);
8141 memset(mlevel2, 0xFF, 16*count2);
8142 memset(mlevel3, 0, 128*count3);
8143 count3 = 0;
8144 for (i = 1; i < 256; i++) {
8145 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008146 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008147 /* unmapped character */
8148 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008149 o1 = PyUnicode_READ(kind, data, i)>>11;
8150 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008151 i2 = 16*mlevel1[o1] + o2;
8152 if (mlevel2[i2] == 0xFF)
8153 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008154 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008155 i3 = 128*mlevel2[i2] + o3;
8156 mlevel3[i3] = i;
8157 }
8158 return result;
8159}
8160
8161static int
Victor Stinner22168992011-11-20 17:09:18 +01008162encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008163{
8164 struct encoding_map *map = (struct encoding_map*)mapping;
8165 int l1 = c>>11;
8166 int l2 = (c>>7) & 0xF;
8167 int l3 = c & 0x7F;
8168 int i;
8169
Victor Stinner22168992011-11-20 17:09:18 +01008170 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008171 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008172 if (c == 0)
8173 return 0;
8174 /* level 1*/
8175 i = map->level1[l1];
8176 if (i == 0xFF) {
8177 return -1;
8178 }
8179 /* level 2*/
8180 i = map->level23[16*i+l2];
8181 if (i == 0xFF) {
8182 return -1;
8183 }
8184 /* level 3 */
8185 i = map->level23[16*map->count2 + 128*i + l3];
8186 if (i == 0) {
8187 return -1;
8188 }
8189 return i;
8190}
8191
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008192/* Lookup the character ch in the mapping. If the character
8193 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008194 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008195static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008196charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008197{
Christian Heimes217cfd12007-12-02 14:31:20 +00008198 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008199 PyObject *x;
8200
8201 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008202 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008203 x = PyObject_GetItem(mapping, w);
8204 Py_DECREF(w);
8205 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008206 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8207 /* No mapping found means: mapping is undefined. */
8208 PyErr_Clear();
8209 x = Py_None;
8210 Py_INCREF(x);
8211 return x;
8212 } else
8213 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008214 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008215 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008216 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008217 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008218 long value = PyLong_AS_LONG(x);
8219 if (value < 0 || value > 255) {
8220 PyErr_SetString(PyExc_TypeError,
8221 "character mapping must be in range(256)");
8222 Py_DECREF(x);
8223 return NULL;
8224 }
8225 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008226 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008227 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008228 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008229 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008230 /* wrong return value */
8231 PyErr_Format(PyExc_TypeError,
8232 "character mapping must return integer, bytes or None, not %.400s",
8233 x->ob_type->tp_name);
8234 Py_DECREF(x);
8235 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008236 }
8237}
8238
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008239static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008240charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008241{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008242 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8243 /* exponentially overallocate to minimize reallocations */
8244 if (requiredsize < 2*outsize)
8245 requiredsize = 2*outsize;
8246 if (_PyBytes_Resize(outobj, requiredsize))
8247 return -1;
8248 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008249}
8250
/* Outcome of trying to encode one character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was written to the output */
    enc_FAILED,         /* the character has no mapping; nothing written */
    enc_EXCEPTION       /* the lookup or the output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008254/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008255 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008256 space is available. Return a new reference to the object that
8257 was put in the output buffer, or Py_None, if the mapping was undefined
8258 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008259 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008260static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008261charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008262 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008263{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008264 PyObject *rep;
8265 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008266 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008267
Christian Heimes90aa7642007-12-19 02:45:37 +00008268 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008269 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008270 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008271 if (res == -1)
8272 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008273 if (outsize<requiredsize)
8274 if (charmapencode_resize(outobj, outpos, requiredsize))
8275 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008276 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008277 outstart[(*outpos)++] = (char)res;
8278 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008279 }
8280
8281 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008282 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008283 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008284 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008285 Py_DECREF(rep);
8286 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008287 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008288 if (PyLong_Check(rep)) {
8289 Py_ssize_t requiredsize = *outpos+1;
8290 if (outsize<requiredsize)
8291 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8292 Py_DECREF(rep);
8293 return enc_EXCEPTION;
8294 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008295 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008296 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008297 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008298 else {
8299 const char *repchars = PyBytes_AS_STRING(rep);
8300 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8301 Py_ssize_t requiredsize = *outpos+repsize;
8302 if (outsize<requiredsize)
8303 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8304 Py_DECREF(rep);
8305 return enc_EXCEPTION;
8306 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008307 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008308 memcpy(outstart + *outpos, repchars, repsize);
8309 *outpos += repsize;
8310 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008311 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008312 Py_DECREF(rep);
8313 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008314}
8315
8316/* handle an error in PyUnicode_EncodeCharmap
8317 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008318static int
8319charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008320 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008321 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008322 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008323 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008324{
8325 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008326 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008327 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008328 enum PyUnicode_Kind kind;
8329 void *data;
8330 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008331 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008332 Py_ssize_t collstartpos = *inpos;
8333 Py_ssize_t collendpos = *inpos+1;
8334 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008335 char *encoding = "charmap";
8336 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008337 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008338 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008339 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008340
Benjamin Petersonbac79492012-01-14 13:34:47 -05008341 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008342 return -1;
8343 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008344 /* find all unencodable characters */
8345 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008346 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008347 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008348 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008349 val = encoding_map_lookup(ch, mapping);
8350 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008351 break;
8352 ++collendpos;
8353 continue;
8354 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008355
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008356 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8357 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008358 if (rep==NULL)
8359 return -1;
8360 else if (rep!=Py_None) {
8361 Py_DECREF(rep);
8362 break;
8363 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008364 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008365 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008366 }
8367 /* cache callback name lookup
8368 * (if not done yet, i.e. it's the first error) */
8369 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008370 if ((errors==NULL) || (!strcmp(errors, "strict")))
8371 *known_errorHandler = 1;
8372 else if (!strcmp(errors, "replace"))
8373 *known_errorHandler = 2;
8374 else if (!strcmp(errors, "ignore"))
8375 *known_errorHandler = 3;
8376 else if (!strcmp(errors, "xmlcharrefreplace"))
8377 *known_errorHandler = 4;
8378 else
8379 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008380 }
8381 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008382 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008383 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008384 return -1;
8385 case 2: /* replace */
8386 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008387 x = charmapencode_output('?', mapping, res, respos);
8388 if (x==enc_EXCEPTION) {
8389 return -1;
8390 }
8391 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008392 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008393 return -1;
8394 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008395 }
8396 /* fall through */
8397 case 3: /* ignore */
8398 *inpos = collendpos;
8399 break;
8400 case 4: /* xmlcharrefreplace */
8401 /* generate replacement (temporarily (mis)uses p) */
8402 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008403 char buffer[2+29+1+1];
8404 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008405 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008406 for (cp = buffer; *cp; ++cp) {
8407 x = charmapencode_output(*cp, mapping, res, respos);
8408 if (x==enc_EXCEPTION)
8409 return -1;
8410 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008411 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008412 return -1;
8413 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008414 }
8415 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008416 *inpos = collendpos;
8417 break;
8418 default:
8419 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008420 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008421 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008422 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008423 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008424 if (PyBytes_Check(repunicode)) {
8425 /* Directly copy bytes result to output. */
8426 Py_ssize_t outsize = PyBytes_Size(*res);
8427 Py_ssize_t requiredsize;
8428 repsize = PyBytes_Size(repunicode);
8429 requiredsize = *respos + repsize;
8430 if (requiredsize > outsize)
8431 /* Make room for all additional bytes. */
8432 if (charmapencode_resize(res, respos, requiredsize)) {
8433 Py_DECREF(repunicode);
8434 return -1;
8435 }
8436 memcpy(PyBytes_AsString(*res) + *respos,
8437 PyBytes_AsString(repunicode), repsize);
8438 *respos += repsize;
8439 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008440 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008441 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008442 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008443 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008444 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008445 Py_DECREF(repunicode);
8446 return -1;
8447 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008448 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008449 data = PyUnicode_DATA(repunicode);
8450 kind = PyUnicode_KIND(repunicode);
8451 for (index = 0; index < repsize; index++) {
8452 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8453 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008454 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008455 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008456 return -1;
8457 }
8458 else if (x==enc_FAILED) {
8459 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008460 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008461 return -1;
8462 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008463 }
8464 *inpos = newpos;
8465 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008466 }
8467 return 0;
8468}
8469
Alexander Belopolsky40018472011-02-26 01:02:56 +00008470PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008471_PyUnicode_EncodeCharmap(PyObject *unicode,
8472 PyObject *mapping,
8473 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008474{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008475 /* output object */
8476 PyObject *res = NULL;
8477 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008478 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008479 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008480 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008481 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008482 PyObject *errorHandler = NULL;
8483 PyObject *exc = NULL;
8484 /* the following variable is used for caching string comparisons
8485 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8486 * 3=ignore, 4=xmlcharrefreplace */
8487 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008488
Benjamin Petersonbac79492012-01-14 13:34:47 -05008489 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008490 return NULL;
8491 size = PyUnicode_GET_LENGTH(unicode);
8492
Guido van Rossumd57fd912000-03-10 22:53:23 +00008493 /* Default to Latin-1 */
8494 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008495 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008496
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008497 /* allocate enough for a simple encoding without
8498 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008499 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008500 if (res == NULL)
8501 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008502 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008503 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008504
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008505 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008506 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008507 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008508 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008509 if (x==enc_EXCEPTION) /* error */
8510 goto onError;
8511 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008512 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008513 &exc,
8514 &known_errorHandler, &errorHandler, errors,
8515 &res, &respos)) {
8516 goto onError;
8517 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008518 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008519 else
8520 /* done with this character => adjust input position */
8521 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008522 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008523
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008524 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008525 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008526 if (_PyBytes_Resize(&res, respos) < 0)
8527 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008528
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008529 Py_XDECREF(exc);
8530 Py_XDECREF(errorHandler);
8531 return res;
8532
Benjamin Peterson29060642009-01-31 22:14:21 +00008533 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008534 Py_XDECREF(res);
8535 Py_XDECREF(exc);
8536 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008537 return NULL;
8538}
8539
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008540/* Deprecated */
8541PyObject *
8542PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8543 Py_ssize_t size,
8544 PyObject *mapping,
8545 const char *errors)
8546{
8547 PyObject *result;
8548 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8549 if (unicode == NULL)
8550 return NULL;
8551 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8552 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008553 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008554}
8555
Alexander Belopolsky40018472011-02-26 01:02:56 +00008556PyObject *
8557PyUnicode_AsCharmapString(PyObject *unicode,
8558 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008559{
8560 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008561 PyErr_BadArgument();
8562 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008563 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008564 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008565}
8566
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008567/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008568static void
8569make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008570 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008571 Py_ssize_t startpos, Py_ssize_t endpos,
8572 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008573{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008574 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008575 *exceptionObject = _PyUnicodeTranslateError_Create(
8576 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008577 }
8578 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008579 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8580 goto onError;
8581 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8582 goto onError;
8583 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8584 goto onError;
8585 return;
8586 onError:
8587 Py_DECREF(*exceptionObject);
8588 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008589 }
8590}
8591
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008592/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008593static void
8594raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008595 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008596 Py_ssize_t startpos, Py_ssize_t endpos,
8597 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008598{
8599 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008600 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008601 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008602 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008603}
8604
8605/* error handling callback helper:
8606 build arguments, call the callback and check the arguments,
8607 put the result into newpos and return the replacement string, which
8608 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008609static PyObject *
8610unicode_translate_call_errorhandler(const char *errors,
8611 PyObject **errorHandler,
8612 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008613 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008614 Py_ssize_t startpos, Py_ssize_t endpos,
8615 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008616{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008617 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008618
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008619 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008620 PyObject *restuple;
8621 PyObject *resunicode;
8622
8623 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008624 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008625 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008626 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008627 }
8628
8629 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008630 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008631 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008632 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008633
8634 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008635 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008636 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008637 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008638 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008639 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008640 Py_DECREF(restuple);
8641 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008642 }
8643 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008644 &resunicode, &i_newpos)) {
8645 Py_DECREF(restuple);
8646 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008647 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008648 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008649 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008650 else
8651 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008652 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008653 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8654 Py_DECREF(restuple);
8655 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008656 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008657 Py_INCREF(resunicode);
8658 Py_DECREF(restuple);
8659 return resunicode;
8660}
8661
8662/* Lookup the character ch in the mapping and put the result in result,
8663 which must be decrefed by the caller.
8664 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008665static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008666charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008667{
Christian Heimes217cfd12007-12-02 14:31:20 +00008668 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008669 PyObject *x;
8670
8671 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008672 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008673 x = PyObject_GetItem(mapping, w);
8674 Py_DECREF(w);
8675 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008676 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8677 /* No mapping found means: use 1:1 mapping. */
8678 PyErr_Clear();
8679 *result = NULL;
8680 return 0;
8681 } else
8682 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008683 }
8684 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008685 *result = x;
8686 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008687 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008688 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008689 long value = PyLong_AS_LONG(x);
8690 long max = PyUnicode_GetMax();
8691 if (value < 0 || value > max) {
8692 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008693 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008694 Py_DECREF(x);
8695 return -1;
8696 }
8697 *result = x;
8698 return 0;
8699 }
8700 else if (PyUnicode_Check(x)) {
8701 *result = x;
8702 return 0;
8703 }
8704 else {
8705 /* wrong return value */
8706 PyErr_SetString(PyExc_TypeError,
8707 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008708 Py_DECREF(x);
8709 return -1;
8710 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008711}
8712/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008713 if not reallocate and adjust various state variables.
8714 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008715static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008716charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008717 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008718{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008719 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008720 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008721 /* exponentially overallocate to minimize reallocations */
8722 if (requiredsize < 2 * oldsize)
8723 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008724 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8725 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008726 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008727 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008728 }
8729 return 0;
8730}
8731/* lookup the character, put the result in the output string and adjust
8732 various state variables. Return a new reference to the object that
8733 was put in the output buffer in *result, or Py_None, if the mapping was
8734 undefined (in which case no character was written).
8735 The called must decref result.
8736 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008737static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008738charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8739 PyObject *mapping, Py_UCS4 **output,
8740 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008741 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008742{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008743 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8744 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008745 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008746 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008747 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008748 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008749 }
8750 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008751 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008752 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008753 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008754 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008755 }
8756 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008757 Py_ssize_t repsize;
8758 if (PyUnicode_READY(*res) == -1)
8759 return -1;
8760 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008761 if (repsize==1) {
8762 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008763 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008764 }
8765 else if (repsize!=0) {
8766 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008767 Py_ssize_t requiredsize = *opos +
8768 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008769 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008770 Py_ssize_t i;
8771 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008772 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008773 for(i = 0; i < repsize; i++)
8774 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008775 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008776 }
8777 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008778 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008779 return 0;
8780}
8781
Alexander Belopolsky40018472011-02-26 01:02:56 +00008782PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008783_PyUnicode_TranslateCharmap(PyObject *input,
8784 PyObject *mapping,
8785 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008786{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008787 /* input object */
8788 char *idata;
8789 Py_ssize_t size, i;
8790 int kind;
8791 /* output buffer */
8792 Py_UCS4 *output = NULL;
8793 Py_ssize_t osize;
8794 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008795 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008796 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008797 char *reason = "character maps to <undefined>";
8798 PyObject *errorHandler = NULL;
8799 PyObject *exc = NULL;
8800 /* the following variable is used for caching string comparisons
8801 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8802 * 3=ignore, 4=xmlcharrefreplace */
8803 int known_errorHandler = -1;
8804
Guido van Rossumd57fd912000-03-10 22:53:23 +00008805 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008806 PyErr_BadArgument();
8807 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008808 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008809
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008810 if (PyUnicode_READY(input) == -1)
8811 return NULL;
8812 idata = (char*)PyUnicode_DATA(input);
8813 kind = PyUnicode_KIND(input);
8814 size = PyUnicode_GET_LENGTH(input);
8815 i = 0;
8816
8817 if (size == 0) {
8818 Py_INCREF(input);
8819 return input;
8820 }
8821
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008822 /* allocate enough for a simple 1:1 translation without
8823 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008824 osize = size;
8825 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8826 opos = 0;
8827 if (output == NULL) {
8828 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008829 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008830 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008831
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008832 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008833 /* try to encode it */
8834 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008835 if (charmaptranslate_output(input, i, mapping,
8836 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008837 Py_XDECREF(x);
8838 goto onError;
8839 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008840 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008841 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008842 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008843 else { /* untranslatable character */
8844 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8845 Py_ssize_t repsize;
8846 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008847 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008848 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008849 Py_ssize_t collstart = i;
8850 Py_ssize_t collend = i+1;
8851 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008852
Benjamin Peterson29060642009-01-31 22:14:21 +00008853 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008854 while (collend < size) {
8855 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008856 goto onError;
8857 Py_XDECREF(x);
8858 if (x!=Py_None)
8859 break;
8860 ++collend;
8861 }
8862 /* cache callback name lookup
8863 * (if not done yet, i.e. it's the first error) */
8864 if (known_errorHandler==-1) {
8865 if ((errors==NULL) || (!strcmp(errors, "strict")))
8866 known_errorHandler = 1;
8867 else if (!strcmp(errors, "replace"))
8868 known_errorHandler = 2;
8869 else if (!strcmp(errors, "ignore"))
8870 known_errorHandler = 3;
8871 else if (!strcmp(errors, "xmlcharrefreplace"))
8872 known_errorHandler = 4;
8873 else
8874 known_errorHandler = 0;
8875 }
8876 switch (known_errorHandler) {
8877 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008878 raise_translate_exception(&exc, input, collstart,
8879 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008880 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008881 case 2: /* replace */
8882 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008883 for (coll = collstart; coll<collend; coll++)
8884 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008885 /* fall through */
8886 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008887 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008888 break;
8889 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008890 /* generate replacement (temporarily (mis)uses i) */
8891 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008892 char buffer[2+29+1+1];
8893 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008894 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8895 if (charmaptranslate_makespace(&output, &osize,
8896 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008897 goto onError;
8898 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008899 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008900 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008901 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008902 break;
8903 default:
8904 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008905 reason, input, &exc,
8906 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008907 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008908 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008909 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008910 Py_DECREF(repunicode);
8911 goto onError;
8912 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008913 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008914 repsize = PyUnicode_GET_LENGTH(repunicode);
8915 if (charmaptranslate_makespace(&output, &osize,
8916 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008917 Py_DECREF(repunicode);
8918 goto onError;
8919 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008920 for (uni2 = 0; repsize-->0; ++uni2)
8921 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8922 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008923 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008924 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008925 }
8926 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008927 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8928 if (!res)
8929 goto onError;
8930 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008931 Py_XDECREF(exc);
8932 Py_XDECREF(errorHandler);
8933 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008934
Benjamin Peterson29060642009-01-31 22:14:21 +00008935 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008936 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008937 Py_XDECREF(exc);
8938 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008939 return NULL;
8940}
8941
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008942/* Deprecated. Use PyUnicode_Translate instead. */
8943PyObject *
8944PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8945 Py_ssize_t size,
8946 PyObject *mapping,
8947 const char *errors)
8948{
8949 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8950 if (!unicode)
8951 return NULL;
8952 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8953}
8954
Alexander Belopolsky40018472011-02-26 01:02:56 +00008955PyObject *
8956PyUnicode_Translate(PyObject *str,
8957 PyObject *mapping,
8958 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008959{
8960 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008961
Guido van Rossumd57fd912000-03-10 22:53:23 +00008962 str = PyUnicode_FromObject(str);
8963 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008964 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008965 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008966 Py_DECREF(str);
8967 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008968
Benjamin Peterson29060642009-01-31 22:14:21 +00008969 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008970 Py_XDECREF(str);
8971 return NULL;
8972}
Tim Petersced69f82003-09-16 20:30:58 +00008973
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008974static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008975fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008976{
8977 /* No need to call PyUnicode_READY(self) because this function is only
8978 called as a callback from fixup() which does it already. */
8979 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8980 const int kind = PyUnicode_KIND(self);
8981 void *data = PyUnicode_DATA(self);
8982 Py_UCS4 maxchar = 0, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008983 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008984 Py_ssize_t i;
8985
8986 for (i = 0; i < len; ++i) {
8987 ch = PyUnicode_READ(kind, data, i);
8988 fixed = 0;
8989 if (ch > 127) {
8990 if (Py_UNICODE_ISSPACE(ch))
8991 fixed = ' ';
8992 else {
8993 const int decimal = Py_UNICODE_TODECIMAL(ch);
8994 if (decimal >= 0)
8995 fixed = '0' + decimal;
8996 }
8997 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008998 modified = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008999 if (fixed > maxchar)
9000 maxchar = fixed;
9001 PyUnicode_WRITE(kind, data, i, fixed);
9002 }
9003 else if (ch > maxchar)
9004 maxchar = ch;
9005 }
9006 else if (ch > maxchar)
9007 maxchar = ch;
9008 }
9009
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009010 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009011}
9012
9013PyObject *
9014_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9015{
9016 if (!PyUnicode_Check(unicode)) {
9017 PyErr_BadInternalCall();
9018 return NULL;
9019 }
9020 if (PyUnicode_READY(unicode) == -1)
9021 return NULL;
9022 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
9023 /* If the string is already ASCII, just return the same string */
9024 Py_INCREF(unicode);
9025 return unicode;
9026 }
Victor Stinner9310abb2011-10-05 00:59:23 +02009027 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009028}
9029
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009030PyObject *
9031PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9032 Py_ssize_t length)
9033{
Victor Stinnerf0124502011-11-21 23:12:56 +01009034 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009035 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009036 Py_UCS4 maxchar;
9037 enum PyUnicode_Kind kind;
9038 void *data;
9039
Victor Stinner99d7ad02012-02-22 13:37:39 +01009040 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009041 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01009042 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009043 if (ch > 127) {
9044 int decimal = Py_UNICODE_TODECIMAL(ch);
9045 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009046 ch = '0' + decimal;
Victor Stinner99d7ad02012-02-22 13:37:39 +01009047 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009048 }
9049 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009050
9051 /* Copy to a new string */
9052 decimal = PyUnicode_New(length, maxchar);
9053 if (decimal == NULL)
9054 return decimal;
9055 kind = PyUnicode_KIND(decimal);
9056 data = PyUnicode_DATA(decimal);
9057 /* Iterate over code points */
9058 for (i = 0; i < length; i++) {
9059 Py_UNICODE ch = s[i];
9060 if (ch > 127) {
9061 int decimal = Py_UNICODE_TODECIMAL(ch);
9062 if (decimal >= 0)
9063 ch = '0' + decimal;
9064 }
9065 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009066 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009067 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009068}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009069/* --- Decimal Encoder ---------------------------------------------------- */
9070
Alexander Belopolsky40018472011-02-26 01:02:56 +00009071int
9072PyUnicode_EncodeDecimal(Py_UNICODE *s,
9073 Py_ssize_t length,
9074 char *output,
9075 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009076{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009077 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009078 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009079 enum PyUnicode_Kind kind;
9080 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009081
9082 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009083 PyErr_BadArgument();
9084 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009085 }
9086
Victor Stinner42bf7752011-11-21 22:52:58 +01009087 unicode = PyUnicode_FromUnicode(s, length);
9088 if (unicode == NULL)
9089 return -1;
9090
Benjamin Petersonbac79492012-01-14 13:34:47 -05009091 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01009092 Py_DECREF(unicode);
9093 return -1;
9094 }
Victor Stinner42bf7752011-11-21 22:52:58 +01009095 kind = PyUnicode_KIND(unicode);
9096 data = PyUnicode_DATA(unicode);
9097
Victor Stinnerb84d7232011-11-22 01:50:07 +01009098 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009099 PyObject *exc;
9100 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009101 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009102 Py_ssize_t startpos;
9103
9104 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009105
Benjamin Peterson29060642009-01-31 22:14:21 +00009106 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009107 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009108 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009109 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009110 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009111 decimal = Py_UNICODE_TODECIMAL(ch);
9112 if (decimal >= 0) {
9113 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009114 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009115 continue;
9116 }
9117 if (0 < ch && ch < 256) {
9118 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009119 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009120 continue;
9121 }
Victor Stinner6345be92011-11-25 20:09:01 +01009122
Victor Stinner42bf7752011-11-21 22:52:58 +01009123 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009124 exc = NULL;
9125 raise_encode_exception(&exc, "decimal", unicode,
9126 startpos, startpos+1,
9127 "invalid decimal Unicode string");
9128 Py_XDECREF(exc);
9129 Py_DECREF(unicode);
9130 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009131 }
9132 /* 0-terminate the output string */
9133 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009134 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009135 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009136}
9137
Guido van Rossumd57fd912000-03-10 22:53:23 +00009138/* --- Helpers ------------------------------------------------------------ */
9139
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009140static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02009141any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009142 Py_ssize_t start,
9143 Py_ssize_t end)
9144{
9145 int kind1, kind2, kind;
9146 void *buf1, *buf2;
9147 Py_ssize_t len1, len2, result;
9148
9149 kind1 = PyUnicode_KIND(s1);
9150 kind2 = PyUnicode_KIND(s2);
9151 kind = kind1 > kind2 ? kind1 : kind2;
9152 buf1 = PyUnicode_DATA(s1);
9153 buf2 = PyUnicode_DATA(s2);
9154 if (kind1 != kind)
9155 buf1 = _PyUnicode_AsKind(s1, kind);
9156 if (!buf1)
9157 return -2;
9158 if (kind2 != kind)
9159 buf2 = _PyUnicode_AsKind(s2, kind);
9160 if (!buf2) {
9161 if (kind1 != kind) PyMem_Free(buf1);
9162 return -2;
9163 }
9164 len1 = PyUnicode_GET_LENGTH(s1);
9165 len2 = PyUnicode_GET_LENGTH(s2);
9166
Victor Stinner794d5672011-10-10 03:21:36 +02009167 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06009168 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02009169 case PyUnicode_1BYTE_KIND:
9170 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9171 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9172 else
9173 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9174 break;
9175 case PyUnicode_2BYTE_KIND:
9176 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9177 break;
9178 case PyUnicode_4BYTE_KIND:
9179 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9180 break;
9181 default:
9182 assert(0); result = -2;
9183 }
9184 }
9185 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06009186 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02009187 case PyUnicode_1BYTE_KIND:
9188 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9189 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9190 else
9191 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9192 break;
9193 case PyUnicode_2BYTE_KIND:
9194 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9195 break;
9196 case PyUnicode_4BYTE_KIND:
9197 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9198 break;
9199 default:
9200 assert(0); result = -2;
9201 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009202 }
9203
9204 if (kind1 != kind)
9205 PyMem_Free(buf1);
9206 if (kind2 != kind)
9207 PyMem_Free(buf2);
9208
9209 return result;
9210}
9211
9212Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009213_PyUnicode_InsertThousandsGrouping(
9214 PyObject *unicode, Py_ssize_t index,
9215 Py_ssize_t n_buffer,
9216 void *digits, Py_ssize_t n_digits,
9217 Py_ssize_t min_width,
9218 const char *grouping, PyObject *thousands_sep,
9219 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009220{
Victor Stinner41a863c2012-02-24 00:37:51 +01009221 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009222 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009223 Py_ssize_t thousands_sep_len;
9224 Py_ssize_t len;
9225
9226 if (unicode != NULL) {
9227 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009228 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009229 }
9230 else {
9231 kind = PyUnicode_1BYTE_KIND;
9232 data = NULL;
9233 }
9234 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9235 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9236 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9237 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009238 if (thousands_sep_kind < kind) {
9239 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9240 if (!thousands_sep_data)
9241 return -1;
9242 }
9243 else {
9244 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9245 if (!data)
9246 return -1;
9247 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009248 }
9249
Benjamin Petersonead6b532011-12-20 17:23:42 -06009250 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009251 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009252 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009253 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009254 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009255 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009256 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009257 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009258 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009259 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009260 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009261 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009262 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009263 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009264 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009265 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009266 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009267 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009268 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009269 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009270 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009271 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009272 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009273 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009274 break;
9275 default:
9276 assert(0);
9277 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009278 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009279 if (unicode != NULL && thousands_sep_kind != kind) {
9280 if (thousands_sep_kind < kind)
9281 PyMem_Free(thousands_sep_data);
9282 else
9283 PyMem_Free(data);
9284 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009285 if (unicode == NULL) {
9286 *maxchar = 127;
9287 if (len != n_digits) {
9288 *maxchar = Py_MAX(*maxchar,
9289 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
9290 }
9291 }
9292 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009293}
9294
9295
/* Helper macro to fix up start/end slice values in place.

   `end` is clamped to `len`; a negative `end` or `start` is taken as an
   offset from `len` and floored at 0.  `start` is not clamped to `len`.
   `start` and `end` must be modifiable lvalues, and every argument may be
   evaluated more than once, so none may have side effects.

   The body is wrapped in do { } while (0) so the expansion is a single
   statement and stays correct as the branch of an if/else; invoke it with
   a trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009310
Alexander Belopolsky40018472011-02-26 01:02:56 +00009311Py_ssize_t
9312PyUnicode_Count(PyObject *str,
9313 PyObject *substr,
9314 Py_ssize_t start,
9315 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009316{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009317 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009318 PyObject* str_obj;
9319 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009320 int kind1, kind2, kind;
9321 void *buf1 = NULL, *buf2 = NULL;
9322 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009323
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009324 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009325 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00009326 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009327 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009328 if (!sub_obj) {
9329 Py_DECREF(str_obj);
9330 return -1;
9331 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06009332 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06009333 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00009334 Py_DECREF(str_obj);
9335 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009336 }
Tim Petersced69f82003-09-16 20:30:58 +00009337
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009338 kind1 = PyUnicode_KIND(str_obj);
9339 kind2 = PyUnicode_KIND(sub_obj);
9340 kind = kind1 > kind2 ? kind1 : kind2;
9341 buf1 = PyUnicode_DATA(str_obj);
9342 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009343 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009344 if (!buf1)
9345 goto onError;
9346 buf2 = PyUnicode_DATA(sub_obj);
9347 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009348 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009349 if (!buf2)
9350 goto onError;
9351 len1 = PyUnicode_GET_LENGTH(str_obj);
9352 len2 = PyUnicode_GET_LENGTH(sub_obj);
9353
9354 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06009355 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009356 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009357 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9358 result = asciilib_count(
9359 ((Py_UCS1*)buf1) + start, end - start,
9360 buf2, len2, PY_SSIZE_T_MAX
9361 );
9362 else
9363 result = ucs1lib_count(
9364 ((Py_UCS1*)buf1) + start, end - start,
9365 buf2, len2, PY_SSIZE_T_MAX
9366 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009367 break;
9368 case PyUnicode_2BYTE_KIND:
9369 result = ucs2lib_count(
9370 ((Py_UCS2*)buf1) + start, end - start,
9371 buf2, len2, PY_SSIZE_T_MAX
9372 );
9373 break;
9374 case PyUnicode_4BYTE_KIND:
9375 result = ucs4lib_count(
9376 ((Py_UCS4*)buf1) + start, end - start,
9377 buf2, len2, PY_SSIZE_T_MAX
9378 );
9379 break;
9380 default:
9381 assert(0); result = 0;
9382 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009383
9384 Py_DECREF(sub_obj);
9385 Py_DECREF(str_obj);
9386
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009387 if (kind1 != kind)
9388 PyMem_Free(buf1);
9389 if (kind2 != kind)
9390 PyMem_Free(buf2);
9391
Guido van Rossumd57fd912000-03-10 22:53:23 +00009392 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009393 onError:
9394 Py_DECREF(sub_obj);
9395 Py_DECREF(str_obj);
9396 if (kind1 != kind && buf1)
9397 PyMem_Free(buf1);
9398 if (kind2 != kind && buf2)
9399 PyMem_Free(buf2);
9400 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009401}
9402
Alexander Belopolsky40018472011-02-26 01:02:56 +00009403Py_ssize_t
9404PyUnicode_Find(PyObject *str,
9405 PyObject *sub,
9406 Py_ssize_t start,
9407 Py_ssize_t end,
9408 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009409{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009410 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009411
Guido van Rossumd57fd912000-03-10 22:53:23 +00009412 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009413 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009414 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009415 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009416 if (!sub) {
9417 Py_DECREF(str);
9418 return -2;
9419 }
9420 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9421 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009422 Py_DECREF(str);
9423 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009424 }
Tim Petersced69f82003-09-16 20:30:58 +00009425
Victor Stinner794d5672011-10-10 03:21:36 +02009426 result = any_find_slice(direction,
9427 str, sub, start, end
9428 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009429
Guido van Rossumd57fd912000-03-10 22:53:23 +00009430 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009431 Py_DECREF(sub);
9432
Guido van Rossumd57fd912000-03-10 22:53:23 +00009433 return result;
9434}
9435
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009436Py_ssize_t
9437PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9438 Py_ssize_t start, Py_ssize_t end,
9439 int direction)
9440{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009441 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009442 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009443 if (PyUnicode_READY(str) == -1)
9444 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009445 if (start < 0 || end < 0) {
9446 PyErr_SetString(PyExc_IndexError, "string index out of range");
9447 return -2;
9448 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009449 if (end > PyUnicode_GET_LENGTH(str))
9450 end = PyUnicode_GET_LENGTH(str);
9451 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009452 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9453 kind, end-start, ch, direction);
9454 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009455 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009456 else
9457 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009458}
9459
Alexander Belopolsky40018472011-02-26 01:02:56 +00009460static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009461tailmatch(PyObject *self,
9462 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009463 Py_ssize_t start,
9464 Py_ssize_t end,
9465 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009466{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009467 int kind_self;
9468 int kind_sub;
9469 void *data_self;
9470 void *data_sub;
9471 Py_ssize_t offset;
9472 Py_ssize_t i;
9473 Py_ssize_t end_sub;
9474
9475 if (PyUnicode_READY(self) == -1 ||
9476 PyUnicode_READY(substring) == -1)
9477 return 0;
9478
9479 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009480 return 1;
9481
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009482 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9483 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009484 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009485 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009486
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009487 kind_self = PyUnicode_KIND(self);
9488 data_self = PyUnicode_DATA(self);
9489 kind_sub = PyUnicode_KIND(substring);
9490 data_sub = PyUnicode_DATA(substring);
9491 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9492
9493 if (direction > 0)
9494 offset = end;
9495 else
9496 offset = start;
9497
9498 if (PyUnicode_READ(kind_self, data_self, offset) ==
9499 PyUnicode_READ(kind_sub, data_sub, 0) &&
9500 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9501 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9502 /* If both are of the same kind, memcmp is sufficient */
9503 if (kind_self == kind_sub) {
9504 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009505 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009506 data_sub,
9507 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009508 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009509 }
9510 /* otherwise we have to compare each character by first accesing it */
9511 else {
9512 /* We do not need to compare 0 and len(substring)-1 because
9513 the if statement above ensured already that they are equal
9514 when we end up here. */
9515 // TODO: honor direction and do a forward or backwards search
9516 for (i = 1; i < end_sub; ++i) {
9517 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9518 PyUnicode_READ(kind_sub, data_sub, i))
9519 return 0;
9520 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009521 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009522 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009523 }
9524
9525 return 0;
9526}
9527
Alexander Belopolsky40018472011-02-26 01:02:56 +00009528Py_ssize_t
9529PyUnicode_Tailmatch(PyObject *str,
9530 PyObject *substr,
9531 Py_ssize_t start,
9532 Py_ssize_t end,
9533 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009534{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009535 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009536
Guido van Rossumd57fd912000-03-10 22:53:23 +00009537 str = PyUnicode_FromObject(str);
9538 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009539 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009540 substr = PyUnicode_FromObject(substr);
9541 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009542 Py_DECREF(str);
9543 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009544 }
Tim Petersced69f82003-09-16 20:30:58 +00009545
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009546 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009547 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009548 Py_DECREF(str);
9549 Py_DECREF(substr);
9550 return result;
9551}
9552
Guido van Rossumd57fd912000-03-10 22:53:23 +00009553/* Apply fixfct filter to the Unicode object self and return a
9554 reference to the modified object */
9555
Alexander Belopolsky40018472011-02-26 01:02:56 +00009556static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009557fixup(PyObject *self,
9558 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009559{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009560 PyObject *u;
9561 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009562 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009563
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009564 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009565 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009566 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009567 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009568
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009569 /* fix functions return the new maximum character in a string,
9570 if the kind of the resulting unicode object does not change,
9571 everything is fine. Otherwise we need to change the string kind
9572 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009573 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009574
9575 if (maxchar_new == 0) {
9576 /* no changes */;
9577 if (PyUnicode_CheckExact(self)) {
9578 Py_DECREF(u);
9579 Py_INCREF(self);
9580 return self;
9581 }
9582 else
9583 return u;
9584 }
9585
9586 if (maxchar_new <= 127)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009587 maxchar_new = 127;
9588 else if (maxchar_new <= 255)
9589 maxchar_new = 255;
9590 else if (maxchar_new <= 65535)
9591 maxchar_new = 65535;
9592 else
Victor Stinner8faf8212011-12-08 22:14:11 +01009593 maxchar_new = MAX_UNICODE;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009594
Victor Stinnereaab6042011-12-11 22:22:39 +01009595 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009596 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009597
9598 /* In case the maximum character changed, we need to
9599 convert the string to the new category. */
9600 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9601 if (v == NULL) {
9602 Py_DECREF(u);
9603 return NULL;
9604 }
9605 if (maxchar_new > maxchar_old) {
9606 /* If the maxchar increased so that the kind changed, not all
9607 characters are representable anymore and we need to fix the
9608 string again. This only happens in very few cases. */
9609 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
9610 maxchar_old = fixfct(v);
9611 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009612 }
9613 else {
Victor Stinnereaab6042011-12-11 22:22:39 +01009614 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009615 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009616 Py_DECREF(u);
9617 assert(_PyUnicode_CheckConsistency(v, 1));
9618 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009619}
9620
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009621static PyObject *
9622ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009623{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009624 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9625 char *resdata, *data = PyUnicode_DATA(self);
9626 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009627
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009628 res = PyUnicode_New(len, 127);
9629 if (res == NULL)
9630 return NULL;
9631 resdata = PyUnicode_DATA(res);
9632 if (lower)
9633 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009634 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009635 _Py_bytes_upper(resdata, data, len);
9636 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009637}
9638
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009639static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009640handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009641{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009642 Py_ssize_t j;
9643 int final_sigma;
9644 Py_UCS4 c;
9645 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009646
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009647 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9648
9649 where ! is a negation and \p{xxx} is a character with property xxx.
9650 */
9651 for (j = i - 1; j >= 0; j--) {
9652 c = PyUnicode_READ(kind, data, j);
9653 if (!_PyUnicode_IsCaseIgnorable(c))
9654 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009655 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009656 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9657 if (final_sigma) {
9658 for (j = i + 1; j < length; j++) {
9659 c = PyUnicode_READ(kind, data, j);
9660 if (!_PyUnicode_IsCaseIgnorable(c))
9661 break;
9662 }
9663 final_sigma = j == length || !_PyUnicode_IsCased(c);
9664 }
9665 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009666}
9667
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009668static int
9669lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9670 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009671{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009672 /* Obscure special case. */
9673 if (c == 0x3A3) {
9674 mapped[0] = handle_capital_sigma(kind, data, length, i);
9675 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009676 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009677 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009678}
9679
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009680static Py_ssize_t
9681do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009682{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009683 Py_ssize_t i, k = 0;
9684 int n_res, j;
9685 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009686
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009687 c = PyUnicode_READ(kind, data, 0);
9688 n_res = _PyUnicode_ToUpperFull(c, mapped);
9689 for (j = 0; j < n_res; j++) {
9690 if (mapped[j] > *maxchar)
9691 *maxchar = mapped[j];
9692 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009693 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009694 for (i = 1; i < length; i++) {
9695 c = PyUnicode_READ(kind, data, i);
9696 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9697 for (j = 0; j < n_res; j++) {
9698 if (mapped[j] > *maxchar)
9699 *maxchar = mapped[j];
9700 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009701 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009702 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009703 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009704}
9705
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009706static Py_ssize_t
9707do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9708 Py_ssize_t i, k = 0;
9709
9710 for (i = 0; i < length; i++) {
9711 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9712 int n_res, j;
9713 if (Py_UNICODE_ISUPPER(c)) {
9714 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9715 }
9716 else if (Py_UNICODE_ISLOWER(c)) {
9717 n_res = _PyUnicode_ToUpperFull(c, mapped);
9718 }
9719 else {
9720 n_res = 1;
9721 mapped[0] = c;
9722 }
9723 for (j = 0; j < n_res; j++) {
9724 if (mapped[j] > *maxchar)
9725 *maxchar = mapped[j];
9726 res[k++] = mapped[j];
9727 }
9728 }
9729 return k;
9730}
9731
9732static Py_ssize_t
9733do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9734 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009735{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009736 Py_ssize_t i, k = 0;
9737
9738 for (i = 0; i < length; i++) {
9739 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9740 int n_res, j;
9741 if (lower)
9742 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9743 else
9744 n_res = _PyUnicode_ToUpperFull(c, mapped);
9745 for (j = 0; j < n_res; j++) {
9746 if (mapped[j] > *maxchar)
9747 *maxchar = mapped[j];
9748 res[k++] = mapped[j];
9749 }
9750 }
9751 return k;
9752}
9753
9754static Py_ssize_t
9755do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9756{
9757 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9758}
9759
9760static Py_ssize_t
9761do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9762{
9763 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9764}
9765
Benjamin Petersone51757f2012-01-12 21:10:29 -05009766static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009767do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9768{
9769 Py_ssize_t i, k = 0;
9770
9771 for (i = 0; i < length; i++) {
9772 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9773 Py_UCS4 mapped[3];
9774 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9775 for (j = 0; j < n_res; j++) {
9776 if (mapped[j] > *maxchar)
9777 *maxchar = mapped[j];
9778 res[k++] = mapped[j];
9779 }
9780 }
9781 return k;
9782}
9783
9784static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009785do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9786{
9787 Py_ssize_t i, k = 0;
9788 int previous_is_cased;
9789
9790 previous_is_cased = 0;
9791 for (i = 0; i < length; i++) {
9792 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9793 Py_UCS4 mapped[3];
9794 int n_res, j;
9795
9796 if (previous_is_cased)
9797 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9798 else
9799 n_res = _PyUnicode_ToTitleFull(c, mapped);
9800
9801 for (j = 0; j < n_res; j++) {
9802 if (mapped[j] > *maxchar)
9803 *maxchar = mapped[j];
9804 res[k++] = mapped[j];
9805 }
9806
9807 previous_is_cased = _PyUnicode_IsCased(c);
9808 }
9809 return k;
9810}
9811
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009812static PyObject *
9813case_operation(PyObject *self,
9814 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9815{
9816 PyObject *res = NULL;
9817 Py_ssize_t length, newlength = 0;
9818 int kind, outkind;
9819 void *data, *outdata;
9820 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9821
Benjamin Petersoneea48462012-01-16 14:28:50 -05009822 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009823
9824 kind = PyUnicode_KIND(self);
9825 data = PyUnicode_DATA(self);
9826 length = PyUnicode_GET_LENGTH(self);
9827 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9828 if (tmp == NULL)
9829 return PyErr_NoMemory();
9830 newlength = perform(kind, data, length, tmp, &maxchar);
9831 res = PyUnicode_New(newlength, maxchar);
9832 if (res == NULL)
9833 goto leave;
9834 tmpend = tmp + newlength;
9835 outdata = PyUnicode_DATA(res);
9836 outkind = PyUnicode_KIND(res);
9837 switch (outkind) {
9838 case PyUnicode_1BYTE_KIND:
9839 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9840 break;
9841 case PyUnicode_2BYTE_KIND:
9842 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9843 break;
9844 case PyUnicode_4BYTE_KIND:
9845 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9846 break;
9847 default:
9848 assert(0);
9849 break;
9850 }
9851 leave:
9852 PyMem_FREE(tmp);
9853 return res;
9854}
9855
Tim Peters8ce9f162004-08-27 01:49:32 +00009856PyObject *
9857PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009858{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009859 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009860 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009861 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009862 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009863 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9864 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009865 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009866 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009867 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009868 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009869 int use_memcpy;
9870 unsigned char *res_data = NULL, *sep_data = NULL;
9871 PyObject *last_obj;
9872 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009873
Tim Peters05eba1f2004-08-27 21:32:02 +00009874 fseq = PySequence_Fast(seq, "");
9875 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009876 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009877 }
9878
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009879 /* NOTE: the following code can't call back into Python code,
9880 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009881 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009882
Tim Peters05eba1f2004-08-27 21:32:02 +00009883 seqlen = PySequence_Fast_GET_SIZE(fseq);
9884 /* If empty sequence, return u"". */
9885 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009886 Py_DECREF(fseq);
9887 Py_INCREF(unicode_empty);
9888 res = unicode_empty;
9889 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009890 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009891
Tim Peters05eba1f2004-08-27 21:32:02 +00009892 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009893 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009894 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009895 if (seqlen == 1) {
9896 if (PyUnicode_CheckExact(items[0])) {
9897 res = items[0];
9898 Py_INCREF(res);
9899 Py_DECREF(fseq);
9900 return res;
9901 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009902 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009903 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009904 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009905 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009906 /* Set up sep and seplen */
9907 if (separator == NULL) {
9908 /* fall back to a blank space separator */
9909 sep = PyUnicode_FromOrdinal(' ');
9910 if (!sep)
9911 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009912 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009913 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009914 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009915 else {
9916 if (!PyUnicode_Check(separator)) {
9917 PyErr_Format(PyExc_TypeError,
9918 "separator: expected str instance,"
9919 " %.80s found",
9920 Py_TYPE(separator)->tp_name);
9921 goto onError;
9922 }
9923 if (PyUnicode_READY(separator))
9924 goto onError;
9925 sep = separator;
9926 seplen = PyUnicode_GET_LENGTH(separator);
9927 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9928 /* inc refcount to keep this code path symmetric with the
9929 above case of a blank separator */
9930 Py_INCREF(sep);
9931 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009932 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009933 }
9934
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009935 /* There are at least two things to join, or else we have a subclass
9936 * of str in the sequence.
9937 * Do a pre-pass to figure out the total amount of space we'll
9938 * need (sz), and see whether all argument are strings.
9939 */
9940 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009941#ifdef Py_DEBUG
9942 use_memcpy = 0;
9943#else
9944 use_memcpy = 1;
9945#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009946 for (i = 0; i < seqlen; i++) {
9947 const Py_ssize_t old_sz = sz;
9948 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009949 if (!PyUnicode_Check(item)) {
9950 PyErr_Format(PyExc_TypeError,
9951 "sequence item %zd: expected str instance,"
9952 " %.80s found",
9953 i, Py_TYPE(item)->tp_name);
9954 goto onError;
9955 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009956 if (PyUnicode_READY(item) == -1)
9957 goto onError;
9958 sz += PyUnicode_GET_LENGTH(item);
9959 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009960 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009961 if (i != 0)
9962 sz += seplen;
9963 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9964 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009965 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009966 goto onError;
9967 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009968 if (use_memcpy && last_obj != NULL) {
9969 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9970 use_memcpy = 0;
9971 }
9972 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009973 }
Tim Petersced69f82003-09-16 20:30:58 +00009974
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009975 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009976 if (res == NULL)
9977 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009978
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009979 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009980#ifdef Py_DEBUG
9981 use_memcpy = 0;
9982#else
9983 if (use_memcpy) {
9984 res_data = PyUnicode_1BYTE_DATA(res);
9985 kind = PyUnicode_KIND(res);
9986 if (seplen != 0)
9987 sep_data = PyUnicode_1BYTE_DATA(sep);
9988 }
9989#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009990 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009991 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009992 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009993 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009994 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009995 if (use_memcpy) {
9996 Py_MEMCPY(res_data,
9997 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009998 kind * seplen);
9999 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010000 }
10001 else {
10002 copy_characters(res, res_offset, sep, 0, seplen);
10003 res_offset += seplen;
10004 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010005 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010006 itemlen = PyUnicode_GET_LENGTH(item);
10007 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +020010008 if (use_memcpy) {
10009 Py_MEMCPY(res_data,
10010 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010011 kind * itemlen);
10012 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010013 }
10014 else {
10015 copy_characters(res, res_offset, item, 0, itemlen);
10016 res_offset += itemlen;
10017 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010018 }
Tim Peters05eba1f2004-08-27 21:32:02 +000010019 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010020 if (use_memcpy)
10021 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010022 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +020010023 else
10024 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +000010025
Tim Peters05eba1f2004-08-27 21:32:02 +000010026 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010027 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010028 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010029 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010030
Benjamin Peterson29060642009-01-31 22:14:21 +000010031 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +000010032 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010033 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010034 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010035 return NULL;
10036}
10037
/* FILL(kind, data, value, start, length)

   Store the code point `value` into `length` consecutive characters of the
   character buffer `data`, beginning at character index `start`.  `kind`
   selects the width of one character (1, 2 or 4 bytes); the wchar_t kind is
   not supported and trips an assertion, as does any unknown kind.

   The macro expands to a single statement.  Every argument is parenthesized
   in the expansion, but `kind`, `value` and `length` are evaluated more than
   once, so callers must pass expressions without side effects.  No bounds
   checking is done here: the caller guarantees that start + length fits in
   the buffer and that `value` fits in one character of the given kind. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
10061
Victor Stinner3fe55312012-01-04 00:33:50 +010010062Py_ssize_t
10063PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10064 Py_UCS4 fill_char)
10065{
10066 Py_ssize_t maxlen;
10067 enum PyUnicode_Kind kind;
10068 void *data;
10069
10070 if (!PyUnicode_Check(unicode)) {
10071 PyErr_BadInternalCall();
10072 return -1;
10073 }
10074 if (PyUnicode_READY(unicode) == -1)
10075 return -1;
10076 if (unicode_check_modifiable(unicode))
10077 return -1;
10078
10079 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10080 PyErr_SetString(PyExc_ValueError,
10081 "fill character is bigger than "
10082 "the string maximum character");
10083 return -1;
10084 }
10085
10086 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10087 length = Py_MIN(maxlen, length);
10088 if (length <= 0)
10089 return 0;
10090
10091 kind = PyUnicode_KIND(unicode);
10092 data = PyUnicode_DATA(unicode);
10093 FILL(kind, data, fill_char, start, length);
10094 return length;
10095}
10096
Victor Stinner9310abb2011-10-05 00:59:23 +020010097static PyObject *
10098pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010099 Py_ssize_t left,
10100 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010101 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010102{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010103 PyObject *u;
10104 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010105 int kind;
10106 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010107
10108 if (left < 0)
10109 left = 0;
10110 if (right < 0)
10111 right = 0;
10112
Victor Stinnerc4b49542011-12-11 22:44:26 +010010113 if (left == 0 && right == 0)
10114 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010115
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010116 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10117 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010118 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10119 return NULL;
10120 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010121 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10122 if (fill > maxchar)
10123 maxchar = fill;
10124 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010125 if (!u)
10126 return NULL;
10127
10128 kind = PyUnicode_KIND(u);
10129 data = PyUnicode_DATA(u);
10130 if (left)
10131 FILL(kind, data, fill, 0, left);
10132 if (right)
10133 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010134 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010135 assert(_PyUnicode_CheckConsistency(u, 1));
10136 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010137}
10138
Alexander Belopolsky40018472011-02-26 01:02:56 +000010139PyObject *
10140PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010141{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010142 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010143
10144 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010145 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010146 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060010147 if (PyUnicode_READY(string) == -1) {
10148 Py_DECREF(string);
10149 return NULL;
10150 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010151
Benjamin Petersonead6b532011-12-20 17:23:42 -060010152 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010153 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010154 if (PyUnicode_IS_ASCII(string))
10155 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010156 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010157 PyUnicode_GET_LENGTH(string), keepends);
10158 else
10159 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010160 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010161 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010162 break;
10163 case PyUnicode_2BYTE_KIND:
10164 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010165 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010166 PyUnicode_GET_LENGTH(string), keepends);
10167 break;
10168 case PyUnicode_4BYTE_KIND:
10169 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010170 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010171 PyUnicode_GET_LENGTH(string), keepends);
10172 break;
10173 default:
10174 assert(0);
10175 list = 0;
10176 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010177 Py_DECREF(string);
10178 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010179}
10180
Alexander Belopolsky40018472011-02-26 01:02:56 +000010181static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010182split(PyObject *self,
10183 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010184 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010185{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010186 int kind1, kind2, kind;
10187 void *buf1, *buf2;
10188 Py_ssize_t len1, len2;
10189 PyObject* out;
10190
Guido van Rossumd57fd912000-03-10 22:53:23 +000010191 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010192 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010193
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010194 if (PyUnicode_READY(self) == -1)
10195 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010196
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010197 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010198 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010199 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010200 if (PyUnicode_IS_ASCII(self))
10201 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010202 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010203 PyUnicode_GET_LENGTH(self), maxcount
10204 );
10205 else
10206 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010207 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010208 PyUnicode_GET_LENGTH(self), maxcount
10209 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010210 case PyUnicode_2BYTE_KIND:
10211 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010212 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010213 PyUnicode_GET_LENGTH(self), maxcount
10214 );
10215 case PyUnicode_4BYTE_KIND:
10216 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010217 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010218 PyUnicode_GET_LENGTH(self), maxcount
10219 );
10220 default:
10221 assert(0);
10222 return NULL;
10223 }
10224
10225 if (PyUnicode_READY(substring) == -1)
10226 return NULL;
10227
10228 kind1 = PyUnicode_KIND(self);
10229 kind2 = PyUnicode_KIND(substring);
10230 kind = kind1 > kind2 ? kind1 : kind2;
10231 buf1 = PyUnicode_DATA(self);
10232 buf2 = PyUnicode_DATA(substring);
10233 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010234 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010235 if (!buf1)
10236 return NULL;
10237 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010238 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010239 if (!buf2) {
10240 if (kind1 != kind) PyMem_Free(buf1);
10241 return NULL;
10242 }
10243 len1 = PyUnicode_GET_LENGTH(self);
10244 len2 = PyUnicode_GET_LENGTH(substring);
10245
Benjamin Petersonead6b532011-12-20 17:23:42 -060010246 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010247 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010248 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10249 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010250 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010251 else
10252 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010253 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010254 break;
10255 case PyUnicode_2BYTE_KIND:
10256 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010257 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010258 break;
10259 case PyUnicode_4BYTE_KIND:
10260 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010261 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010262 break;
10263 default:
10264 out = NULL;
10265 }
10266 if (kind1 != kind)
10267 PyMem_Free(buf1);
10268 if (kind2 != kind)
10269 PyMem_Free(buf2);
10270 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010271}
10272
Alexander Belopolsky40018472011-02-26 01:02:56 +000010273static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010274rsplit(PyObject *self,
10275 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010276 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010277{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010278 int kind1, kind2, kind;
10279 void *buf1, *buf2;
10280 Py_ssize_t len1, len2;
10281 PyObject* out;
10282
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010283 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010284 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010285
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010286 if (PyUnicode_READY(self) == -1)
10287 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010288
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010289 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010290 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010291 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010292 if (PyUnicode_IS_ASCII(self))
10293 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010294 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010295 PyUnicode_GET_LENGTH(self), maxcount
10296 );
10297 else
10298 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010299 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010300 PyUnicode_GET_LENGTH(self), maxcount
10301 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010302 case PyUnicode_2BYTE_KIND:
10303 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010304 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010305 PyUnicode_GET_LENGTH(self), maxcount
10306 );
10307 case PyUnicode_4BYTE_KIND:
10308 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010309 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010310 PyUnicode_GET_LENGTH(self), maxcount
10311 );
10312 default:
10313 assert(0);
10314 return NULL;
10315 }
10316
10317 if (PyUnicode_READY(substring) == -1)
10318 return NULL;
10319
10320 kind1 = PyUnicode_KIND(self);
10321 kind2 = PyUnicode_KIND(substring);
10322 kind = kind1 > kind2 ? kind1 : kind2;
10323 buf1 = PyUnicode_DATA(self);
10324 buf2 = PyUnicode_DATA(substring);
10325 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010326 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010327 if (!buf1)
10328 return NULL;
10329 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010330 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010331 if (!buf2) {
10332 if (kind1 != kind) PyMem_Free(buf1);
10333 return NULL;
10334 }
10335 len1 = PyUnicode_GET_LENGTH(self);
10336 len2 = PyUnicode_GET_LENGTH(substring);
10337
Benjamin Petersonead6b532011-12-20 17:23:42 -060010338 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010339 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010340 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10341 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010342 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010343 else
10344 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010345 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010346 break;
10347 case PyUnicode_2BYTE_KIND:
10348 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010349 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010350 break;
10351 case PyUnicode_4BYTE_KIND:
10352 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010353 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010354 break;
10355 default:
10356 out = NULL;
10357 }
10358 if (kind1 != kind)
10359 PyMem_Free(buf1);
10360 if (kind2 != kind)
10361 PyMem_Free(buf2);
10362 return out;
10363}
10364
10365static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010366anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10367 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010368{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010369 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010370 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010371 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10372 return asciilib_find(buf1, len1, buf2, len2, offset);
10373 else
10374 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010375 case PyUnicode_2BYTE_KIND:
10376 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10377 case PyUnicode_4BYTE_KIND:
10378 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10379 }
10380 assert(0);
10381 return -1;
10382}
10383
10384static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010385anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10386 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010387{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010388 switch (kind) {
10389 case PyUnicode_1BYTE_KIND:
10390 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10391 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10392 else
10393 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10394 case PyUnicode_2BYTE_KIND:
10395 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10396 case PyUnicode_4BYTE_KIND:
10397 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10398 }
10399 assert(0);
10400 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010401}
10402
Alexander Belopolsky40018472011-02-26 01:02:56 +000010403static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010404replace(PyObject *self, PyObject *str1,
10405 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010406{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010407 PyObject *u;
10408 char *sbuf = PyUnicode_DATA(self);
10409 char *buf1 = PyUnicode_DATA(str1);
10410 char *buf2 = PyUnicode_DATA(str2);
10411 int srelease = 0, release1 = 0, release2 = 0;
10412 int skind = PyUnicode_KIND(self);
10413 int kind1 = PyUnicode_KIND(str1);
10414 int kind2 = PyUnicode_KIND(str2);
10415 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10416 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10417 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010418 int mayshrink;
10419 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010420
10421 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010422 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010423 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010424 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010425
Victor Stinner59de0ee2011-10-07 10:01:28 +020010426 if (str1 == str2)
10427 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010428 if (skind < kind1)
10429 /* substring too wide to be present */
10430 goto nothing;
10431
Victor Stinner49a0a212011-10-12 23:46:10 +020010432 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10433 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10434 /* Replacing str1 with str2 may cause a maxchar reduction in the
10435 result string. */
10436 mayshrink = (maxchar_str2 < maxchar);
10437 maxchar = Py_MAX(maxchar, maxchar_str2);
10438
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010439 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010440 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010441 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010442 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010443 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010444 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010445 Py_UCS4 u1, u2;
10446 int rkind;
Victor Stinnerf6441102011-12-18 02:43:08 +010010447 Py_ssize_t index, pos;
10448 char *src;
10449
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010450 u1 = PyUnicode_READ_CHAR(str1, 0);
Victor Stinnerf6441102011-12-18 02:43:08 +010010451 pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
10452 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010453 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010454 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010455 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010456 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010457 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010458 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010459 rkind = PyUnicode_KIND(u);
Victor Stinnerf6441102011-12-18 02:43:08 +010010460
10461 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
10462 index = 0;
10463 src = sbuf;
10464 while (--maxcount)
10465 {
10466 pos++;
10467 src += pos * PyUnicode_KIND(self);
10468 slen -= pos;
10469 index += pos;
10470 pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
10471 if (pos < 0)
10472 break;
10473 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
10474 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010475 }
10476 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010477 int rkind = skind;
10478 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010479 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010480
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010481 if (kind1 < rkind) {
10482 /* widen substring */
10483 buf1 = _PyUnicode_AsKind(str1, rkind);
10484 if (!buf1) goto error;
10485 release1 = 1;
10486 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010487 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010488 if (i < 0)
10489 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010490 if (rkind > kind2) {
10491 /* widen replacement */
10492 buf2 = _PyUnicode_AsKind(str2, rkind);
10493 if (!buf2) goto error;
10494 release2 = 1;
10495 }
10496 else if (rkind < kind2) {
10497 /* widen self and buf1 */
10498 rkind = kind2;
10499 if (release1) PyMem_Free(buf1);
10500 sbuf = _PyUnicode_AsKind(self, rkind);
10501 if (!sbuf) goto error;
10502 srelease = 1;
10503 buf1 = _PyUnicode_AsKind(str1, rkind);
10504 if (!buf1) goto error;
10505 release1 = 1;
10506 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010507 u = PyUnicode_New(slen, maxchar);
10508 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010509 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010510 assert(PyUnicode_KIND(u) == rkind);
10511 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010512
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010513 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010514 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010515 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010516 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010517 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010518 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010519
10520 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010521 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010522 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010523 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010524 if (i == -1)
10525 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010526 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010527 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010528 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010529 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010530 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010531 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010532 }
10533 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010534 Py_ssize_t n, i, j, ires;
10535 Py_ssize_t product, new_size;
10536 int rkind = skind;
10537 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010538
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010539 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010540 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010541 buf1 = _PyUnicode_AsKind(str1, rkind);
10542 if (!buf1) goto error;
10543 release1 = 1;
10544 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010545 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010546 if (n == 0)
10547 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010548 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010549 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010550 buf2 = _PyUnicode_AsKind(str2, rkind);
10551 if (!buf2) goto error;
10552 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010553 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010554 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010555 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010556 rkind = kind2;
10557 sbuf = _PyUnicode_AsKind(self, rkind);
10558 if (!sbuf) goto error;
10559 srelease = 1;
10560 if (release1) PyMem_Free(buf1);
10561 buf1 = _PyUnicode_AsKind(str1, rkind);
10562 if (!buf1) goto error;
10563 release1 = 1;
10564 }
10565 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10566 PyUnicode_GET_LENGTH(str1))); */
10567 product = n * (len2-len1);
10568 if ((product / (len2-len1)) != n) {
10569 PyErr_SetString(PyExc_OverflowError,
10570 "replace string is too long");
10571 goto error;
10572 }
10573 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010574 if (new_size == 0) {
10575 Py_INCREF(unicode_empty);
10576 u = unicode_empty;
10577 goto done;
10578 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010579 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10580 PyErr_SetString(PyExc_OverflowError,
10581 "replace string is too long");
10582 goto error;
10583 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010584 u = PyUnicode_New(new_size, maxchar);
10585 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010586 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010587 assert(PyUnicode_KIND(u) == rkind);
10588 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010589 ires = i = 0;
10590 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010591 while (n-- > 0) {
10592 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010593 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010594 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010595 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010596 if (j == -1)
10597 break;
10598 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010599 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010600 memcpy(res + rkind * ires,
10601 sbuf + rkind * i,
10602 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010603 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010604 }
10605 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010606 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010607 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010608 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010609 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010610 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010611 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010612 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010613 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010614 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010615 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010616 memcpy(res + rkind * ires,
10617 sbuf + rkind * i,
10618 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010619 }
10620 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010621 /* interleave */
10622 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010623 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010624 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010625 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010626 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010627 if (--n <= 0)
10628 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010629 memcpy(res + rkind * ires,
10630 sbuf + rkind * i,
10631 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010632 ires++;
10633 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010634 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010635 memcpy(res + rkind * ires,
10636 sbuf + rkind * i,
10637 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010638 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010639 }
10640
10641 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010642 unicode_adjust_maxchar(&u);
10643 if (u == NULL)
10644 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010645 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010646
10647 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010648 if (srelease)
10649 PyMem_FREE(sbuf);
10650 if (release1)
10651 PyMem_FREE(buf1);
10652 if (release2)
10653 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010654 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010655 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010656
Benjamin Peterson29060642009-01-31 22:14:21 +000010657 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010658 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010659 if (srelease)
10660 PyMem_FREE(sbuf);
10661 if (release1)
10662 PyMem_FREE(buf1);
10663 if (release2)
10664 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010665 return unicode_result_unchanged(self);
10666
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010667 error:
10668 if (srelease && sbuf)
10669 PyMem_FREE(sbuf);
10670 if (release1 && buf1)
10671 PyMem_FREE(buf1);
10672 if (release2 && buf2)
10673 PyMem_FREE(buf2);
10674 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010675}
10676
10677/* --- Unicode Object Methods --------------------------------------------- */
10678
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010679PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010680 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010681\n\
10682Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010683characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010684
10685static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010686unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010687{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010688 if (PyUnicode_READY(self) == -1)
10689 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010690 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010691}
10692
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010693PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010694 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010695\n\
10696Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010697have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010698
10699static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010700unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010701{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010702 if (PyUnicode_READY(self) == -1)
10703 return NULL;
10704 if (PyUnicode_GET_LENGTH(self) == 0)
10705 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010706 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010707}
10708
Benjamin Petersond5890c82012-01-14 13:23:30 -050010709PyDoc_STRVAR(casefold__doc__,
10710 "S.casefold() -> str\n\
10711\n\
10712Return a version of S suitable for caseless comparisons.");
10713
10714static PyObject *
10715unicode_casefold(PyObject *self)
10716{
10717 if (PyUnicode_READY(self) == -1)
10718 return NULL;
10719 if (PyUnicode_IS_ASCII(self))
10720 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010721 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010722}
10723
10724
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010725/* Argument converter. Coerces to a single unicode character */
10726
10727static int
10728convert_uc(PyObject *obj, void *addr)
10729{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010730 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010731 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010732
Benjamin Peterson14339b62009-01-31 16:36:08 +000010733 uniobj = PyUnicode_FromObject(obj);
10734 if (uniobj == NULL) {
10735 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010736 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010737 return 0;
10738 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010739 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010740 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010741 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010742 Py_DECREF(uniobj);
10743 return 0;
10744 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010745 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010746 Py_DECREF(uniobj);
10747 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010748}
10749
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010750PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010751 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010752\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010753Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010754done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010755
10756static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010757unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010758{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010759 Py_ssize_t marg, left;
10760 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010761 Py_UCS4 fillchar = ' ';
10762
Victor Stinnere9a29352011-10-01 02:14:59 +020010763 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010764 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010765
Benjamin Petersonbac79492012-01-14 13:34:47 -050010766 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010767 return NULL;
10768
Victor Stinnerc4b49542011-12-11 22:44:26 +010010769 if (PyUnicode_GET_LENGTH(self) >= width)
10770 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010771
Victor Stinnerc4b49542011-12-11 22:44:26 +010010772 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010773 left = marg / 2 + (marg & width & 1);
10774
Victor Stinner9310abb2011-10-05 00:59:23 +020010775 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010776}
10777
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010778/* This function assumes that str1 and str2 are readied by the caller. */
10779
Marc-André Lemburge5034372000-08-08 08:04:29 +000010780static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010781unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010782{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010783 int kind1, kind2;
10784 void *data1, *data2;
10785 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010786
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010787 kind1 = PyUnicode_KIND(str1);
10788 kind2 = PyUnicode_KIND(str2);
10789 data1 = PyUnicode_DATA(str1);
10790 data2 = PyUnicode_DATA(str2);
10791 len1 = PyUnicode_GET_LENGTH(str1);
10792 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010793
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010794 for (i = 0; i < len1 && i < len2; ++i) {
10795 Py_UCS4 c1, c2;
10796 c1 = PyUnicode_READ(kind1, data1, i);
10797 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010798
10799 if (c1 != c2)
10800 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010801 }
10802
10803 return (len1 < len2) ? -1 : (len1 != len2);
10804}
10805
Alexander Belopolsky40018472011-02-26 01:02:56 +000010806int
10807PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010808{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010809 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10810 if (PyUnicode_READY(left) == -1 ||
10811 PyUnicode_READY(right) == -1)
10812 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010813 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010814 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010815 PyErr_Format(PyExc_TypeError,
10816 "Can't compare %.100s and %.100s",
10817 left->ob_type->tp_name,
10818 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010819 return -1;
10820}
10821
Martin v. Löwis5b222132007-06-10 09:51:05 +000010822int
10823PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10824{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010825 Py_ssize_t i;
10826 int kind;
10827 void *data;
10828 Py_UCS4 chr;
10829
Victor Stinner910337b2011-10-03 03:20:16 +020010830 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010831 if (PyUnicode_READY(uni) == -1)
10832 return -1;
10833 kind = PyUnicode_KIND(uni);
10834 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010835 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010836 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10837 if (chr != str[i])
10838 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010839 /* This check keeps Python strings that end in '\0' from comparing equal
10840 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010841 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010842 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010843 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010844 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010845 return 0;
10846}
10847
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010848
Benjamin Peterson29060642009-01-31 22:14:21 +000010849#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010850 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010851
Alexander Belopolsky40018472011-02-26 01:02:56 +000010852PyObject *
10853PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010854{
10855 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010856
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010857 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10858 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010859 if (PyUnicode_READY(left) == -1 ||
10860 PyUnicode_READY(right) == -1)
10861 return NULL;
10862 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10863 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010864 if (op == Py_EQ) {
10865 Py_INCREF(Py_False);
10866 return Py_False;
10867 }
10868 if (op == Py_NE) {
10869 Py_INCREF(Py_True);
10870 return Py_True;
10871 }
10872 }
10873 if (left == right)
10874 result = 0;
10875 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010876 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010877
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010878 /* Convert the return value to a Boolean */
10879 switch (op) {
10880 case Py_EQ:
10881 v = TEST_COND(result == 0);
10882 break;
10883 case Py_NE:
10884 v = TEST_COND(result != 0);
10885 break;
10886 case Py_LE:
10887 v = TEST_COND(result <= 0);
10888 break;
10889 case Py_GE:
10890 v = TEST_COND(result >= 0);
10891 break;
10892 case Py_LT:
10893 v = TEST_COND(result == -1);
10894 break;
10895 case Py_GT:
10896 v = TEST_COND(result == 1);
10897 break;
10898 default:
10899 PyErr_BadArgument();
10900 return NULL;
10901 }
10902 Py_INCREF(v);
10903 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010904 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010905
Brian Curtindfc80e32011-08-10 20:28:54 -050010906 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010907}
10908
Alexander Belopolsky40018472011-02-26 01:02:56 +000010909int
10910PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010911{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010912 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010913 int kind1, kind2, kind;
10914 void *buf1, *buf2;
10915 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010916 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010917
10918 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010919 sub = PyUnicode_FromObject(element);
10920 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010921 PyErr_Format(PyExc_TypeError,
10922 "'in <string>' requires string as left operand, not %s",
10923 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010924 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010925 }
10926
Thomas Wouters477c8d52006-05-27 19:21:47 +000010927 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010928 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010929 Py_DECREF(sub);
10930 return -1;
10931 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060010932 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
10933 Py_DECREF(sub);
10934 Py_DECREF(str);
10935 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010936
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010937 kind1 = PyUnicode_KIND(str);
10938 kind2 = PyUnicode_KIND(sub);
10939 kind = kind1 > kind2 ? kind1 : kind2;
10940 buf1 = PyUnicode_DATA(str);
10941 buf2 = PyUnicode_DATA(sub);
10942 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010943 buf1 = _PyUnicode_AsKind(str, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010944 if (!buf1) {
10945 Py_DECREF(sub);
10946 return -1;
10947 }
10948 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010949 buf2 = _PyUnicode_AsKind(sub, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010950 if (!buf2) {
10951 Py_DECREF(sub);
10952 if (kind1 != kind) PyMem_Free(buf1);
10953 return -1;
10954 }
10955 len1 = PyUnicode_GET_LENGTH(str);
10956 len2 = PyUnicode_GET_LENGTH(sub);
10957
Benjamin Petersonead6b532011-12-20 17:23:42 -060010958 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010959 case PyUnicode_1BYTE_KIND:
10960 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10961 break;
10962 case PyUnicode_2BYTE_KIND:
10963 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10964 break;
10965 case PyUnicode_4BYTE_KIND:
10966 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10967 break;
10968 default:
10969 result = -1;
10970 assert(0);
10971 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010972
10973 Py_DECREF(str);
10974 Py_DECREF(sub);
10975
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010976 if (kind1 != kind)
10977 PyMem_Free(buf1);
10978 if (kind2 != kind)
10979 PyMem_Free(buf2);
10980
Guido van Rossum403d68b2000-03-13 15:55:09 +000010981 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010982}
10983
Guido van Rossumd57fd912000-03-10 22:53:23 +000010984/* Concat to string or Unicode object giving a new Unicode object. */
10985
Alexander Belopolsky40018472011-02-26 01:02:56 +000010986PyObject *
10987PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010988{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010989 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010990 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010991 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010992
10993 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010994 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010995 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010996 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010997 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010998 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010999 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011000
11001 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020011002 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011003 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011004 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011005 }
Victor Stinnera464fc12011-10-02 20:39:30 +020011006 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011007 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011008 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011009 }
11010
Victor Stinner488fa492011-12-12 00:01:39 +010011011 u_len = PyUnicode_GET_LENGTH(u);
11012 v_len = PyUnicode_GET_LENGTH(v);
11013 if (u_len > PY_SSIZE_T_MAX - v_len) {
11014 PyErr_SetString(PyExc_OverflowError,
11015 "strings are too large to concat");
11016 goto onError;
11017 }
11018 new_len = u_len + v_len;
11019
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011020 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020011021 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
11022 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011023
Guido van Rossumd57fd912000-03-10 22:53:23 +000011024 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010011025 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011026 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011027 goto onError;
Victor Stinner488fa492011-12-12 00:01:39 +010011028 copy_characters(w, 0, u, 0, u_len);
11029 copy_characters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011030 Py_DECREF(u);
11031 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011032 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011033 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011034
Benjamin Peterson29060642009-01-31 22:14:21 +000011035 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011036 Py_XDECREF(u);
11037 Py_XDECREF(v);
11038 return NULL;
11039}
11040
Walter Dörwald1ab83302007-05-18 17:15:44 +000011041void
Victor Stinner23e56682011-10-03 03:54:37 +020011042PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011043{
Victor Stinner23e56682011-10-03 03:54:37 +020011044 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011045 Py_UCS4 maxchar, maxchar2;
11046 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011047
11048 if (p_left == NULL) {
11049 if (!PyErr_Occurred())
11050 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011051 return;
11052 }
Victor Stinner23e56682011-10-03 03:54:37 +020011053 left = *p_left;
11054 if (right == NULL || !PyUnicode_Check(left)) {
11055 if (!PyErr_Occurred())
11056 PyErr_BadInternalCall();
11057 goto error;
11058 }
11059
Benjamin Petersonbac79492012-01-14 13:34:47 -050011060 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011061 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011062 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011063 goto error;
11064
Victor Stinner488fa492011-12-12 00:01:39 +010011065 /* Shortcuts */
11066 if (left == unicode_empty) {
11067 Py_DECREF(left);
11068 Py_INCREF(right);
11069 *p_left = right;
11070 return;
11071 }
11072 if (right == unicode_empty)
11073 return;
11074
11075 left_len = PyUnicode_GET_LENGTH(left);
11076 right_len = PyUnicode_GET_LENGTH(right);
11077 if (left_len > PY_SSIZE_T_MAX - right_len) {
11078 PyErr_SetString(PyExc_OverflowError,
11079 "strings are too large to concat");
11080 goto error;
11081 }
11082 new_len = left_len + right_len;
11083
11084 if (unicode_modifiable(left)
11085 && PyUnicode_CheckExact(right)
11086 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011087 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11088 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011089 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011090 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011091 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11092 {
11093 /* append inplace */
11094 if (unicode_resize(p_left, new_len) != 0) {
11095 /* XXX if _PyUnicode_Resize() fails, 'left' has been
11096 * deallocated so it cannot be put back into
11097 * 'variable'. The MemoryError is raised when there
11098 * is no value in 'variable', which might (very
11099 * remotely) be a cause of incompatibilities.
11100 */
11101 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020011102 }
Victor Stinner488fa492011-12-12 00:01:39 +010011103 /* copy 'right' into the newly allocated area of 'left' */
11104 copy_characters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011105 }
Victor Stinner488fa492011-12-12 00:01:39 +010011106 else {
11107 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11108 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11109 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011110
Victor Stinner488fa492011-12-12 00:01:39 +010011111 /* Concat the two Unicode strings */
11112 res = PyUnicode_New(new_len, maxchar);
11113 if (res == NULL)
11114 goto error;
11115 copy_characters(res, 0, left, 0, left_len);
11116 copy_characters(res, left_len, right, 0, right_len);
11117 Py_DECREF(left);
11118 *p_left = res;
11119 }
11120 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011121 return;
11122
11123error:
Victor Stinner488fa492011-12-12 00:01:39 +010011124 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011125}
11126
11127void
11128PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11129{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011130 PyUnicode_Append(pleft, right);
11131 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011132}
11133
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011134PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011135 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011136\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011137Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011138string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011139interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011140
11141static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011142unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011143{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011144 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011145 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011146 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011147 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011148 int kind1, kind2, kind;
11149 void *buf1, *buf2;
11150 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011151
Jesus Ceaac451502011-04-20 17:09:23 +020011152 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
11153 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011154 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011155
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011156 kind1 = PyUnicode_KIND(self);
11157 kind2 = PyUnicode_KIND(substring);
11158 kind = kind1 > kind2 ? kind1 : kind2;
11159 buf1 = PyUnicode_DATA(self);
11160 buf2 = PyUnicode_DATA(substring);
11161 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010011162 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011163 if (!buf1) {
11164 Py_DECREF(substring);
11165 return NULL;
11166 }
11167 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010011168 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011169 if (!buf2) {
11170 Py_DECREF(substring);
11171 if (kind1 != kind) PyMem_Free(buf1);
11172 return NULL;
11173 }
11174 len1 = PyUnicode_GET_LENGTH(self);
11175 len2 = PyUnicode_GET_LENGTH(substring);
11176
11177 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060011178 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011179 case PyUnicode_1BYTE_KIND:
11180 iresult = ucs1lib_count(
11181 ((Py_UCS1*)buf1) + start, end - start,
11182 buf2, len2, PY_SSIZE_T_MAX
11183 );
11184 break;
11185 case PyUnicode_2BYTE_KIND:
11186 iresult = ucs2lib_count(
11187 ((Py_UCS2*)buf1) + start, end - start,
11188 buf2, len2, PY_SSIZE_T_MAX
11189 );
11190 break;
11191 case PyUnicode_4BYTE_KIND:
11192 iresult = ucs4lib_count(
11193 ((Py_UCS4*)buf1) + start, end - start,
11194 buf2, len2, PY_SSIZE_T_MAX
11195 );
11196 break;
11197 default:
11198 assert(0); iresult = 0;
11199 }
11200
11201 result = PyLong_FromSsize_t(iresult);
11202
11203 if (kind1 != kind)
11204 PyMem_Free(buf1);
11205 if (kind2 != kind)
11206 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011207
11208 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011209
Guido van Rossumd57fd912000-03-10 22:53:23 +000011210 return result;
11211}
11212
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011213PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011214 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011215\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011216Encode S using the codec registered for encoding. Default encoding\n\
11217is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011218handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011219a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11220'xmlcharrefreplace' as well as any other name registered with\n\
11221codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011222
11223static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011224unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011225{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011226 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011227 char *encoding = NULL;
11228 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011229
Benjamin Peterson308d6372009-09-18 21:42:35 +000011230 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11231 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011232 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011233 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011234}
11235
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011236PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011237 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011238\n\
11239Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011240If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011241
11242static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011243unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011244{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011245 Py_ssize_t i, j, line_pos, src_len, incr;
11246 Py_UCS4 ch;
11247 PyObject *u;
11248 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011249 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011250 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011251 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011252
11253 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011254 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011255
Antoine Pitrou22425222011-10-04 19:10:51 +020011256 if (PyUnicode_READY(self) == -1)
11257 return NULL;
11258
Thomas Wouters7e474022000-07-16 12:04:32 +000011259 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011260 src_len = PyUnicode_GET_LENGTH(self);
11261 i = j = line_pos = 0;
11262 kind = PyUnicode_KIND(self);
11263 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011264 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011265 for (; i < src_len; i++) {
11266 ch = PyUnicode_READ(kind, src_data, i);
11267 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011268 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011269 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011270 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011271 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011272 goto overflow;
11273 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011274 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011275 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011276 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011277 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011278 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011279 goto overflow;
11280 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011281 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011282 if (ch == '\n' || ch == '\r')
11283 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011284 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011285 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011286 if (!found)
11287 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011288
Guido van Rossumd57fd912000-03-10 22:53:23 +000011289 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011290 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011291 if (!u)
11292 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011293 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011294
Antoine Pitroue71d5742011-10-04 15:55:09 +020011295 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011296
Antoine Pitroue71d5742011-10-04 15:55:09 +020011297 for (; i < src_len; i++) {
11298 ch = PyUnicode_READ(kind, src_data, i);
11299 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011300 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011301 incr = tabsize - (line_pos % tabsize);
11302 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011303 FILL(kind, dest_data, ' ', j, incr);
11304 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011305 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011306 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011307 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011308 line_pos++;
11309 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011310 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011311 if (ch == '\n' || ch == '\r')
11312 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011313 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011314 }
11315 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011316 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011317
Antoine Pitroue71d5742011-10-04 15:55:09 +020011318 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011319 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11320 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011321}
11322
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011323PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011324 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011325\n\
11326Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011327such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011328arguments start and end are interpreted as in slice notation.\n\
11329\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011330Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011331
11332static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011333unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011334{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011335 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011336 Py_ssize_t start;
11337 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011338 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011339
Jesus Ceaac451502011-04-20 17:09:23 +020011340 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11341 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011342 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011343
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011344 if (PyUnicode_READY(self) == -1)
11345 return NULL;
11346 if (PyUnicode_READY(substring) == -1)
11347 return NULL;
11348
Victor Stinner7931d9a2011-11-04 00:22:48 +010011349 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011350
11351 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011352
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011353 if (result == -2)
11354 return NULL;
11355
Christian Heimes217cfd12007-12-02 14:31:20 +000011356 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011357}
11358
11359static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011360unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011361{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011362 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
11363 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011364 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011365 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011366}
11367
Guido van Rossumc2504932007-09-18 19:42:40 +000011368/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011369 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011370static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011371unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011372{
Guido van Rossumc2504932007-09-18 19:42:40 +000011373 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010011374 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011375
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011376#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011377 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011378#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011379 if (_PyUnicode_HASH(self) != -1)
11380 return _PyUnicode_HASH(self);
11381 if (PyUnicode_READY(self) == -1)
11382 return -1;
11383 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011384 /*
11385 We make the hash of the empty string be 0, rather than using
11386 (prefix ^ suffix), since this slightly obfuscates the hash secret
11387 */
11388 if (len == 0) {
11389 _PyUnicode_HASH(self) = 0;
11390 return 0;
11391 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011392
11393 /* The hash function as a macro, gets expanded three times below. */
Georg Brandl2fb477c2012-02-21 00:33:36 +010011394#define HASH(P) \
11395 x ^= (Py_uhash_t) *P << 7; \
11396 while (--len >= 0) \
11397 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011398
Georg Brandl2fb477c2012-02-21 00:33:36 +010011399 x = (Py_uhash_t) _Py_HashSecret.prefix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011400 switch (PyUnicode_KIND(self)) {
11401 case PyUnicode_1BYTE_KIND: {
11402 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11403 HASH(c);
11404 break;
11405 }
11406 case PyUnicode_2BYTE_KIND: {
11407 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11408 HASH(s);
11409 break;
11410 }
11411 default: {
11412 Py_UCS4 *l;
11413 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11414 "Impossible switch case in unicode_hash");
11415 l = PyUnicode_4BYTE_DATA(self);
11416 HASH(l);
11417 break;
11418 }
11419 }
Georg Brandl2fb477c2012-02-21 00:33:36 +010011420 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
11421 x ^= (Py_uhash_t) _Py_HashSecret.suffix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011422
Guido van Rossumc2504932007-09-18 19:42:40 +000011423 if (x == -1)
11424 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011425 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011426 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011427}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011428#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011429
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011430PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011431 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011432\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011433Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011434
11435static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011436unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011437{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011438 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011439 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011440 Py_ssize_t start;
11441 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011442
Jesus Ceaac451502011-04-20 17:09:23 +020011443 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11444 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011445 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011446
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011447 if (PyUnicode_READY(self) == -1)
11448 return NULL;
11449 if (PyUnicode_READY(substring) == -1)
11450 return NULL;
11451
Victor Stinner7931d9a2011-11-04 00:22:48 +010011452 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011453
11454 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011455
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011456 if (result == -2)
11457 return NULL;
11458
Guido van Rossumd57fd912000-03-10 22:53:23 +000011459 if (result < 0) {
11460 PyErr_SetString(PyExc_ValueError, "substring not found");
11461 return NULL;
11462 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011463
Christian Heimes217cfd12007-12-02 14:31:20 +000011464 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011465}
11466
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011467PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011468 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011469\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011470Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011471at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011472
11473static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011474unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011475{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011476 Py_ssize_t i, length;
11477 int kind;
11478 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011479 int cased;
11480
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011481 if (PyUnicode_READY(self) == -1)
11482 return NULL;
11483 length = PyUnicode_GET_LENGTH(self);
11484 kind = PyUnicode_KIND(self);
11485 data = PyUnicode_DATA(self);
11486
Guido van Rossumd57fd912000-03-10 22:53:23 +000011487 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011488 if (length == 1)
11489 return PyBool_FromLong(
11490 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011491
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011492 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011493 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011494 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011495
Guido van Rossumd57fd912000-03-10 22:53:23 +000011496 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011497 for (i = 0; i < length; i++) {
11498 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011499
Benjamin Peterson29060642009-01-31 22:14:21 +000011500 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11501 return PyBool_FromLong(0);
11502 else if (!cased && Py_UNICODE_ISLOWER(ch))
11503 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011504 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011505 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011506}
11507
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011508PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011509 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011510\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011511Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011512at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011513
11514static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011515unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011516{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011517 Py_ssize_t i, length;
11518 int kind;
11519 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011520 int cased;
11521
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011522 if (PyUnicode_READY(self) == -1)
11523 return NULL;
11524 length = PyUnicode_GET_LENGTH(self);
11525 kind = PyUnicode_KIND(self);
11526 data = PyUnicode_DATA(self);
11527
Guido van Rossumd57fd912000-03-10 22:53:23 +000011528 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011529 if (length == 1)
11530 return PyBool_FromLong(
11531 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011532
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011533 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011534 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011535 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011536
Guido van Rossumd57fd912000-03-10 22:53:23 +000011537 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011538 for (i = 0; i < length; i++) {
11539 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011540
Benjamin Peterson29060642009-01-31 22:14:21 +000011541 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11542 return PyBool_FromLong(0);
11543 else if (!cased && Py_UNICODE_ISUPPER(ch))
11544 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011545 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011546 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011547}
11548
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011549PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011550 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011551\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011552Return True if S is a titlecased string and there is at least one\n\
11553character in S, i.e. upper- and titlecase characters may only\n\
11554follow uncased characters and lowercase characters only cased ones.\n\
11555Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011556
11557static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011558unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011559{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011560 Py_ssize_t i, length;
11561 int kind;
11562 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011563 int cased, previous_is_cased;
11564
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011565 if (PyUnicode_READY(self) == -1)
11566 return NULL;
11567 length = PyUnicode_GET_LENGTH(self);
11568 kind = PyUnicode_KIND(self);
11569 data = PyUnicode_DATA(self);
11570
Guido van Rossumd57fd912000-03-10 22:53:23 +000011571 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011572 if (length == 1) {
11573 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11574 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11575 (Py_UNICODE_ISUPPER(ch) != 0));
11576 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011577
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011578 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011579 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011580 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011581
Guido van Rossumd57fd912000-03-10 22:53:23 +000011582 cased = 0;
11583 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011584 for (i = 0; i < length; i++) {
11585 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011586
Benjamin Peterson29060642009-01-31 22:14:21 +000011587 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11588 if (previous_is_cased)
11589 return PyBool_FromLong(0);
11590 previous_is_cased = 1;
11591 cased = 1;
11592 }
11593 else if (Py_UNICODE_ISLOWER(ch)) {
11594 if (!previous_is_cased)
11595 return PyBool_FromLong(0);
11596 previous_is_cased = 1;
11597 cased = 1;
11598 }
11599 else
11600 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011601 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011602 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011603}
11604
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011605PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011606 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011607\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011608Return True if all characters in S are whitespace\n\
11609and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011610
11611static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011612unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011613{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011614 Py_ssize_t i, length;
11615 int kind;
11616 void *data;
11617
11618 if (PyUnicode_READY(self) == -1)
11619 return NULL;
11620 length = PyUnicode_GET_LENGTH(self);
11621 kind = PyUnicode_KIND(self);
11622 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011623
Guido van Rossumd57fd912000-03-10 22:53:23 +000011624 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011625 if (length == 1)
11626 return PyBool_FromLong(
11627 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011628
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011629 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011630 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011631 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011632
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011633 for (i = 0; i < length; i++) {
11634 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011635 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011636 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011637 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011638 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011639}
11640
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011641PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011642 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011643\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011644Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011645and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011646
11647static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011648unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011649{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011650 Py_ssize_t i, length;
11651 int kind;
11652 void *data;
11653
11654 if (PyUnicode_READY(self) == -1)
11655 return NULL;
11656 length = PyUnicode_GET_LENGTH(self);
11657 kind = PyUnicode_KIND(self);
11658 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011659
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011660 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011661 if (length == 1)
11662 return PyBool_FromLong(
11663 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011664
11665 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011666 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011667 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011668
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011669 for (i = 0; i < length; i++) {
11670 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011671 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011672 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011673 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011674}
11675
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011676PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011677 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011678\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011679Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011680and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011681
11682static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011683unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011684{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011685 int kind;
11686 void *data;
11687 Py_ssize_t len, i;
11688
11689 if (PyUnicode_READY(self) == -1)
11690 return NULL;
11691
11692 kind = PyUnicode_KIND(self);
11693 data = PyUnicode_DATA(self);
11694 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011695
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011696 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011697 if (len == 1) {
11698 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11699 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11700 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011701
11702 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011703 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011704 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011705
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011706 for (i = 0; i < len; i++) {
11707 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011708 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011709 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011710 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011711 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011712}
11713
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011714PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011715 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011716\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011717Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011718False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011719
11720static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011721unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011722{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011723 Py_ssize_t i, length;
11724 int kind;
11725 void *data;
11726
11727 if (PyUnicode_READY(self) == -1)
11728 return NULL;
11729 length = PyUnicode_GET_LENGTH(self);
11730 kind = PyUnicode_KIND(self);
11731 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011732
Guido van Rossumd57fd912000-03-10 22:53:23 +000011733 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011734 if (length == 1)
11735 return PyBool_FromLong(
11736 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011737
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011738 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011739 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011740 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011741
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011742 for (i = 0; i < length; i++) {
11743 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011744 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011745 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011746 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011747}
11748
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011749PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011750 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011751\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011752Return True if all characters in S are digits\n\
11753and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011754
11755static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011756unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011757{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011758 Py_ssize_t i, length;
11759 int kind;
11760 void *data;
11761
11762 if (PyUnicode_READY(self) == -1)
11763 return NULL;
11764 length = PyUnicode_GET_LENGTH(self);
11765 kind = PyUnicode_KIND(self);
11766 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011767
Guido van Rossumd57fd912000-03-10 22:53:23 +000011768 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011769 if (length == 1) {
11770 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11771 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11772 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011773
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011774 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011775 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011776 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011777
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011778 for (i = 0; i < length; i++) {
11779 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011780 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011781 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011782 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011783}
11784
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011785PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011786 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011787\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011788Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011789False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011790
11791static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011792unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011793{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011794 Py_ssize_t i, length;
11795 int kind;
11796 void *data;
11797
11798 if (PyUnicode_READY(self) == -1)
11799 return NULL;
11800 length = PyUnicode_GET_LENGTH(self);
11801 kind = PyUnicode_KIND(self);
11802 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011803
Guido van Rossumd57fd912000-03-10 22:53:23 +000011804 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011805 if (length == 1)
11806 return PyBool_FromLong(
11807 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011808
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011809 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011810 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011811 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011812
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011813 for (i = 0; i < length; i++) {
11814 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011815 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011816 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011817 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011818}
11819
Martin v. Löwis47383402007-08-15 07:32:56 +000011820int
11821PyUnicode_IsIdentifier(PyObject *self)
11822{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011823 int kind;
11824 void *data;
11825 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011826 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011827
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011828 if (PyUnicode_READY(self) == -1) {
11829 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011830 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011831 }
11832
11833 /* Special case for empty strings */
11834 if (PyUnicode_GET_LENGTH(self) == 0)
11835 return 0;
11836 kind = PyUnicode_KIND(self);
11837 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011838
11839 /* PEP 3131 says that the first character must be in
11840 XID_Start and subsequent characters in XID_Continue,
11841 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011842 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011843 letters, digits, underscore). However, given the current
11844 definition of XID_Start and XID_Continue, it is sufficient
11845 to check just for these, except that _ must be allowed
11846 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011847 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011848 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011849 return 0;
11850
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011851 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011852 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011853 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011854 return 1;
11855}
11856
11857PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011858 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011859\n\
11860Return True if S is a valid identifier according\n\
11861to the language definition.");
11862
11863static PyObject*
11864unicode_isidentifier(PyObject *self)
11865{
11866 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11867}
11868
Georg Brandl559e5d72008-06-11 18:37:52 +000011869PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011870 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011871\n\
11872Return True if all characters in S are considered\n\
11873printable in repr() or S is empty, False otherwise.");
11874
11875static PyObject*
11876unicode_isprintable(PyObject *self)
11877{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011878 Py_ssize_t i, length;
11879 int kind;
11880 void *data;
11881
11882 if (PyUnicode_READY(self) == -1)
11883 return NULL;
11884 length = PyUnicode_GET_LENGTH(self);
11885 kind = PyUnicode_KIND(self);
11886 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011887
11888 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011889 if (length == 1)
11890 return PyBool_FromLong(
11891 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011892
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011893 for (i = 0; i < length; i++) {
11894 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011895 Py_RETURN_FALSE;
11896 }
11897 }
11898 Py_RETURN_TRUE;
11899}
11900
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011901PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011902 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011903\n\
11904Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011905iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011906
11907static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011908unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011909{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011910 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011911}
11912
Martin v. Löwis18e16552006-02-15 17:27:45 +000011913static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011914unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011915{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011916 if (PyUnicode_READY(self) == -1)
11917 return -1;
11918 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011919}
11920
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011921PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011922 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011923\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011924Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011925done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011926
11927static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011928unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011929{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011930 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011931 Py_UCS4 fillchar = ' ';
11932
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011933 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011934 return NULL;
11935
Benjamin Petersonbac79492012-01-14 13:34:47 -050011936 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011937 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011938
Victor Stinnerc4b49542011-12-11 22:44:26 +010011939 if (PyUnicode_GET_LENGTH(self) >= width)
11940 return unicode_result_unchanged(self);
11941
11942 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011943}
11944
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011945PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011946 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011947\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011948Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011949
11950static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011951unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011952{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011953 if (PyUnicode_READY(self) == -1)
11954 return NULL;
11955 if (PyUnicode_IS_ASCII(self))
11956 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011957 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011958}
11959
/* Strip modes: which end(s) of the string are trimmed. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by the strip modes above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for mode i: the format string with its three leading
   characters ("|O:") skipped. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
11968
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011969/* externally visible for str.strip(unicode) */
11970PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011971_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011972{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011973 void *data;
11974 int kind;
11975 Py_ssize_t i, j, len;
11976 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011977
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011978 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11979 return NULL;
11980
11981 kind = PyUnicode_KIND(self);
11982 data = PyUnicode_DATA(self);
11983 len = PyUnicode_GET_LENGTH(self);
11984 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11985 PyUnicode_DATA(sepobj),
11986 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011987
Benjamin Peterson14339b62009-01-31 16:36:08 +000011988 i = 0;
11989 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011990 while (i < len &&
11991 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011992 i++;
11993 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011994 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011995
Benjamin Peterson14339b62009-01-31 16:36:08 +000011996 j = len;
11997 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011998 do {
11999 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012000 } while (j >= i &&
12001 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000012002 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012003 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012004
Victor Stinner7931d9a2011-11-04 00:22:48 +010012005 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012006}
12007
12008PyObject*
12009PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12010{
12011 unsigned char *data;
12012 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012013 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012014
Victor Stinnerde636f32011-10-01 03:55:54 +020012015 if (PyUnicode_READY(self) == -1)
12016 return NULL;
12017
12018 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
12019
Victor Stinner12bab6d2011-10-01 01:53:49 +020012020 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Victor Stinnerc4b49542011-12-11 22:44:26 +010012021 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012022
Victor Stinner12bab6d2011-10-01 01:53:49 +020012023 length = end - start;
12024 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012025 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012026
Victor Stinnerde636f32011-10-01 03:55:54 +020012027 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012028 PyErr_SetString(PyExc_IndexError, "string index out of range");
12029 return NULL;
12030 }
12031
Victor Stinnerb9275c12011-10-05 14:01:42 +020012032 if (PyUnicode_IS_ASCII(self)) {
12033 kind = PyUnicode_KIND(self);
12034 data = PyUnicode_1BYTE_DATA(self);
12035 return unicode_fromascii(data + start, length);
12036 }
12037 else {
12038 kind = PyUnicode_KIND(self);
12039 data = PyUnicode_1BYTE_DATA(self);
12040 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012041 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012042 length);
12043 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012044}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012045
12046static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012047do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012048{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012049 int kind;
12050 void *data;
12051 Py_ssize_t len, i, j;
12052
12053 if (PyUnicode_READY(self) == -1)
12054 return NULL;
12055
12056 kind = PyUnicode_KIND(self);
12057 data = PyUnicode_DATA(self);
12058 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012059
Benjamin Peterson14339b62009-01-31 16:36:08 +000012060 i = 0;
12061 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012062 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012063 i++;
12064 }
12065 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012066
Benjamin Peterson14339b62009-01-31 16:36:08 +000012067 j = len;
12068 if (striptype != LEFTSTRIP) {
12069 do {
12070 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012071 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012072 j++;
12073 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012074
Victor Stinner7931d9a2011-11-04 00:22:48 +010012075 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012076}
12077
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012078
12079static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012080do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012081{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012082 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012083
Benjamin Peterson14339b62009-01-31 16:36:08 +000012084 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
12085 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012086
Benjamin Peterson14339b62009-01-31 16:36:08 +000012087 if (sep != NULL && sep != Py_None) {
12088 if (PyUnicode_Check(sep))
12089 return _PyUnicode_XStrip(self, striptype, sep);
12090 else {
12091 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012092 "%s arg must be None or str",
12093 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012094 return NULL;
12095 }
12096 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012097
Benjamin Peterson14339b62009-01-31 16:36:08 +000012098 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012099}
12100
12101
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012102PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012103 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012104\n\
12105Return a copy of the string S with leading and trailing\n\
12106whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012107If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012108
12109static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012110unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012111{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012112 if (PyTuple_GET_SIZE(args) == 0)
12113 return do_strip(self, BOTHSTRIP); /* Common case */
12114 else
12115 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012116}
12117
12118
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012119PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012120 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012121\n\
12122Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012123If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012124
12125static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012126unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012127{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012128 if (PyTuple_GET_SIZE(args) == 0)
12129 return do_strip(self, LEFTSTRIP); /* Common case */
12130 else
12131 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012132}
12133
12134
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012135PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012136 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012137\n\
12138Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012139If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012140
12141static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012142unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012143{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012144 if (PyTuple_GET_SIZE(args) == 0)
12145 return do_strip(self, RIGHTSTRIP); /* Common case */
12146 else
12147 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012148}
12149
12150
Guido van Rossumd57fd912000-03-10 22:53:23 +000012151static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012152unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012153{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012154 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012155 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012156
Georg Brandl222de0f2009-04-12 12:01:50 +000012157 if (len < 1) {
12158 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020012159 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000012160 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012161
Victor Stinnerc4b49542011-12-11 22:44:26 +010012162 /* no repeat, return original string */
12163 if (len == 1)
12164 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012165
Benjamin Petersonbac79492012-01-14 13:34:47 -050012166 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012167 return NULL;
12168
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012169 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012170 PyErr_SetString(PyExc_OverflowError,
12171 "repeated string is too long");
12172 return NULL;
12173 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012174 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012175
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012176 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012177 if (!u)
12178 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012179 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012180
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012181 if (PyUnicode_GET_LENGTH(str) == 1) {
12182 const int kind = PyUnicode_KIND(str);
12183 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012184 if (kind == PyUnicode_1BYTE_KIND) {
12185 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012186 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012187 }
12188 else if (kind == PyUnicode_2BYTE_KIND) {
12189 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012190 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012191 ucs2[n] = fill_char;
12192 } else {
12193 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12194 assert(kind == PyUnicode_4BYTE_KIND);
12195 for (n = 0; n < len; ++n)
12196 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012197 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012198 }
12199 else {
12200 /* number of characters copied this far */
12201 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012202 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012203 char *to = (char *) PyUnicode_DATA(u);
12204 Py_MEMCPY(to, PyUnicode_DATA(str),
12205 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012206 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012207 n = (done <= nchars-done) ? done : nchars-done;
12208 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012209 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012210 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012211 }
12212
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012213 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012214 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012215}
12216
Alexander Belopolsky40018472011-02-26 01:02:56 +000012217PyObject *
12218PyUnicode_Replace(PyObject *obj,
12219 PyObject *subobj,
12220 PyObject *replobj,
12221 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012222{
12223 PyObject *self;
12224 PyObject *str1;
12225 PyObject *str2;
12226 PyObject *result;
12227
12228 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012229 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012230 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012231 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012232 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012233 Py_DECREF(self);
12234 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012235 }
12236 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012237 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012238 Py_DECREF(self);
12239 Py_DECREF(str1);
12240 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012241 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012242 if (PyUnicode_READY(self) == -1 ||
12243 PyUnicode_READY(str1) == -1 ||
12244 PyUnicode_READY(str2) == -1)
12245 result = NULL;
12246 else
12247 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012248 Py_DECREF(self);
12249 Py_DECREF(str1);
12250 Py_DECREF(str2);
12251 return result;
12252}
12253
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012254PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012255 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012256\n\
12257Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012258old replaced by new. If the optional argument count is\n\
12259given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012260
12261static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012262unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012263{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012264 PyObject *str1;
12265 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012266 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012267 PyObject *result;
12268
Martin v. Löwis18e16552006-02-15 17:27:45 +000012269 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012270 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012271 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012272 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012273 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012274 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012275 return NULL;
12276 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012277 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012278 Py_DECREF(str1);
12279 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012280 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012281 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12282 result = NULL;
12283 else
12284 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012285
12286 Py_DECREF(str1);
12287 Py_DECREF(str2);
12288 return result;
12289}
12290
Alexander Belopolsky40018472011-02-26 01:02:56 +000012291static PyObject *
12292unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012293{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012294 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012295 Py_ssize_t isize;
12296 Py_ssize_t osize, squote, dquote, i, o;
12297 Py_UCS4 max, quote;
12298 int ikind, okind;
12299 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012300
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012301 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012302 return NULL;
12303
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012304 isize = PyUnicode_GET_LENGTH(unicode);
12305 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012306
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012307 /* Compute length of output, quote characters, and
12308 maximum character */
12309 osize = 2; /* quotes */
12310 max = 127;
12311 squote = dquote = 0;
12312 ikind = PyUnicode_KIND(unicode);
12313 for (i = 0; i < isize; i++) {
12314 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12315 switch (ch) {
12316 case '\'': squote++; osize++; break;
12317 case '"': dquote++; osize++; break;
12318 case '\\': case '\t': case '\r': case '\n':
12319 osize += 2; break;
12320 default:
12321 /* Fast-path ASCII */
12322 if (ch < ' ' || ch == 0x7f)
12323 osize += 4; /* \xHH */
12324 else if (ch < 0x7f)
12325 osize++;
12326 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12327 osize++;
12328 max = ch > max ? ch : max;
12329 }
12330 else if (ch < 0x100)
12331 osize += 4; /* \xHH */
12332 else if (ch < 0x10000)
12333 osize += 6; /* \uHHHH */
12334 else
12335 osize += 10; /* \uHHHHHHHH */
12336 }
12337 }
12338
12339 quote = '\'';
12340 if (squote) {
12341 if (dquote)
12342 /* Both squote and dquote present. Use squote,
12343 and escape them */
12344 osize += squote;
12345 else
12346 quote = '"';
12347 }
12348
12349 repr = PyUnicode_New(osize, max);
12350 if (repr == NULL)
12351 return NULL;
12352 okind = PyUnicode_KIND(repr);
12353 odata = PyUnicode_DATA(repr);
12354
12355 PyUnicode_WRITE(okind, odata, 0, quote);
12356 PyUnicode_WRITE(okind, odata, osize-1, quote);
12357
12358 for (i = 0, o = 1; i < isize; i++) {
12359 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012360
12361 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012362 if ((ch == quote) || (ch == '\\')) {
12363 PyUnicode_WRITE(okind, odata, o++, '\\');
12364 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012365 continue;
12366 }
12367
Benjamin Peterson29060642009-01-31 22:14:21 +000012368 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012369 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012370 PyUnicode_WRITE(okind, odata, o++, '\\');
12371 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012372 }
12373 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012374 PyUnicode_WRITE(okind, odata, o++, '\\');
12375 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012376 }
12377 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012378 PyUnicode_WRITE(okind, odata, o++, '\\');
12379 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012380 }
12381
12382 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012383 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012384 PyUnicode_WRITE(okind, odata, o++, '\\');
12385 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012386 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12387 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012388 }
12389
Georg Brandl559e5d72008-06-11 18:37:52 +000012390 /* Copy ASCII characters as-is */
12391 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012392 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012393 }
12394
Benjamin Peterson29060642009-01-31 22:14:21 +000012395 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012396 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012397 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012398 (categories Z* and C* except ASCII space)
12399 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012400 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012401 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012402 if (ch <= 0xff) {
12403 PyUnicode_WRITE(okind, odata, o++, '\\');
12404 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012405 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12406 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012407 }
12408 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012409 else if (ch >= 0x10000) {
12410 PyUnicode_WRITE(okind, odata, o++, '\\');
12411 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012412 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12413 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12414 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12415 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12416 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12417 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12418 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12419 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012420 }
12421 /* Map 16-bit characters to '\uxxxx' */
12422 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012423 PyUnicode_WRITE(okind, odata, o++, '\\');
12424 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012425 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12426 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12427 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12428 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012429 }
12430 }
12431 /* Copy characters as-is */
12432 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012433 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012434 }
12435 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012436 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012437 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012438 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012439 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012440}
12441
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012442PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012443 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012444\n\
12445Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012446such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012447arguments start and end are interpreted as in slice notation.\n\
12448\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012449Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012450
12451static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012452unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012453{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012454 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012455 Py_ssize_t start;
12456 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012457 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012458
Jesus Ceaac451502011-04-20 17:09:23 +020012459 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12460 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012461 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012462
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012463 if (PyUnicode_READY(self) == -1)
12464 return NULL;
12465 if (PyUnicode_READY(substring) == -1)
12466 return NULL;
12467
Victor Stinner7931d9a2011-11-04 00:22:48 +010012468 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012469
12470 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012471
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012472 if (result == -2)
12473 return NULL;
12474
Christian Heimes217cfd12007-12-02 14:31:20 +000012475 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012476}
12477
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012478PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012479 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012480\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012481Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012482
12483static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012484unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012485{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012486 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012487 Py_ssize_t start;
12488 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012489 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012490
Jesus Ceaac451502011-04-20 17:09:23 +020012491 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12492 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012493 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012494
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012495 if (PyUnicode_READY(self) == -1)
12496 return NULL;
12497 if (PyUnicode_READY(substring) == -1)
12498 return NULL;
12499
Victor Stinner7931d9a2011-11-04 00:22:48 +010012500 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012501
12502 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012503
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012504 if (result == -2)
12505 return NULL;
12506
Guido van Rossumd57fd912000-03-10 22:53:23 +000012507 if (result < 0) {
12508 PyErr_SetString(PyExc_ValueError, "substring not found");
12509 return NULL;
12510 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012511
Christian Heimes217cfd12007-12-02 14:31:20 +000012512 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012513}
12514
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012515PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012516 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012517\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012518Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012519done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012520
12521static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012522unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012523{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012524 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012525 Py_UCS4 fillchar = ' ';
12526
Victor Stinnere9a29352011-10-01 02:14:59 +020012527 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012528 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012529
Benjamin Petersonbac79492012-01-14 13:34:47 -050012530 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012531 return NULL;
12532
Victor Stinnerc4b49542011-12-11 22:44:26 +010012533 if (PyUnicode_GET_LENGTH(self) >= width)
12534 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012535
Victor Stinnerc4b49542011-12-11 22:44:26 +010012536 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012537}
12538
Alexander Belopolsky40018472011-02-26 01:02:56 +000012539PyObject *
12540PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012541{
12542 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012543
Guido van Rossumd57fd912000-03-10 22:53:23 +000012544 s = PyUnicode_FromObject(s);
12545 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012546 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012547 if (sep != NULL) {
12548 sep = PyUnicode_FromObject(sep);
12549 if (sep == NULL) {
12550 Py_DECREF(s);
12551 return NULL;
12552 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012553 }
12554
Victor Stinner9310abb2011-10-05 00:59:23 +020012555 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012556
12557 Py_DECREF(s);
12558 Py_XDECREF(sep);
12559 return result;
12560}
12561
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012562PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012563 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012564\n\
12565Return a list of the words in S, using sep as the\n\
12566delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012567splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012568whitespace string is a separator and empty strings are\n\
12569removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012570
12571static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012572unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012573{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012574 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012575 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012576 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012577
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012578 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12579 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012580 return NULL;
12581
12582 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012583 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012584 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012585 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012586 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012587 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012588}
12589
Thomas Wouters477c8d52006-05-27 19:21:47 +000012590PyObject *
12591PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12592{
12593 PyObject* str_obj;
12594 PyObject* sep_obj;
12595 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012596 int kind1, kind2, kind;
12597 void *buf1 = NULL, *buf2 = NULL;
12598 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012599
12600 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012601 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012602 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012603 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012604 if (!sep_obj) {
12605 Py_DECREF(str_obj);
12606 return NULL;
12607 }
12608 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12609 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012610 Py_DECREF(str_obj);
12611 return NULL;
12612 }
12613
Victor Stinner14f8f022011-10-05 20:58:25 +020012614 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012615 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012616 kind = Py_MAX(kind1, kind2);
12617 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012618 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012619 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012620 if (!buf1)
12621 goto onError;
12622 buf2 = PyUnicode_DATA(sep_obj);
12623 if (kind2 != kind)
12624 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12625 if (!buf2)
12626 goto onError;
12627 len1 = PyUnicode_GET_LENGTH(str_obj);
12628 len2 = PyUnicode_GET_LENGTH(sep_obj);
12629
Benjamin Petersonead6b532011-12-20 17:23:42 -060012630 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012631 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012632 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12633 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12634 else
12635 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012636 break;
12637 case PyUnicode_2BYTE_KIND:
12638 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12639 break;
12640 case PyUnicode_4BYTE_KIND:
12641 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12642 break;
12643 default:
12644 assert(0);
12645 out = 0;
12646 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012647
12648 Py_DECREF(sep_obj);
12649 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012650 if (kind1 != kind)
12651 PyMem_Free(buf1);
12652 if (kind2 != kind)
12653 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012654
12655 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012656 onError:
12657 Py_DECREF(sep_obj);
12658 Py_DECREF(str_obj);
12659 if (kind1 != kind && buf1)
12660 PyMem_Free(buf1);
12661 if (kind2 != kind && buf2)
12662 PyMem_Free(buf2);
12663 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012664}
12665
12666
12667PyObject *
12668PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12669{
12670 PyObject* str_obj;
12671 PyObject* sep_obj;
12672 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012673 int kind1, kind2, kind;
12674 void *buf1 = NULL, *buf2 = NULL;
12675 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012676
12677 str_obj = PyUnicode_FromObject(str_in);
12678 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012679 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012680 sep_obj = PyUnicode_FromObject(sep_in);
12681 if (!sep_obj) {
12682 Py_DECREF(str_obj);
12683 return NULL;
12684 }
12685
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012686 kind1 = PyUnicode_KIND(str_in);
12687 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012688 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012689 buf1 = PyUnicode_DATA(str_in);
12690 if (kind1 != kind)
12691 buf1 = _PyUnicode_AsKind(str_in, kind);
12692 if (!buf1)
12693 goto onError;
12694 buf2 = PyUnicode_DATA(sep_obj);
12695 if (kind2 != kind)
12696 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12697 if (!buf2)
12698 goto onError;
12699 len1 = PyUnicode_GET_LENGTH(str_obj);
12700 len2 = PyUnicode_GET_LENGTH(sep_obj);
12701
Benjamin Petersonead6b532011-12-20 17:23:42 -060012702 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012703 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012704 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12705 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12706 else
12707 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012708 break;
12709 case PyUnicode_2BYTE_KIND:
12710 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12711 break;
12712 case PyUnicode_4BYTE_KIND:
12713 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12714 break;
12715 default:
12716 assert(0);
12717 out = 0;
12718 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012719
12720 Py_DECREF(sep_obj);
12721 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012722 if (kind1 != kind)
12723 PyMem_Free(buf1);
12724 if (kind2 != kind)
12725 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012726
12727 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012728 onError:
12729 Py_DECREF(sep_obj);
12730 Py_DECREF(str_obj);
12731 if (kind1 != kind && buf1)
12732 PyMem_Free(buf1);
12733 if (kind2 != kind && buf2)
12734 PyMem_Free(buf2);
12735 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012736}
12737
12738PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012739 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012740\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012741Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012742the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012743found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012744
12745static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012746unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012747{
Victor Stinner9310abb2011-10-05 00:59:23 +020012748 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012749}
12750
12751PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012752 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012753\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012754Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012755the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012756separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012757
12758static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012759unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012760{
Victor Stinner9310abb2011-10-05 00:59:23 +020012761 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012762}
12763
Alexander Belopolsky40018472011-02-26 01:02:56 +000012764PyObject *
12765PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012766{
12767 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012768
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012769 s = PyUnicode_FromObject(s);
12770 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012771 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012772 if (sep != NULL) {
12773 sep = PyUnicode_FromObject(sep);
12774 if (sep == NULL) {
12775 Py_DECREF(s);
12776 return NULL;
12777 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012778 }
12779
Victor Stinner9310abb2011-10-05 00:59:23 +020012780 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012781
12782 Py_DECREF(s);
12783 Py_XDECREF(sep);
12784 return result;
12785}
12786
12787PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012788 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012789\n\
12790Return a list of the words in S, using sep as the\n\
12791delimiter string, starting at the end of the string and\n\
12792working to the front. If maxsplit is given, at most maxsplit\n\
12793splits are done. If sep is not specified, any whitespace string\n\
12794is a separator.");
12795
12796static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012797unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012798{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012799 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012800 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012801 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012802
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012803 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12804 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012805 return NULL;
12806
12807 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012808 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012809 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012810 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012811 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012812 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012813}
12814
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012815PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012816 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012817\n\
12818Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012819Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012820is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012821
12822static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012823unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012824{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012825 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012826 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012827
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012828 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12829 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012830 return NULL;
12831
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012832 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012833}
12834
12835static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012836PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012837{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012838 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012839}
12840
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012841PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012842 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012843\n\
12844Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012845and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012846
12847static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012848unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012849{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012850 if (PyUnicode_READY(self) == -1)
12851 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012852 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012853}
12854
Georg Brandlceee0772007-11-27 23:48:05 +000012855PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012856 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012857\n\
12858Return a translation table usable for str.translate().\n\
12859If there is only one argument, it must be a dictionary mapping Unicode\n\
12860ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012861Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012862If there are two arguments, they must be strings of equal length, and\n\
12863in the resulting dictionary, each character in x will be mapped to the\n\
12864character at the same position in y. If there is a third argument, it\n\
12865must be a string, whose characters will be mapped to None in the result.");
12866
12867static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012868unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012869{
12870 PyObject *x, *y = NULL, *z = NULL;
12871 PyObject *new = NULL, *key, *value;
12872 Py_ssize_t i = 0;
12873 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012874
Georg Brandlceee0772007-11-27 23:48:05 +000012875 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12876 return NULL;
12877 new = PyDict_New();
12878 if (!new)
12879 return NULL;
12880 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012881 int x_kind, y_kind, z_kind;
12882 void *x_data, *y_data, *z_data;
12883
Georg Brandlceee0772007-11-27 23:48:05 +000012884 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012885 if (!PyUnicode_Check(x)) {
12886 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12887 "be a string if there is a second argument");
12888 goto err;
12889 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012890 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012891 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12892 "arguments must have equal length");
12893 goto err;
12894 }
12895 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012896 x_kind = PyUnicode_KIND(x);
12897 y_kind = PyUnicode_KIND(y);
12898 x_data = PyUnicode_DATA(x);
12899 y_data = PyUnicode_DATA(y);
12900 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12901 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012902 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012903 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012904 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012905 if (!value) {
12906 Py_DECREF(key);
12907 goto err;
12908 }
Georg Brandlceee0772007-11-27 23:48:05 +000012909 res = PyDict_SetItem(new, key, value);
12910 Py_DECREF(key);
12911 Py_DECREF(value);
12912 if (res < 0)
12913 goto err;
12914 }
12915 /* create entries for deleting chars in z */
12916 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012917 z_kind = PyUnicode_KIND(z);
12918 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012919 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012920 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012921 if (!key)
12922 goto err;
12923 res = PyDict_SetItem(new, key, Py_None);
12924 Py_DECREF(key);
12925 if (res < 0)
12926 goto err;
12927 }
12928 }
12929 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012930 int kind;
12931 void *data;
12932
Georg Brandlceee0772007-11-27 23:48:05 +000012933 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012934 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012935 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12936 "to maketrans it must be a dict");
12937 goto err;
12938 }
12939 /* copy entries into the new dict, converting string keys to int keys */
12940 while (PyDict_Next(x, &i, &key, &value)) {
12941 if (PyUnicode_Check(key)) {
12942 /* convert string keys to integer keys */
12943 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012944 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012945 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12946 "table must be of length 1");
12947 goto err;
12948 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012949 kind = PyUnicode_KIND(key);
12950 data = PyUnicode_DATA(key);
12951 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012952 if (!newkey)
12953 goto err;
12954 res = PyDict_SetItem(new, newkey, value);
12955 Py_DECREF(newkey);
12956 if (res < 0)
12957 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012958 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012959 /* just keep integer keys */
12960 if (PyDict_SetItem(new, key, value) < 0)
12961 goto err;
12962 } else {
12963 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12964 "be strings or integers");
12965 goto err;
12966 }
12967 }
12968 }
12969 return new;
12970 err:
12971 Py_DECREF(new);
12972 return NULL;
12973}
12974
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012975PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012976 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012977\n\
12978Return a copy of the string S, where all characters have been mapped\n\
12979through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012980Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012981Unmapped characters are left untouched. Characters mapped to None\n\
12982are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012983
12984static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012985unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012986{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012987 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012988}
12989
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012990PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012991 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012992\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012993Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012994
12995static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012996unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012997{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012998 if (PyUnicode_READY(self) == -1)
12999 return NULL;
13000 if (PyUnicode_IS_ASCII(self))
13001 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013002 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013003}
13004
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013005PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013006 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013007\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000013008Pad a numeric string S with zeros on the left, to fill a field\n\
13009of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013010
13011static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020013012unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013013{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013014 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013015 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013016 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013017 int kind;
13018 void *data;
13019 Py_UCS4 chr;
13020
Martin v. Löwis18e16552006-02-15 17:27:45 +000013021 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013022 return NULL;
13023
Benjamin Petersonbac79492012-01-14 13:34:47 -050013024 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013025 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013026
Victor Stinnerc4b49542011-12-11 22:44:26 +010013027 if (PyUnicode_GET_LENGTH(self) >= width)
13028 return unicode_result_unchanged(self);
13029
13030 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013031
13032 u = pad(self, fill, 0, '0');
13033
Walter Dörwald068325e2002-04-15 13:36:47 +000013034 if (u == NULL)
13035 return NULL;
13036
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013037 kind = PyUnicode_KIND(u);
13038 data = PyUnicode_DATA(u);
13039 chr = PyUnicode_READ(kind, data, fill);
13040
13041 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013042 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013043 PyUnicode_WRITE(kind, data, 0, chr);
13044 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013045 }
13046
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013047 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013048 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013049}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013050
13051#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013052static PyObject *
13053unicode__decimal2ascii(PyObject *self)
13054{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013055 return PyUnicode_TransformDecimalAndSpaceToASCII(self);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013056}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013057#endif
13058
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013059PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013060 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013061\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013062Return True if S starts with the specified prefix, False otherwise.\n\
13063With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013064With optional end, stop comparing S at that position.\n\
13065prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013066
13067static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013068unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013069 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013070{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013071 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013072 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013073 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013074 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013075 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013076
Jesus Ceaac451502011-04-20 17:09:23 +020013077 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013078 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013079 if (PyTuple_Check(subobj)) {
13080 Py_ssize_t i;
13081 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013082 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013083 if (substring == NULL)
13084 return NULL;
13085 result = tailmatch(self, substring, start, end, -1);
13086 Py_DECREF(substring);
13087 if (result) {
13088 Py_RETURN_TRUE;
13089 }
13090 }
13091 /* nothing matched */
13092 Py_RETURN_FALSE;
13093 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013094 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013095 if (substring == NULL) {
13096 if (PyErr_ExceptionMatches(PyExc_TypeError))
13097 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
13098 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013099 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013100 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013101 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013102 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013103 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013104}
13105
13106
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013107PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013108 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013109\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013110Return True if S ends with the specified suffix, False otherwise.\n\
13111With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013112With optional end, stop comparing S at that position.\n\
13113suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013114
13115static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013116unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013117 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013118{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013119 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013120 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013121 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013122 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013123 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013124
Jesus Ceaac451502011-04-20 17:09:23 +020013125 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013126 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013127 if (PyTuple_Check(subobj)) {
13128 Py_ssize_t i;
13129 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013130 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000013131 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013132 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013133 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013134 result = tailmatch(self, substring, start, end, +1);
13135 Py_DECREF(substring);
13136 if (result) {
13137 Py_RETURN_TRUE;
13138 }
13139 }
13140 Py_RETURN_FALSE;
13141 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013142 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013143 if (substring == NULL) {
13144 if (PyErr_ExceptionMatches(PyExc_TypeError))
13145 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
13146 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013147 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013148 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013149 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013150 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013151 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013152}
13153
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013154#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013155
13156PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013157 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013158\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013159Return a formatted version of S, using substitutions from args and kwargs.\n\
13160The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013161
Eric Smith27bbca62010-11-04 17:06:58 +000013162PyDoc_STRVAR(format_map__doc__,
13163 "S.format_map(mapping) -> str\n\
13164\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013165Return a formatted version of S, using substitutions from mapping.\n\
13166The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013167
Eric Smith4a7d76d2008-05-30 18:10:19 +000013168static PyObject *
13169unicode__format__(PyObject* self, PyObject* args)
13170{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013171 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013172
13173 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13174 return NULL;
13175
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013176 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013177 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013178 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013179}
13180
Eric Smith8c663262007-08-25 02:26:07 +000013181PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013182 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013183\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013184Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013185
13186static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013187unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013188{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013189 Py_ssize_t size;
13190
13191 /* If it's a compact object, account for base structure +
13192 character data. */
13193 if (PyUnicode_IS_COMPACT_ASCII(v))
13194 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13195 else if (PyUnicode_IS_COMPACT(v))
13196 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013197 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013198 else {
13199 /* If it is a two-block object, account for base object, and
13200 for character block if present. */
13201 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013202 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013203 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013204 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013205 }
13206 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013207 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013208 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013209 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013210 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013211 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013212
13213 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013214}
13215
13216PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013217 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013218
13219static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013220unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013221{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013222 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013223 if (!copy)
13224 return NULL;
13225 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013226}
13227
Guido van Rossumd57fd912000-03-10 22:53:23 +000013228static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013229 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013230 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013231 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13232 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013233 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13234 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013235 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013236 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13237 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13238 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
13239 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
13240 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013241 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013242 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13243 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13244 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013245 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013246 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13247 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13248 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013249 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013250 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010013251 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013252 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013253 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13254 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13255 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13256 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13257 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13258 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13259 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13260 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13261 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13262 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13263 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13264 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13265 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13266 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013267 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013268 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013269 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013270 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013271 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013272 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000013273 {"maketrans", (PyCFunction) unicode_maketrans,
13274 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013275 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013276#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013277 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013278 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013279#endif
13280
Benjamin Peterson14339b62009-01-31 16:36:08 +000013281 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013282 {NULL, NULL}
13283};
13284
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013285static PyObject *
13286unicode_mod(PyObject *v, PyObject *w)
13287{
Brian Curtindfc80e32011-08-10 20:28:54 -050013288 if (!PyUnicode_Check(v))
13289 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013290 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013291}
13292
13293static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013294 0, /*nb_add*/
13295 0, /*nb_subtract*/
13296 0, /*nb_multiply*/
13297 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013298};
13299
Guido van Rossumd57fd912000-03-10 22:53:23 +000013300static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013301 (lenfunc) unicode_length, /* sq_length */
13302 PyUnicode_Concat, /* sq_concat */
13303 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13304 (ssizeargfunc) unicode_getitem, /* sq_item */
13305 0, /* sq_slice */
13306 0, /* sq_ass_item */
13307 0, /* sq_ass_slice */
13308 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013309};
13310
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013311static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013312unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013313{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013314 if (PyUnicode_READY(self) == -1)
13315 return NULL;
13316
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013317 if (PyIndex_Check(item)) {
13318 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013319 if (i == -1 && PyErr_Occurred())
13320 return NULL;
13321 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013322 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013323 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013324 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013325 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013326 PyObject *result;
13327 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013328 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013329 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013330
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013331 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013332 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013333 return NULL;
13334 }
13335
13336 if (slicelength <= 0) {
Victor Stinner382955f2011-12-11 21:44:00 +010013337 Py_INCREF(unicode_empty);
13338 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013339 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013340 slicelength == PyUnicode_GET_LENGTH(self)) {
13341 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013342 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013343 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013344 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013345 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013346 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013347 src_kind = PyUnicode_KIND(self);
13348 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013349 if (!PyUnicode_IS_ASCII(self)) {
13350 kind_limit = kind_maxchar_limit(src_kind);
13351 max_char = 0;
13352 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13353 ch = PyUnicode_READ(src_kind, src_data, cur);
13354 if (ch > max_char) {
13355 max_char = ch;
13356 if (max_char >= kind_limit)
13357 break;
13358 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013359 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013360 }
Victor Stinner55c99112011-10-13 01:17:06 +020013361 else
13362 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013363 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013364 if (result == NULL)
13365 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013366 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013367 dest_data = PyUnicode_DATA(result);
13368
13369 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013370 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13371 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013372 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013373 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013374 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013375 } else {
13376 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13377 return NULL;
13378 }
13379}
13380
13381static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013382 (lenfunc)unicode_length, /* mp_length */
13383 (binaryfunc)unicode_subscript, /* mp_subscript */
13384 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013385};
13386
Guido van Rossumd57fd912000-03-10 22:53:23 +000013387
Guido van Rossumd57fd912000-03-10 22:53:23 +000013388/* Helpers for PyUnicode_Format() */
13389
13390static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000013391getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013392{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013393 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013394 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013395 (*p_argidx)++;
13396 if (arglen < 0)
13397 return args;
13398 else
13399 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013400 }
13401 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013402 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013403 return NULL;
13404}
13405
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013406/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013407
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013408static PyObject *
13409formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013410{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013411 char *p;
13412 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013413 double x;
Tim Petersced69f82003-09-16 20:30:58 +000013414
Guido van Rossumd57fd912000-03-10 22:53:23 +000013415 x = PyFloat_AsDouble(v);
13416 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013417 return NULL;
13418
Guido van Rossumd57fd912000-03-10 22:53:23 +000013419 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013420 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013421
Eric Smith0923d1d2009-04-16 20:16:10 +000013422 p = PyOS_double_to_string(x, type, prec,
13423 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013424 if (p == NULL)
13425 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013426 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000013427 PyMem_Free(p);
13428 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013429}
13430
Tim Peters38fd5b62000-09-21 05:43:11 +000013431static PyObject*
13432formatlong(PyObject *val, int flags, int prec, int type)
13433{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013434 char *buf;
13435 int len;
13436 PyObject *str; /* temporary string object. */
13437 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013438
Benjamin Peterson14339b62009-01-31 16:36:08 +000013439 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
13440 if (!str)
13441 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013442 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013443 Py_DECREF(str);
13444 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013445}
13446
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013447static Py_UCS4
13448formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013449{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013450 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013451 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013452 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013453 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013454 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013455 goto onError;
13456 }
13457 else {
13458 /* Integer input truncated to a character */
13459 long x;
13460 x = PyLong_AsLong(v);
13461 if (x == -1 && PyErr_Occurred())
13462 goto onError;
13463
Victor Stinner8faf8212011-12-08 22:14:11 +010013464 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013465 PyErr_SetString(PyExc_OverflowError,
13466 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013467 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013468 }
13469
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013470 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013471 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013472
Benjamin Peterson29060642009-01-31 22:14:21 +000013473 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013474 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013475 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013476 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013477}
13478
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013479static int
13480repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
13481{
13482 int r;
13483 assert(count > 0);
13484 assert(PyUnicode_Check(obj));
13485 if (count > 5) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013486 PyObject *repeated = unicode_repeat(obj, count);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013487 if (repeated == NULL)
13488 return -1;
13489 r = _PyAccu_Accumulate(acc, repeated);
13490 Py_DECREF(repeated);
13491 return r;
13492 }
13493 else {
13494 do {
13495 if (_PyAccu_Accumulate(acc, obj))
13496 return -1;
13497 } while (--count);
13498 return 0;
13499 }
13500}
13501
Alexander Belopolsky40018472011-02-26 01:02:56 +000013502PyObject *
13503PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013504{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013505 void *fmt;
13506 int fmtkind;
13507 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013508 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013509 int r;
13510 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013511 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013512 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013513 PyObject *temp = NULL;
13514 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013515 PyObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013516 _PyAccu acc;
13517 static PyObject *plus, *minus, *blank, *zero, *percent;
13518
13519 if (!plus && !(plus = get_latin1_char('+')))
13520 return NULL;
13521 if (!minus && !(minus = get_latin1_char('-')))
13522 return NULL;
13523 if (!blank && !(blank = get_latin1_char(' ')))
13524 return NULL;
13525 if (!zero && !(zero = get_latin1_char('0')))
13526 return NULL;
13527 if (!percent && !(percent = get_latin1_char('%')))
13528 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000013529
Guido van Rossumd57fd912000-03-10 22:53:23 +000013530 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013531 PyErr_BadInternalCall();
13532 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013533 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013534 uformat = PyUnicode_FromObject(format);
Benjamin Peterson22a29702012-01-02 09:00:30 -060013535 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013536 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060013537 if (PyUnicode_READY(uformat) == -1)
13538 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013539 if (_PyAccu_Init(&acc))
13540 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013541 fmt = PyUnicode_DATA(uformat);
13542 fmtkind = PyUnicode_KIND(uformat);
13543 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13544 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013545
Guido van Rossumd57fd912000-03-10 22:53:23 +000013546 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013547 arglen = PyTuple_Size(args);
13548 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013549 }
13550 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013551 arglen = -1;
13552 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013553 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013554 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013555 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013556 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013557
13558 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013559 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013560 PyObject *nonfmt;
13561 Py_ssize_t nonfmtpos;
13562 nonfmtpos = fmtpos++;
13563 while (fmtcnt >= 0 &&
13564 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13565 fmtpos++;
13566 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013567 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013568 nonfmt = PyUnicode_Substring(uformat, nonfmtpos, fmtpos);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013569 if (nonfmt == NULL)
13570 goto onError;
13571 r = _PyAccu_Accumulate(&acc, nonfmt);
13572 Py_DECREF(nonfmt);
13573 if (r)
13574 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013575 }
13576 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013577 /* Got a format specifier */
13578 int flags = 0;
13579 Py_ssize_t width = -1;
13580 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013581 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013582 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000013583 int isnumok;
13584 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013585 void *pbuf = NULL;
13586 Py_ssize_t pindex, len;
13587 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013588
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013589 fmtpos++;
13590 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
13591 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013592 Py_ssize_t keylen;
13593 PyObject *key;
13594 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013595
Benjamin Peterson29060642009-01-31 22:14:21 +000013596 if (dict == NULL) {
13597 PyErr_SetString(PyExc_TypeError,
13598 "format requires a mapping");
13599 goto onError;
13600 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013601 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013602 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013603 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013604 /* Skip over balanced parentheses */
13605 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013606 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013607 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013608 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013609 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013610 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013611 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013612 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013613 if (fmtcnt < 0 || pcount > 0) {
13614 PyErr_SetString(PyExc_ValueError,
13615 "incomplete format key");
13616 goto onError;
13617 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013618 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013619 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013620 if (key == NULL)
13621 goto onError;
13622 if (args_owned) {
13623 Py_DECREF(args);
13624 args_owned = 0;
13625 }
13626 args = PyObject_GetItem(dict, key);
13627 Py_DECREF(key);
13628 if (args == NULL) {
13629 goto onError;
13630 }
13631 args_owned = 1;
13632 arglen = -1;
13633 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013634 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013635 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013636 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013637 case '-': flags |= F_LJUST; continue;
13638 case '+': flags |= F_SIGN; continue;
13639 case ' ': flags |= F_BLANK; continue;
13640 case '#': flags |= F_ALT; continue;
13641 case '0': flags |= F_ZERO; continue;
13642 }
13643 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013644 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013645 if (c == '*') {
13646 v = getnextarg(args, arglen, &argidx);
13647 if (v == NULL)
13648 goto onError;
13649 if (!PyLong_Check(v)) {
13650 PyErr_SetString(PyExc_TypeError,
13651 "* wants int");
13652 goto onError;
13653 }
13654 width = PyLong_AsLong(v);
13655 if (width == -1 && PyErr_Occurred())
13656 goto onError;
13657 if (width < 0) {
13658 flags |= F_LJUST;
13659 width = -width;
13660 }
13661 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013662 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013663 }
13664 else if (c >= '0' && c <= '9') {
13665 width = c - '0';
13666 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013667 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013668 if (c < '0' || c > '9')
13669 break;
13670 if ((width*10) / 10 != width) {
13671 PyErr_SetString(PyExc_ValueError,
13672 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013673 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013674 }
13675 width = width*10 + (c - '0');
13676 }
13677 }
13678 if (c == '.') {
13679 prec = 0;
13680 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013681 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013682 if (c == '*') {
13683 v = getnextarg(args, arglen, &argidx);
13684 if (v == NULL)
13685 goto onError;
13686 if (!PyLong_Check(v)) {
13687 PyErr_SetString(PyExc_TypeError,
13688 "* wants int");
13689 goto onError;
13690 }
13691 prec = PyLong_AsLong(v);
13692 if (prec == -1 && PyErr_Occurred())
13693 goto onError;
13694 if (prec < 0)
13695 prec = 0;
13696 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013697 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013698 }
13699 else if (c >= '0' && c <= '9') {
13700 prec = c - '0';
13701 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013702 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013703 if (c < '0' || c > '9')
13704 break;
13705 if ((prec*10) / 10 != prec) {
13706 PyErr_SetString(PyExc_ValueError,
13707 "prec too big");
13708 goto onError;
13709 }
13710 prec = prec*10 + (c - '0');
13711 }
13712 }
13713 } /* prec */
13714 if (fmtcnt >= 0) {
13715 if (c == 'h' || c == 'l' || c == 'L') {
13716 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013717 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013718 }
13719 }
13720 if (fmtcnt < 0) {
13721 PyErr_SetString(PyExc_ValueError,
13722 "incomplete format");
13723 goto onError;
13724 }
13725 if (c != '%') {
13726 v = getnextarg(args, arglen, &argidx);
13727 if (v == NULL)
13728 goto onError;
13729 }
13730 sign = 0;
13731 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013732 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013733 switch (c) {
13734
13735 case '%':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013736 _PyAccu_Accumulate(&acc, percent);
13737 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013738
13739 case 's':
13740 case 'r':
13741 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013742 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013743 temp = v;
13744 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013745 }
13746 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013747 if (c == 's')
13748 temp = PyObject_Str(v);
13749 else if (c == 'r')
13750 temp = PyObject_Repr(v);
13751 else
13752 temp = PyObject_ASCII(v);
13753 if (temp == NULL)
13754 goto onError;
13755 if (PyUnicode_Check(temp))
13756 /* nothing to do */;
13757 else {
13758 Py_DECREF(temp);
13759 PyErr_SetString(PyExc_TypeError,
13760 "%s argument has non-string str()");
13761 goto onError;
13762 }
13763 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013764 if (PyUnicode_READY(temp) == -1) {
13765 Py_CLEAR(temp);
13766 goto onError;
13767 }
13768 pbuf = PyUnicode_DATA(temp);
13769 kind = PyUnicode_KIND(temp);
13770 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013771 if (prec >= 0 && len > prec)
13772 len = prec;
13773 break;
13774
13775 case 'i':
13776 case 'd':
13777 case 'u':
13778 case 'o':
13779 case 'x':
13780 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013781 isnumok = 0;
13782 if (PyNumber_Check(v)) {
13783 PyObject *iobj=NULL;
13784
13785 if (PyLong_Check(v)) {
13786 iobj = v;
13787 Py_INCREF(iobj);
13788 }
13789 else {
13790 iobj = PyNumber_Long(v);
13791 }
13792 if (iobj!=NULL) {
13793 if (PyLong_Check(iobj)) {
13794 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013795 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013796 Py_DECREF(iobj);
13797 if (!temp)
13798 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013799 if (PyUnicode_READY(temp) == -1) {
13800 Py_CLEAR(temp);
13801 goto onError;
13802 }
13803 pbuf = PyUnicode_DATA(temp);
13804 kind = PyUnicode_KIND(temp);
13805 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013806 sign = 1;
13807 }
13808 else {
13809 Py_DECREF(iobj);
13810 }
13811 }
13812 }
13813 if (!isnumok) {
13814 PyErr_Format(PyExc_TypeError,
13815 "%%%c format: a number is required, "
13816 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13817 goto onError;
13818 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013819 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013820 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013821 fillobj = zero;
13822 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013823 break;
13824
13825 case 'e':
13826 case 'E':
13827 case 'f':
13828 case 'F':
13829 case 'g':
13830 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013831 temp = formatfloat(v, flags, prec, c);
13832 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013833 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013834 if (PyUnicode_READY(temp) == -1) {
13835 Py_CLEAR(temp);
13836 goto onError;
13837 }
13838 pbuf = PyUnicode_DATA(temp);
13839 kind = PyUnicode_KIND(temp);
13840 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013841 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013842 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013843 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013844 fillobj = zero;
13845 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013846 break;
13847
13848 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013849 {
13850 Py_UCS4 ch = formatchar(v);
13851 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013852 goto onError;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013853 temp = _PyUnicode_FromUCS4(&ch, 1);
13854 if (temp == NULL)
13855 goto onError;
13856 pbuf = PyUnicode_DATA(temp);
13857 kind = PyUnicode_KIND(temp);
13858 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013859 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013860 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013861
13862 default:
13863 PyErr_Format(PyExc_ValueError,
13864 "unsupported format character '%c' (0x%x) "
13865 "at index %zd",
13866 (31<=c && c<=126) ? (char)c : '?',
13867 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013868 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013869 goto onError;
13870 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013871 /* pbuf is initialized here. */
13872 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013873 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013874 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13875 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013876 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013877 pindex++;
13878 }
13879 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13880 signobj = plus;
13881 len--;
13882 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013883 }
13884 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013885 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013886 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013887 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013888 else
13889 sign = 0;
13890 }
13891 if (width < len)
13892 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013893 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013894 if (fill != ' ') {
13895 assert(signobj != NULL);
13896 if (_PyAccu_Accumulate(&acc, signobj))
13897 goto onError;
13898 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013899 if (width > len)
13900 width--;
13901 }
13902 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013903 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013904 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013905 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013906 second = get_latin1_char(
13907 PyUnicode_READ(kind, pbuf, pindex + 1));
13908 pindex += 2;
13909 if (second == NULL ||
13910 _PyAccu_Accumulate(&acc, zero) ||
13911 _PyAccu_Accumulate(&acc, second))
13912 goto onError;
13913 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000013914 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013915 width -= 2;
13916 if (width < 0)
13917 width = 0;
13918 len -= 2;
13919 }
13920 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013921 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013922 if (repeat_accumulate(&acc, fillobj, width - len))
13923 goto onError;
13924 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013925 }
13926 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013927 if (sign) {
13928 assert(signobj != NULL);
13929 if (_PyAccu_Accumulate(&acc, signobj))
13930 goto onError;
13931 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013932 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013933 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13934 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013935 second = get_latin1_char(
13936 PyUnicode_READ(kind, pbuf, pindex + 1));
13937 pindex += 2;
13938 if (second == NULL ||
13939 _PyAccu_Accumulate(&acc, zero) ||
13940 _PyAccu_Accumulate(&acc, second))
13941 goto onError;
13942 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013943 }
13944 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013945 /* Copy all characters, preserving len */
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013946 if (temp != NULL) {
13947 assert(pbuf == PyUnicode_DATA(temp));
13948 v = PyUnicode_Substring(temp, pindex, pindex + len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013949 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013950 else {
13951 const char *p = (const char *) pbuf;
13952 assert(pbuf != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013953 p += kind * pindex;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013954 v = PyUnicode_FromKindAndData(kind, p, len);
13955 }
13956 if (v == NULL)
13957 goto onError;
13958 r = _PyAccu_Accumulate(&acc, v);
13959 Py_DECREF(v);
13960 if (r)
13961 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013962 if (width > len && repeat_accumulate(&acc, blank, width - len))
13963 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013964 if (dict && (argidx < arglen) && c != '%') {
13965 PyErr_SetString(PyExc_TypeError,
13966 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013967 goto onError;
13968 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013969 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013970 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013971 } /* until end */
13972 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013973 PyErr_SetString(PyExc_TypeError,
13974 "not all arguments converted during string formatting");
13975 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013976 }
13977
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013978 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013979 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013980 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013981 }
13982 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013983 Py_XDECREF(temp);
13984 Py_XDECREF(second);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013985 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013986
Benjamin Peterson29060642009-01-31 22:14:21 +000013987 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013988 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013989 Py_XDECREF(temp);
13990 Py_XDECREF(second);
13991 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013992 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013993 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013994 }
13995 return NULL;
13996}
13997
Jeremy Hylton938ace62002-07-17 16:30:39 +000013998static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013999unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14000
Tim Peters6d6c1a32001-08-02 04:15:00 +000014001static PyObject *
14002unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14003{
Benjamin Peterson29060642009-01-31 22:14:21 +000014004 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014005 static char *kwlist[] = {"object", "encoding", "errors", 0};
14006 char *encoding = NULL;
14007 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014008
Benjamin Peterson14339b62009-01-31 16:36:08 +000014009 if (type != &PyUnicode_Type)
14010 return unicode_subtype_new(type, args, kwds);
14011 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014012 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014013 return NULL;
Victor Stinner382955f2011-12-11 21:44:00 +010014014 if (x == NULL) {
14015 Py_INCREF(unicode_empty);
14016 return unicode_empty;
14017 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014018 if (encoding == NULL && errors == NULL)
14019 return PyObject_Str(x);
14020 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014021 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014022}
14023
Guido van Rossume023fe02001-08-30 03:12:59 +000014024static PyObject *
14025unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14026{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014027 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014028 Py_ssize_t length, char_size;
14029 int share_wstr, share_utf8;
14030 unsigned int kind;
14031 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014032
Benjamin Peterson14339b62009-01-31 16:36:08 +000014033 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014034
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014035 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014036 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014037 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014038 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014039 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014040 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014041 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014042 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014043
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014044 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014045 if (self == NULL) {
14046 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014047 return NULL;
14048 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014049 kind = PyUnicode_KIND(unicode);
14050 length = PyUnicode_GET_LENGTH(unicode);
14051
14052 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014053#ifdef Py_DEBUG
14054 _PyUnicode_HASH(self) = -1;
14055#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014056 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014057#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014058 _PyUnicode_STATE(self).interned = 0;
14059 _PyUnicode_STATE(self).kind = kind;
14060 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014061 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014062 _PyUnicode_STATE(self).ready = 1;
14063 _PyUnicode_WSTR(self) = NULL;
14064 _PyUnicode_UTF8_LENGTH(self) = 0;
14065 _PyUnicode_UTF8(self) = NULL;
14066 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014067 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014068
14069 share_utf8 = 0;
14070 share_wstr = 0;
14071 if (kind == PyUnicode_1BYTE_KIND) {
14072 char_size = 1;
14073 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14074 share_utf8 = 1;
14075 }
14076 else if (kind == PyUnicode_2BYTE_KIND) {
14077 char_size = 2;
14078 if (sizeof(wchar_t) == 2)
14079 share_wstr = 1;
14080 }
14081 else {
14082 assert(kind == PyUnicode_4BYTE_KIND);
14083 char_size = 4;
14084 if (sizeof(wchar_t) == 4)
14085 share_wstr = 1;
14086 }
14087
14088 /* Ensure we won't overflow the length. */
14089 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14090 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014091 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014092 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014093 data = PyObject_MALLOC((length + 1) * char_size);
14094 if (data == NULL) {
14095 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014096 goto onError;
14097 }
14098
Victor Stinnerc3c74152011-10-02 20:39:55 +020014099 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014100 if (share_utf8) {
14101 _PyUnicode_UTF8_LENGTH(self) = length;
14102 _PyUnicode_UTF8(self) = data;
14103 }
14104 if (share_wstr) {
14105 _PyUnicode_WSTR_LENGTH(self) = length;
14106 _PyUnicode_WSTR(self) = (wchar_t *)data;
14107 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014108
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014109 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014110 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014111 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014112#ifdef Py_DEBUG
14113 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14114#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014115 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014116 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014117
14118onError:
14119 Py_DECREF(unicode);
14120 Py_DECREF(self);
14121 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014122}
14123
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014124PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000014125 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014126\n\
Collin Winterd474ce82007-08-07 19:42:11 +000014127Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000014128encoding defaults to the current default string encoding.\n\
14129errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014130
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014131static PyObject *unicode_iter(PyObject *seq);
14132
Guido van Rossumd57fd912000-03-10 22:53:23 +000014133PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014134 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014135 "str", /* tp_name */
14136 sizeof(PyUnicodeObject), /* tp_size */
14137 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014138 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014139 (destructor)unicode_dealloc, /* tp_dealloc */
14140 0, /* tp_print */
14141 0, /* tp_getattr */
14142 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014143 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014144 unicode_repr, /* tp_repr */
14145 &unicode_as_number, /* tp_as_number */
14146 &unicode_as_sequence, /* tp_as_sequence */
14147 &unicode_as_mapping, /* tp_as_mapping */
14148 (hashfunc) unicode_hash, /* tp_hash*/
14149 0, /* tp_call*/
14150 (reprfunc) unicode_str, /* tp_str */
14151 PyObject_GenericGetAttr, /* tp_getattro */
14152 0, /* tp_setattro */
14153 0, /* tp_as_buffer */
14154 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014155 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014156 unicode_doc, /* tp_doc */
14157 0, /* tp_traverse */
14158 0, /* tp_clear */
14159 PyUnicode_RichCompare, /* tp_richcompare */
14160 0, /* tp_weaklistoffset */
14161 unicode_iter, /* tp_iter */
14162 0, /* tp_iternext */
14163 unicode_methods, /* tp_methods */
14164 0, /* tp_members */
14165 0, /* tp_getset */
14166 &PyBaseObject_Type, /* tp_base */
14167 0, /* tp_dict */
14168 0, /* tp_descr_get */
14169 0, /* tp_descr_set */
14170 0, /* tp_dictoffset */
14171 0, /* tp_init */
14172 0, /* tp_alloc */
14173 unicode_new, /* tp_new */
14174 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014175};
14176
14177/* Initialize the Unicode implementation */
14178
Victor Stinner3a50e702011-10-18 21:21:00 +020014179int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014180{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014181 int i;
14182
Thomas Wouters477c8d52006-05-27 19:21:47 +000014183 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014184 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014185 0x000A, /* LINE FEED */
14186 0x000D, /* CARRIAGE RETURN */
14187 0x001C, /* FILE SEPARATOR */
14188 0x001D, /* GROUP SEPARATOR */
14189 0x001E, /* RECORD SEPARATOR */
14190 0x0085, /* NEXT LINE */
14191 0x2028, /* LINE SEPARATOR */
14192 0x2029, /* PARAGRAPH SEPARATOR */
14193 };
14194
Fred Drakee4315f52000-05-09 19:53:39 +000014195 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020014196 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014197 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014198 Py_FatalError("Can't create empty string");
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010014199 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014200
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014201 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000014202 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000014203 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014204 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014205
14206 /* initialize the linebreak bloom filter */
14207 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014208 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014209 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014210
14211 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020014212
14213#ifdef HAVE_MBCS
14214 winver.dwOSVersionInfoSize = sizeof(winver);
14215 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14216 PyErr_SetFromWindowsErr(0);
14217 return -1;
14218 }
14219#endif
14220 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014221}
14222
14223/* Finalize the Unicode implementation */
14224
/* This implementation releases nothing and always reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
14230
Guido van Rossumd57fd912000-03-10 22:53:23 +000014231void
Thomas Wouters78890102000-07-22 19:25:51 +000014232_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014233{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014234 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014235
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000014236 Py_XDECREF(unicode_empty);
14237 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014238
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014239 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014240 if (unicode_latin1[i]) {
14241 Py_DECREF(unicode_latin1[i]);
14242 unicode_latin1[i] = NULL;
14243 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014244 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014245 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014246 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014247}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014248
Walter Dörwald16807132007-05-25 13:52:07 +000014249void
14250PyUnicode_InternInPlace(PyObject **p)
14251{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014252 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014253 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014254#ifdef Py_DEBUG
14255 assert(s != NULL);
14256 assert(_PyUnicode_CHECK(s));
14257#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014258 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014259 return;
14260#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014261 /* If it's a subclass, we don't really know what putting
14262 it in the interned dict might do. */
14263 if (!PyUnicode_CheckExact(s))
14264 return;
14265 if (PyUnicode_CHECK_INTERNED(s))
14266 return;
14267 if (interned == NULL) {
14268 interned = PyDict_New();
14269 if (interned == NULL) {
14270 PyErr_Clear(); /* Don't leave an exception */
14271 return;
14272 }
14273 }
14274 /* It might be that the GetItem call fails even
14275 though the key is present in the dictionary,
14276 namely when this happens during a stack overflow. */
14277 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014278 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014279 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014280
Benjamin Peterson29060642009-01-31 22:14:21 +000014281 if (t) {
14282 Py_INCREF(t);
14283 Py_DECREF(*p);
14284 *p = t;
14285 return;
14286 }
Walter Dörwald16807132007-05-25 13:52:07 +000014287
Benjamin Peterson14339b62009-01-31 16:36:08 +000014288 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014289 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014290 PyErr_Clear();
14291 PyThreadState_GET()->recursion_critical = 0;
14292 return;
14293 }
14294 PyThreadState_GET()->recursion_critical = 0;
14295 /* The two references in interned are not counted by refcnt.
14296 The deallocator will take care of this */
14297 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014298 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014299}
14300
14301void
14302PyUnicode_InternImmortal(PyObject **p)
14303{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014304 PyUnicode_InternInPlace(p);
14305 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014306 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014307 Py_INCREF(*p);
14308 }
Walter Dörwald16807132007-05-25 13:52:07 +000014309}
14310
14311PyObject *
14312PyUnicode_InternFromString(const char *cp)
14313{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014314 PyObject *s = PyUnicode_FromString(cp);
14315 if (s == NULL)
14316 return NULL;
14317 PyUnicode_InternInPlace(&s);
14318 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014319}
14320
Alexander Belopolsky40018472011-02-26 01:02:56 +000014321void
14322_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014323{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014324 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014325 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014326 Py_ssize_t i, n;
14327 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014328
Benjamin Peterson14339b62009-01-31 16:36:08 +000014329 if (interned == NULL || !PyDict_Check(interned))
14330 return;
14331 keys = PyDict_Keys(interned);
14332 if (keys == NULL || !PyList_Check(keys)) {
14333 PyErr_Clear();
14334 return;
14335 }
Walter Dörwald16807132007-05-25 13:52:07 +000014336
Benjamin Peterson14339b62009-01-31 16:36:08 +000014337 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14338 detector, interned unicode strings are not forcibly deallocated;
14339 rather, we give them their stolen references back, and then clear
14340 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014341
Benjamin Peterson14339b62009-01-31 16:36:08 +000014342 n = PyList_GET_SIZE(keys);
14343 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014344 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014345 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014346 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014347 if (PyUnicode_READY(s) == -1) {
14348 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014349 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014350 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014351 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014352 case SSTATE_NOT_INTERNED:
14353 /* XXX Shouldn't happen */
14354 break;
14355 case SSTATE_INTERNED_IMMORTAL:
14356 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014357 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014358 break;
14359 case SSTATE_INTERNED_MORTAL:
14360 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014361 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014362 break;
14363 default:
14364 Py_FatalError("Inconsistent interned string state.");
14365 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014366 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014367 }
14368 fprintf(stderr, "total size of all interned strings: "
14369 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14370 "mortal/immortal\n", mortal_size, immortal_size);
14371 Py_DECREF(keys);
14372 PyDict_Clear(interned);
14373 Py_DECREF(interned);
14374 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000014375}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014376
14377
14378/********************* Unicode Iterator **************************/
14379
14380typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014381 PyObject_HEAD
14382 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014383 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014384} unicodeiterobject;
14385
14386static void
14387unicodeiter_dealloc(unicodeiterobject *it)
14388{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014389 _PyObject_GC_UNTRACK(it);
14390 Py_XDECREF(it->it_seq);
14391 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014392}
14393
14394static int
14395unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14396{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014397 Py_VISIT(it->it_seq);
14398 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014399}
14400
14401static PyObject *
14402unicodeiter_next(unicodeiterobject *it)
14403{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014404 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014405
Benjamin Peterson14339b62009-01-31 16:36:08 +000014406 assert(it != NULL);
14407 seq = it->it_seq;
14408 if (seq == NULL)
14409 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014410 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014411
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014412 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14413 int kind = PyUnicode_KIND(seq);
14414 void *data = PyUnicode_DATA(seq);
14415 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14416 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014417 if (item != NULL)
14418 ++it->it_index;
14419 return item;
14420 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014421
Benjamin Peterson14339b62009-01-31 16:36:08 +000014422 Py_DECREF(seq);
14423 it->it_seq = NULL;
14424 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014425}
14426
14427static PyObject *
14428unicodeiter_len(unicodeiterobject *it)
14429{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014430 Py_ssize_t len = 0;
14431 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014432 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014433 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014434}
14435
14436PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14437
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014438static PyObject *
14439unicodeiter_reduce(unicodeiterobject *it)
14440{
14441 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020014442 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014443 it->it_seq, it->it_index);
14444 } else {
14445 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
14446 if (u == NULL)
14447 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020014448 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014449 }
14450}
14451
14452PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14453
14454static PyObject *
14455unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14456{
14457 Py_ssize_t index = PyLong_AsSsize_t(state);
14458 if (index == -1 && PyErr_Occurred())
14459 return NULL;
14460 if (index < 0)
14461 index = 0;
14462 it->it_index = index;
14463 Py_RETURN_NONE;
14464}
14465
14466PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14467
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014468static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014469 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014470 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014471 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
14472 reduce_doc},
14473 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
14474 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014475 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014476};
14477
14478PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014479 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14480 "str_iterator", /* tp_name */
14481 sizeof(unicodeiterobject), /* tp_basicsize */
14482 0, /* tp_itemsize */
14483 /* methods */
14484 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14485 0, /* tp_print */
14486 0, /* tp_getattr */
14487 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014488 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014489 0, /* tp_repr */
14490 0, /* tp_as_number */
14491 0, /* tp_as_sequence */
14492 0, /* tp_as_mapping */
14493 0, /* tp_hash */
14494 0, /* tp_call */
14495 0, /* tp_str */
14496 PyObject_GenericGetAttr, /* tp_getattro */
14497 0, /* tp_setattro */
14498 0, /* tp_as_buffer */
14499 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14500 0, /* tp_doc */
14501 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14502 0, /* tp_clear */
14503 0, /* tp_richcompare */
14504 0, /* tp_weaklistoffset */
14505 PyObject_SelfIter, /* tp_iter */
14506 (iternextfunc)unicodeiter_next, /* tp_iternext */
14507 unicodeiter_methods, /* tp_methods */
14508 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014509};
14510
14511static PyObject *
14512unicode_iter(PyObject *seq)
14513{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014514 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014515
Benjamin Peterson14339b62009-01-31 16:36:08 +000014516 if (!PyUnicode_Check(seq)) {
14517 PyErr_BadInternalCall();
14518 return NULL;
14519 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014520 if (PyUnicode_READY(seq) == -1)
14521 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014522 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14523 if (it == NULL)
14524 return NULL;
14525 it->it_index = 0;
14526 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014527 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014528 _PyObject_GC_TRACK(it);
14529 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014530}
14531
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014532
14533size_t
14534Py_UNICODE_strlen(const Py_UNICODE *u)
14535{
14536 int res = 0;
14537 while(*u++)
14538 res++;
14539 return res;
14540}
14541
14542Py_UNICODE*
14543Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14544{
14545 Py_UNICODE *u = s1;
14546 while ((*u++ = *s2++));
14547 return s1;
14548}
14549
14550Py_UNICODE*
14551Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14552{
14553 Py_UNICODE *u = s1;
14554 while ((*u++ = *s2++))
14555 if (n-- == 0)
14556 break;
14557 return s1;
14558}
14559
14560Py_UNICODE*
14561Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14562{
14563 Py_UNICODE *u1 = s1;
14564 u1 += Py_UNICODE_strlen(u1);
14565 Py_UNICODE_strcpy(u1, s2);
14566 return s1;
14567}
14568
14569int
14570Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14571{
14572 while (*s1 && *s2 && *s1 == *s2)
14573 s1++, s2++;
14574 if (*s1 && *s2)
14575 return (*s1 < *s2) ? -1 : +1;
14576 if (*s1)
14577 return 1;
14578 if (*s2)
14579 return -1;
14580 return 0;
14581}
14582
14583int
14584Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14585{
14586 register Py_UNICODE u1, u2;
14587 for (; n != 0; n--) {
14588 u1 = *s1;
14589 u2 = *s2;
14590 if (u1 != u2)
14591 return (u1 < u2) ? -1 : +1;
14592 if (u1 == '\0')
14593 return 0;
14594 s1++;
14595 s2++;
14596 }
14597 return 0;
14598}
14599
14600Py_UNICODE*
14601Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14602{
14603 const Py_UNICODE *p;
14604 for (p = s; *p; p++)
14605 if (*p == c)
14606 return (Py_UNICODE*)p;
14607 return NULL;
14608}
14609
14610Py_UNICODE*
14611Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14612{
14613 const Py_UNICODE *p;
14614 p = s + Py_UNICODE_strlen(s);
14615 while (p != s) {
14616 p--;
14617 if (*p == c)
14618 return (Py_UNICODE*)p;
14619 }
14620 return NULL;
14621}
Victor Stinner331ea922010-08-10 16:37:20 +000014622
Victor Stinner71133ff2010-09-01 23:43:53 +000014623Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014624PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014625{
Victor Stinner577db2c2011-10-11 22:12:48 +020014626 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014627 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014628
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014629 if (!PyUnicode_Check(unicode)) {
14630 PyErr_BadArgument();
14631 return NULL;
14632 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014633 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014634 if (u == NULL)
14635 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014636 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014637 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014638 PyErr_NoMemory();
14639 return NULL;
14640 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014641 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014642 size *= sizeof(Py_UNICODE);
14643 copy = PyMem_Malloc(size);
14644 if (copy == NULL) {
14645 PyErr_NoMemory();
14646 return NULL;
14647 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014648 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014649 return copy;
14650}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014651
Georg Brandl66c221e2010-10-14 07:04:07 +000014652/* A _string module, to export formatter_parser and formatter_field_name_split
14653 to the string.Formatter class implemented in Python. */
14654
14655static PyMethodDef _string_methods[] = {
14656 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14657 METH_O, PyDoc_STR("split the argument as a field name")},
14658 {"formatter_parser", (PyCFunction) formatter_parser,
14659 METH_O, PyDoc_STR("parse the argument as a format string")},
14660 {NULL, NULL}
14661};
14662
14663static struct PyModuleDef _string_module = {
14664 PyModuleDef_HEAD_INIT,
14665 "_string",
14666 PyDoc_STR("string helper module"),
14667 0,
14668 _string_methods,
14669 NULL,
14670 NULL,
14671 NULL,
14672 NULL
14673};
14674
14675PyMODINIT_FUNC
14676PyInit__string(void)
14677{
14678 return PyModule_Create(&_string_module);
14679}
14680
14681
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014682#ifdef __cplusplus
14683}
14684#endif