blob: 18bc07f908d2b6a3737df78336513722ec8f3492 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
/* Byte-order selection: big endian when the build defines WORDS_BIGENDIAN,
   little endian otherwise (the default). */

#if !defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
57
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000058/* --- Globals ------------------------------------------------------------
59
60 The globals are initialized by the _PyUnicode_Init() API and should
61 not be used before calling that API.
62
63*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000064
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000065
66#ifdef __cplusplus
67extern "C" {
68#endif
69
/* Largest code point of Unicode 6.0: U+10FFFF (1,114,111). */
#define MAX_UNICODE 0x10FFFF

/* Sanity check applied by the accessor macros of this file: debug builds run
   the structural consistency checker, other builds only do the type check. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020078
/* Field accessors.

   The underscore-prefixed macros without an assert() expand to the struct
   member itself and can therefore be assigned to.  The remaining macros
   validate the object first and yield a value only. */

/* UTF-8 buffer pointer stored in the compact struct. */
#define _PyUnicode_UTF8(op) (((PyCompactUnicodeObject *)(op))->utf8)

/* UTF-8 buffer of a ready string: for compact ASCII objects this is the
   memory directly following the PyASCIIObject header, otherwise the stored
   utf8 pointer. */
#define PyUnicode_UTF8(op)                                      \
    (assert(_PyUnicode_CHECK(op)),                              \
     assert(PyUnicode_IS_READY(op)),                            \
     !PyUnicode_IS_COMPACT_ASCII(op) ?                          \
         _PyUnicode_UTF8(op) :                                  \
         ((char *)((PyASCIIObject *)(op) + 1)))

/* UTF-8 length stored in the compact struct. */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject *)(op))->utf8_length)

/* UTF-8 length of a ready string: for compact ASCII objects this is the
   string length, otherwise the stored utf8_length. */
#define PyUnicode_UTF8_LENGTH(op)                               \
    (assert(_PyUnicode_CHECK(op)),                              \
     assert(PyUnicode_IS_READY(op)),                            \
     !PyUnicode_IS_COMPACT_ASCII(op) ?                          \
         _PyUnicode_UTF8_LENGTH(op) :                           \
         ((PyASCIIObject *)(op))->length)

/* wchar_t representation and its length. */
#define _PyUnicode_WSTR(op) (((PyASCIIObject *)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject *)(op))->wstr_length)

/* Members of the PyASCIIObject header. */
#define _PyUnicode_LENGTH(op) (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) (((PyASCIIObject *)(op))->hash)

/* Checked reads of the kind and the length. */
#define _PyUnicode_KIND(op)                                     \
    (assert(_PyUnicode_CHECK(op)),                              \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                               \
    (assert(_PyUnicode_CHECK(op)),                              \
     ((PyASCIIObject *)(op))->length)

/* Data pointer stored in the full PyUnicodeObject struct. */
#define _PyUnicode_DATA_ANY(op) (((PyUnicodeObject *)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200113
/* File-local replacement for the public PyUnicode_READY(): validates the
   object, then yields 0 when it is already ready and the result of
   _PyUnicode_Ready() otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                     \
    (assert(_PyUnicode_CHECK(op)),              \
     (!PyUnicode_IS_READY(op) ?                 \
          _PyUnicode_Ready(op) :                \
          0))
Victor Stinner910337b2011-10-03 03:20:16 +0200120
/* True when the UTF-8 pointer of the object is the very same buffer as its
   canonical character data.  Must not be used on compact ASCII objects
   (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True when the wstr pointer of the object is the very same buffer as its
   canonical character data.  The macro argument is used throughout, so it
   works for any expression and not only for a variable named "unicode". */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
128
/* Non-zero when the Unicode object owns a UTF-8 memory block of its own:
   the object is not compact ASCII, the utf8 pointer is set, and it does not
   alias the canonical character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     !(PyUnicode_IS_COMPACT_ASCII(op)                   \
       || !_PyUnicode_UTF8(op)                          \
       || _PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
136
/* Non-zero when the Unicode object owns a wstr memory block of its own: the
   wstr pointer is set and, for a ready string, does not alias the canonical
   character data.  A string that is not ready always owns its wstr. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) &&                            \
      !(PyUnicode_IS_READY(op) &&                       \
        _PyUnicode_WSTR(op) == PyUnicode_DATA(op))))
144
/* Generic helper macro to convert characters of different types.

   from_type, to_type -- valid type names of the source and target characters
   begin, end         -- pointers of type "from_type *" delimiting the source
                         characters (end is exclusive)
   to                 -- pointer to the buffer receiving the converted
                         characters; it is converted to "to_type *"

   Each source character is converted with a plain cast.  The main loop is
   unrolled four characters at a time; the remaining 0..3 characters are
   handled by the trailing loop.

   The "to" argument is parenthesized before being cast, so that an
   expression such as "buf + offset" is evaluated in its own pointer type
   first and only the result is converted. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _count = (_end) - (_iter);           \
        const from_type *_unrolled_end =                \
            _iter + (_count & ~ (Py_ssize_t) 3);        \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200168
Walter Dörwald16807132007-05-25 13:52:07 +0000169/* This dictionary holds all interned unicode strings. Note that references
170 to strings in this dictionary are *not* counted in the string's ob_refcnt.
171 When the interned string reaches a refcnt of 0 the string deallocation
172 function will delete the reference from this dictionary.
173
174 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000175 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000176*/
177static PyObject *interned;
178
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000179/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200180static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000181
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200182/* List of static strings. */
183static _Py_Identifier *static_strings;
184
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000185/* Single character Unicode strings in the Latin-1 range are being
186 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200187static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000188
/* Fast detection of the most frequent whitespace characters: entry c is 1
   when the ASCII character with code c is whitespace, 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
219
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200220/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200221static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200222static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200223static void copy_characters(
224 PyObject *to, Py_ssize_t to_start,
225 PyObject *from, Py_ssize_t from_start,
226 Py_ssize_t how_many);
Victor Stinner488fa492011-12-12 00:01:39 +0100227static int unicode_modifiable(PyObject *unicode);
228
Victor Stinnerfe226c02011-10-03 03:52:20 +0200229
Alexander Belopolsky40018472011-02-26 01:02:56 +0000230static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200231unicode_fromascii(const unsigned char *s, Py_ssize_t size);
232static PyObject *
233_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
234static PyObject *
235_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
236static PyObject *
237_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
238
239static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000240unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000241 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100242 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000243 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
244
Alexander Belopolsky40018472011-02-26 01:02:56 +0000245static void
246raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300247 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100248 PyObject *unicode,
249 Py_ssize_t startpos, Py_ssize_t endpos,
250 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000251
/* Fast detection of line breaks in the ASCII range: entry c is 1 when the
   ASCII character with code c is a line break, 0 otherwise.  The table is
   only ever read, hence const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
279
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300280/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
281 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000282Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000283PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000284{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000285#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000286 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000287#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000288 /* This is actually an illegal character, so it should
289 not be passed to unichr. */
290 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000291#endif
292}
293
#ifdef Py_DEBUG
/* Debug-build validator for the internal layout of a unicode object.

   Every invariant is verified with assert(), so a violation aborts.  When
   all checks pass the function returns 1, which allows callers to wrap the
   call itself in assert().

   op            -- object to inspect; must satisfy PyUnicode_Check()
   check_content -- when non-zero, and the kind is not PyUnicode_WCHAR_KIND,
                    the characters are scanned as well to verify that the
                    narrowest suitable kind is used and that the buffer is
                    terminated by a null character */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *base;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    base = (PyASCIIObject *)op;
    kind = base->state.kind;

    if (base->state.ascii == 1 && base->state.compact == 1) {
        /* compact ASCII object */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(base->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;
        /* the kind whose character width equals sizeof(wchar_t) */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wchar_sized_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wchar_sized_kind = PyUnicode_4BYTE_KIND;
#endif

        if (base->state.compact == 1) {
            /* compact non-ASCII object: the characters directly follow the
               PyCompactUnicodeObject struct */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(base->state.ascii == 0);
            assert(base->state.ready == 1);
            assert(compact->utf8 != data);
        }
        else {
            PyUnicodeObject *full = (PyUnicodeObject *)op;

            data = full->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wstr representation exists */
                assert(base->length == 0);
                assert(base->hash == -1);
                assert(base->state.compact == 0);
                assert(base->state.ascii == 0);
                assert(base->state.ready == 0);
                assert(base->state.interned == SSTATE_NOT_INTERNED);
                assert(base->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* ready, non-compact object */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(base->state.compact == 0);
                assert(base->state.ready == 1);
                assert(data != NULL);
                if (base->state.ascii) {
                    assert(compact->utf8 == data);
                    assert(compact->utf8_length == base->length);
                }
                else {
                    assert(compact->utf8 != data);
                }
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr aliases the data exactly when the character width
               matches wchar_t */
            if (kind == wchar_sized_kind) {
                assert(base->wstr == data);
                assert(compact->wstr_length == base->length);
            }
            else {
                assert(base->wstr != data);
            }
        }

        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (base->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_UCS4 maxchar = 0;
        Py_UCS4 ch;
        Py_ssize_t pos;
        void *chars;

        chars = PyUnicode_DATA(op);
        for (pos = 0; pos < base->length; pos++) {
            ch = PyUnicode_READ(kind, chars, pos);
            if (maxchar < ch)
                maxchar = ch;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (base->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else {
                assert(maxchar < 128);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
            break;
        default:
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
            break;
        }
        /* the buffer must end with a null character */
        assert(PyUnicode_READ(kind, chars, base->length) == 0);
    }
    return 1;
}
#endif
409
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100410static PyObject*
411unicode_result_wchar(PyObject *unicode)
412{
413#ifndef Py_DEBUG
414 Py_ssize_t len;
415
416 assert(Py_REFCNT(unicode) == 1);
417
418 len = _PyUnicode_WSTR_LENGTH(unicode);
419 if (len == 0) {
420 Py_INCREF(unicode_empty);
421 Py_DECREF(unicode);
422 return unicode_empty;
423 }
424
425 if (len == 1) {
426 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
427 if (ch < 256) {
428 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
429 Py_DECREF(unicode);
430 return latin1_char;
431 }
432 }
433
434 if (_PyUnicode_Ready(unicode) < 0) {
435 Py_XDECREF(unicode);
436 return NULL;
437 }
438#else
439 /* don't make the result ready in debug mode to ensure that the caller
440 makes the string ready before using it */
441 assert(_PyUnicode_CheckConsistency(unicode, 1));
442#endif
443 return unicode;
444}
445
446static PyObject*
447unicode_result_ready(PyObject *unicode)
448{
449 Py_ssize_t length;
450
451 length = PyUnicode_GET_LENGTH(unicode);
452 if (length == 0) {
453 if (unicode != unicode_empty) {
454 Py_INCREF(unicode_empty);
455 Py_DECREF(unicode);
456 }
457 return unicode_empty;
458 }
459
460 if (length == 1) {
461 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
462 if (ch < 256) {
463 PyObject *latin1_char = unicode_latin1[ch];
464 if (latin1_char != NULL) {
465 if (unicode != latin1_char) {
466 Py_INCREF(latin1_char);
467 Py_DECREF(unicode);
468 }
469 return latin1_char;
470 }
471 else {
472 assert(_PyUnicode_CheckConsistency(unicode, 1));
473 Py_INCREF(unicode);
474 unicode_latin1[ch] = unicode;
475 return unicode;
476 }
477 }
478 }
479
480 assert(_PyUnicode_CheckConsistency(unicode, 1));
481 return unicode;
482}
483
484static PyObject*
485unicode_result(PyObject *unicode)
486{
487 assert(_PyUnicode_CHECK(unicode));
488 if (PyUnicode_IS_READY(unicode))
489 return unicode_result_ready(unicode);
490 else
491 return unicode_result_wchar(unicode);
492}
493
Victor Stinnerc4b49542011-12-11 22:44:26 +0100494static PyObject*
495unicode_result_unchanged(PyObject *unicode)
496{
497 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500498 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100499 return NULL;
500 Py_INCREF(unicode);
501 return unicode;
502 }
503 else
504 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100505 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100506}
507
/* Only present in builds that define HAVE_MBCS. */
#if defined(HAVE_MBCS)
static OSVERSIONINFOEX winver;
#endif
511
Thomas Wouters477c8d52006-05-27 19:21:47 +0000512/* --- Bloom Filters ----------------------------------------------------- */
513
514/* stuff to implement simple "bloom filters" for Unicode characters.
515 to keep things simple, we use a single bitmask, using the least 5
516 bits from each unicode characters as the bit index. */
517
518/* the linebreak mask is set up by Unicode_Init below */
519
Antoine Pitrouf068f942010-01-13 14:19:12 +0000520#if LONG_BIT >= 128
521#define BLOOM_WIDTH 128
522#elif LONG_BIT >= 64
523#define BLOOM_WIDTH 64
524#elif LONG_BIT >= 32
525#define BLOOM_WIDTH 32
526#else
527#error "LONG_BIT is smaller than 32"
528#endif
529
/* Integer type holding one bloom bitmask. */
#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD sets the bit selected by the low bits of ch in mask (mask must
   be an lvalue); BLOOM yields a non-zero value when that bit is set in
   mask.  Both arguments are fully parenthesized so that arbitrary
   expressions, e.g. "a | b", can be passed for mask. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Line break test: a table lookup for characters below 128; for all other
   characters the bloom mask is consulted first and Py_UNICODE_ISLINEBREAK()
   is only evaluated when the mask matches. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000540
Alexander Belopolsky40018472011-02-26 01:02:56 +0000541Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200542make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000543{
544 /* calculate simple bloom-style bitmask for a given unicode string */
545
Antoine Pitrouf068f942010-01-13 14:19:12 +0000546 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000547 Py_ssize_t i;
548
549 mask = 0;
550 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200551 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000552
553 return mask;
554}
555
/* True when chr occurs in str.  The bloom mask of str is tested first, so
   that PyUnicode_FindChar() is only called when the mask matches. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr) != 0                                              \
     && !(PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) < 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000559
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200560/* Compilation of templated routines */
561
562#include "stringlib/asciilib.h"
563#include "stringlib/fastsearch.h"
564#include "stringlib/partition.h"
565#include "stringlib/split.h"
566#include "stringlib/count.h"
567#include "stringlib/find.h"
568#include "stringlib/find_max_char.h"
569#include "stringlib/localeutil.h"
570#include "stringlib/undef.h"
571
572#include "stringlib/ucs1lib.h"
573#include "stringlib/fastsearch.h"
574#include "stringlib/partition.h"
575#include "stringlib/split.h"
576#include "stringlib/count.h"
577#include "stringlib/find.h"
578#include "stringlib/find_max_char.h"
579#include "stringlib/localeutil.h"
580#include "stringlib/undef.h"
581
582#include "stringlib/ucs2lib.h"
583#include "stringlib/fastsearch.h"
584#include "stringlib/partition.h"
585#include "stringlib/split.h"
586#include "stringlib/count.h"
587#include "stringlib/find.h"
588#include "stringlib/find_max_char.h"
589#include "stringlib/localeutil.h"
590#include "stringlib/undef.h"
591
592#include "stringlib/ucs4lib.h"
593#include "stringlib/fastsearch.h"
594#include "stringlib/partition.h"
595#include "stringlib/split.h"
596#include "stringlib/count.h"
597#include "stringlib/find.h"
598#include "stringlib/find_max_char.h"
599#include "stringlib/localeutil.h"
600#include "stringlib/undef.h"
601
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200602#include "stringlib/unicodedefs.h"
603#include "stringlib/fastsearch.h"
604#include "stringlib/count.h"
605#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100606#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200607
Guido van Rossumd57fd912000-03-10 22:53:23 +0000608/* --- Unicode Object ----------------------------------------------------- */
609
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200610static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200611fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200612
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200613Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
614 Py_ssize_t size, Py_UCS4 ch,
615 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200616{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200617 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
618
619 switch (kind) {
620 case PyUnicode_1BYTE_KIND:
621 {
622 Py_UCS1 ch1 = (Py_UCS1) ch;
623 if (ch1 == ch)
624 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
625 else
626 return -1;
627 }
628 case PyUnicode_2BYTE_KIND:
629 {
630 Py_UCS2 ch2 = (Py_UCS2) ch;
631 if (ch2 == ch)
632 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
633 else
634 return -1;
635 }
636 case PyUnicode_4BYTE_KIND:
637 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
638 default:
639 assert(0);
640 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200641 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200642}
643
Victor Stinnerfe226c02011-10-03 03:52:20 +0200644static PyObject*
645resize_compact(PyObject *unicode, Py_ssize_t length)
646{
647 Py_ssize_t char_size;
648 Py_ssize_t struct_size;
649 Py_ssize_t new_size;
650 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100651 PyObject *new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200652 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100653 assert(PyUnicode_IS_COMPACT(unicode));
654
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200655 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100656 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200657 struct_size = sizeof(PyASCIIObject);
658 else
659 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200660 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200661
Victor Stinnerfe226c02011-10-03 03:52:20 +0200662 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
663 PyErr_NoMemory();
664 return NULL;
665 }
666 new_size = (struct_size + (length + 1) * char_size);
667
Victor Stinner84def372011-12-11 20:04:56 +0100668 _Py_DEC_REFTOTAL;
669 _Py_ForgetReference(unicode);
670
671 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
672 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100673 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200674 PyErr_NoMemory();
675 return NULL;
676 }
Victor Stinner84def372011-12-11 20:04:56 +0100677 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200678 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100679
Victor Stinnerfe226c02011-10-03 03:52:20 +0200680 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200681 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200682 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100683 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200684 _PyUnicode_WSTR_LENGTH(unicode) = length;
685 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200686 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
687 length, 0);
688 return unicode;
689}
690
Alexander Belopolsky40018472011-02-26 01:02:56 +0000691static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200692resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000693{
Victor Stinner95663112011-10-04 01:03:50 +0200694 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100695 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200696 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200697 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000698
Victor Stinnerfe226c02011-10-03 03:52:20 +0200699 if (PyUnicode_IS_READY(unicode)) {
700 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200701 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200702 void *data;
703
704 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200705 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200706 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
707 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200708
709 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
710 PyErr_NoMemory();
711 return -1;
712 }
713 new_size = (length + 1) * char_size;
714
Victor Stinner7a9105a2011-12-12 00:13:42 +0100715 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
716 {
717 PyObject_DEL(_PyUnicode_UTF8(unicode));
718 _PyUnicode_UTF8(unicode) = NULL;
719 _PyUnicode_UTF8_LENGTH(unicode) = 0;
720 }
721
Victor Stinnerfe226c02011-10-03 03:52:20 +0200722 data = (PyObject *)PyObject_REALLOC(data, new_size);
723 if (data == NULL) {
724 PyErr_NoMemory();
725 return -1;
726 }
727 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200728 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200729 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200730 _PyUnicode_WSTR_LENGTH(unicode) = length;
731 }
732 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200733 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200734 _PyUnicode_UTF8_LENGTH(unicode) = length;
735 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200736 _PyUnicode_LENGTH(unicode) = length;
737 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200738 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200739 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200740 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200741 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200742 }
Victor Stinner95663112011-10-04 01:03:50 +0200743 assert(_PyUnicode_WSTR(unicode) != NULL);
744
745 /* check for integer overflow */
746 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
747 PyErr_NoMemory();
748 return -1;
749 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100750 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200751 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100752 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200753 if (!wstr) {
754 PyErr_NoMemory();
755 return -1;
756 }
757 _PyUnicode_WSTR(unicode) = wstr;
758 _PyUnicode_WSTR(unicode)[length] = 0;
759 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200760 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000761 return 0;
762}
763
Victor Stinnerfe226c02011-10-03 03:52:20 +0200764static PyObject*
765resize_copy(PyObject *unicode, Py_ssize_t length)
766{
767 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100768 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200769 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100770
Benjamin Petersonbac79492012-01-14 13:34:47 -0500771 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100772 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200773
774 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
775 if (copy == NULL)
776 return NULL;
777
778 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200779 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200780 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200781 }
782 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200783 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100784
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200785 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200786 if (w == NULL)
787 return NULL;
788 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
789 copy_length = Py_MIN(copy_length, length);
790 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
791 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200792 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200793 }
794}
795
/* We allocate one extra character (not just one byte) to make sure the
   string is U+0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
804
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200805#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200806static int unicode_old_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200807#endif
808
Alexander Belopolsky40018472011-02-26 01:02:56 +0000809static PyUnicodeObject *
810_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000811{
812 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200813 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000814
Thomas Wouters477c8d52006-05-27 19:21:47 +0000815 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000816 if (length == 0 && unicode_empty != NULL) {
817 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200818 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000819 }
820
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000821 /* Ensure we won't overflow the size. */
822 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
823 return (PyUnicodeObject *)PyErr_NoMemory();
824 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200825 if (length < 0) {
826 PyErr_SetString(PyExc_SystemError,
827 "Negative size passed to _PyUnicode_New");
828 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000829 }
830
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200831#ifdef Py_DEBUG
832 ++unicode_old_new_calls;
833#endif
834
835 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
836 if (unicode == NULL)
837 return NULL;
838 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
839 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
840 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100841 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000842 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100843 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000844 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200845
Jeremy Hyltond8082792003-09-16 19:41:39 +0000846 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000847 * the caller fails before initializing str -- unicode_resize()
848 * reads str[0], and the Keep-Alive optimization can keep memory
849 * allocated for str alive across a call to unicode_dealloc(unicode).
850 * We don't want unicode_resize to read uninitialized memory in
851 * that case.
852 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200853 _PyUnicode_WSTR(unicode)[0] = 0;
854 _PyUnicode_WSTR(unicode)[length] = 0;
855 _PyUnicode_WSTR_LENGTH(unicode) = length;
856 _PyUnicode_HASH(unicode) = -1;
857 _PyUnicode_STATE(unicode).interned = 0;
858 _PyUnicode_STATE(unicode).kind = 0;
859 _PyUnicode_STATE(unicode).compact = 0;
860 _PyUnicode_STATE(unicode).ready = 0;
861 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200862 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200863 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200864 _PyUnicode_UTF8(unicode) = NULL;
865 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100866 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000867 return unicode;
868}
869
Victor Stinnerf42dc442011-10-02 23:33:16 +0200870static const char*
871unicode_kind_name(PyObject *unicode)
872{
Victor Stinner42dfd712011-10-03 14:41:45 +0200873 /* don't check consistency: unicode_kind_name() is called from
874 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200875 if (!PyUnicode_IS_COMPACT(unicode))
876 {
877 if (!PyUnicode_IS_READY(unicode))
878 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600879 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200880 {
881 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200882 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200883 return "legacy ascii";
884 else
885 return "legacy latin1";
886 case PyUnicode_2BYTE_KIND:
887 return "legacy UCS2";
888 case PyUnicode_4BYTE_KIND:
889 return "legacy UCS4";
890 default:
891 return "<legacy invalid kind>";
892 }
893 }
894 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600895 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200896 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200897 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200898 return "ascii";
899 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200900 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200901 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200902 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200903 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200904 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200905 default:
906 return "<invalid compact kind>";
907 }
908}
909
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200910#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200911static int unicode_new_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200912
913/* Functions wrapping macros for use in debugger */
914char *_PyUnicode_utf8(void *unicode){
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200915 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200916}
917
918void *_PyUnicode_compact_data(void *unicode) {
919 return _PyUnicode_COMPACT_DATA(unicode);
920}
921void *_PyUnicode_data(void *unicode){
922 printf("obj %p\n", unicode);
923 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
924 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
925 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
926 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
927 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
928 return PyUnicode_DATA(unicode);
929}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200930
931void
932_PyUnicode_Dump(PyObject *op)
933{
934 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200935 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
936 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
937 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200938
Victor Stinnera849a4b2011-10-03 12:12:11 +0200939 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200940 {
941 if (ascii->state.ascii)
942 data = (ascii + 1);
943 else
944 data = (compact + 1);
945 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200946 else
947 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +0200948 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
949
Victor Stinnera849a4b2011-10-03 12:12:11 +0200950 if (ascii->wstr == data)
951 printf("shared ");
952 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +0200953
Victor Stinnera3b334d2011-10-03 13:53:37 +0200954 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200955 printf(" (%zu), ", compact->wstr_length);
956 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
957 printf("shared ");
958 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200959 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200960 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200961}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200962#endif
963
964PyObject *
965PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
966{
967 PyObject *obj;
968 PyCompactUnicodeObject *unicode;
969 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +0200970 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200971 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200972 Py_ssize_t char_size;
973 Py_ssize_t struct_size;
974
975 /* Optimization for empty strings */
976 if (size == 0 && unicode_empty != NULL) {
977 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200978 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200979 }
980
981#ifdef Py_DEBUG
982 ++unicode_new_new_calls;
983#endif
984
Victor Stinner9e9d6892011-10-04 01:02:02 +0200985 is_ascii = 0;
986 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200987 struct_size = sizeof(PyCompactUnicodeObject);
988 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +0200989 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200990 char_size = 1;
991 is_ascii = 1;
992 struct_size = sizeof(PyASCIIObject);
993 }
994 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +0200995 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200996 char_size = 1;
997 }
998 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +0200999 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001000 char_size = 2;
1001 if (sizeof(wchar_t) == 2)
1002 is_sharing = 1;
1003 }
1004 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001005 if (maxchar > MAX_UNICODE) {
1006 PyErr_SetString(PyExc_SystemError,
1007 "invalid maximum character passed to PyUnicode_New");
1008 return NULL;
1009 }
Victor Stinner8f825062012-04-27 13:55:39 +02001010 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001011 char_size = 4;
1012 if (sizeof(wchar_t) == 4)
1013 is_sharing = 1;
1014 }
1015
1016 /* Ensure we won't overflow the size. */
1017 if (size < 0) {
1018 PyErr_SetString(PyExc_SystemError,
1019 "Negative size passed to PyUnicode_New");
1020 return NULL;
1021 }
1022 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1023 return PyErr_NoMemory();
1024
1025 /* Duplicated allocation code from _PyObject_New() instead of a call to
1026 * PyObject_New() so we are able to allocate space for the object and
1027 * it's data buffer.
1028 */
1029 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1030 if (obj == NULL)
1031 return PyErr_NoMemory();
1032 obj = PyObject_INIT(obj, &PyUnicode_Type);
1033 if (obj == NULL)
1034 return NULL;
1035
1036 unicode = (PyCompactUnicodeObject *)obj;
1037 if (is_ascii)
1038 data = ((PyASCIIObject*)obj) + 1;
1039 else
1040 data = unicode + 1;
1041 _PyUnicode_LENGTH(unicode) = size;
1042 _PyUnicode_HASH(unicode) = -1;
1043 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001044 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001045 _PyUnicode_STATE(unicode).compact = 1;
1046 _PyUnicode_STATE(unicode).ready = 1;
1047 _PyUnicode_STATE(unicode).ascii = is_ascii;
1048 if (is_ascii) {
1049 ((char*)data)[size] = 0;
1050 _PyUnicode_WSTR(unicode) = NULL;
1051 }
Victor Stinner8f825062012-04-27 13:55:39 +02001052 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001053 ((char*)data)[size] = 0;
1054 _PyUnicode_WSTR(unicode) = NULL;
1055 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001056 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001057 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001058 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001059 else {
1060 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001061 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001062 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001063 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001064 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001065 ((Py_UCS4*)data)[size] = 0;
1066 if (is_sharing) {
1067 _PyUnicode_WSTR_LENGTH(unicode) = size;
1068 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1069 }
1070 else {
1071 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1072 _PyUnicode_WSTR(unicode) = NULL;
1073 }
1074 }
Victor Stinner8f825062012-04-27 13:55:39 +02001075#ifdef Py_DEBUG
1076 /* Fill the data with invalid characters to detect bugs earlier.
1077 _PyUnicode_CheckConsistency(str, 1) detects invalid characters,
1078 at least for ASCII and UCS-4 strings. U+00FF is invalid in ASCII
1079 and U+FFFFFFFF is an invalid character in Unicode 6.0. */
1080 memset(data, 0xff, size * kind);
1081#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001082 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001083 return obj;
1084}
1085
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) to UCS4 code points stored
   in the 4-byte data of `unicode`.  A high surrogate immediately followed
   by a low surrogate is joined into a single code point; any other unit is
   copied unchanged.  The other conversions are implemented as macros for
   efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        /* There must still be room for at least one more code point. */
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src+1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
            continue;
        }
        *dst++ = *src;
        src++;
    }
    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1125
Victor Stinnercd9950f2011-10-02 00:34:53 +02001126static int
Victor Stinner488fa492011-12-12 00:01:39 +01001127unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001128{
Victor Stinner488fa492011-12-12 00:01:39 +01001129 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001130 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001131 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001132 return -1;
1133 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001134 return 0;
1135}
1136
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001137static int
1138_copy_characters(PyObject *to, Py_ssize_t to_start,
1139 PyObject *from, Py_ssize_t from_start,
1140 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001141{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001142 unsigned int from_kind, to_kind;
1143 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001144 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001145
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001146 assert(PyUnicode_Check(from));
1147 assert(PyUnicode_Check(to));
1148 assert(PyUnicode_IS_READY(from));
1149 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001150
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001151 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1152 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1153 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001154
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001155 if (how_many == 0)
1156 return 0;
1157
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001158 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001159 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001160 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001161 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001162
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001163#ifdef Py_DEBUG
1164 if (!check_maxchar
1165 && (from_kind > to_kind
1166 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001167 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001168 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1169 Py_UCS4 ch;
1170 Py_ssize_t i;
1171 for (i=0; i < how_many; i++) {
1172 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1173 assert(ch <= to_maxchar);
1174 }
1175 }
1176#endif
1177 fast = (from_kind == to_kind);
1178 if (check_maxchar
1179 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1180 {
1181 /* deny latin1 => ascii */
1182 fast = 0;
1183 }
1184
1185 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001186 Py_MEMCPY((char*)to_data + to_kind * to_start,
1187 (char*)from_data + from_kind * from_start,
1188 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001189 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001190 else if (from_kind == PyUnicode_1BYTE_KIND
1191 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001192 {
1193 _PyUnicode_CONVERT_BYTES(
1194 Py_UCS1, Py_UCS2,
1195 PyUnicode_1BYTE_DATA(from) + from_start,
1196 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1197 PyUnicode_2BYTE_DATA(to) + to_start
1198 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001199 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001200 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001201 && to_kind == PyUnicode_4BYTE_KIND)
1202 {
1203 _PyUnicode_CONVERT_BYTES(
1204 Py_UCS1, Py_UCS4,
1205 PyUnicode_1BYTE_DATA(from) + from_start,
1206 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1207 PyUnicode_4BYTE_DATA(to) + to_start
1208 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001209 }
1210 else if (from_kind == PyUnicode_2BYTE_KIND
1211 && to_kind == PyUnicode_4BYTE_KIND)
1212 {
1213 _PyUnicode_CONVERT_BYTES(
1214 Py_UCS2, Py_UCS4,
1215 PyUnicode_2BYTE_DATA(from) + from_start,
1216 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1217 PyUnicode_4BYTE_DATA(to) + to_start
1218 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001219 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001220 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001221 /* check if max_char(from substring) <= max_char(to) */
1222 if (from_kind > to_kind
1223 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001224 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001225 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001226 /* slow path to check for character overflow */
1227 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001228 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001229 Py_ssize_t i;
1230
Victor Stinner56c161a2011-10-06 02:47:11 +02001231#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001232 for (i=0; i < how_many; i++) {
1233 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001234 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001235 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1236 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001237#else
1238 if (!check_maxchar) {
1239 for (i=0; i < how_many; i++) {
1240 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1241 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1242 }
1243 }
1244 else {
1245 for (i=0; i < how_many; i++) {
1246 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1247 if (ch > to_maxchar)
1248 return 1;
1249 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1250 }
1251 }
1252#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001253 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001254 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001255 assert(0 && "inconsistent state");
1256 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001257 }
1258 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001259 return 0;
1260}
1261
1262static void
1263copy_characters(PyObject *to, Py_ssize_t to_start,
1264 PyObject *from, Py_ssize_t from_start,
1265 Py_ssize_t how_many)
1266{
1267 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1268}
1269
1270Py_ssize_t
1271PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1272 PyObject *from, Py_ssize_t from_start,
1273 Py_ssize_t how_many)
1274{
1275 int err;
1276
1277 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1278 PyErr_BadInternalCall();
1279 return -1;
1280 }
1281
Benjamin Petersonbac79492012-01-14 13:34:47 -05001282 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001283 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001284 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001285 return -1;
1286
1287 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1288 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1289 PyErr_Format(PyExc_SystemError,
1290 "Cannot write %zi characters at %zi "
1291 "in a string of %zi characters",
1292 how_many, to_start, PyUnicode_GET_LENGTH(to));
1293 return -1;
1294 }
1295
1296 if (how_many == 0)
1297 return 0;
1298
Victor Stinner488fa492011-12-12 00:01:39 +01001299 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001300 return -1;
1301
1302 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1303 if (err) {
1304 PyErr_Format(PyExc_SystemError,
1305 "Cannot copy %s characters "
1306 "into a string of %s characters",
1307 unicode_kind_name(from),
1308 unicode_kind_name(to));
1309 return -1;
1310 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001311 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001312}
1313
Victor Stinner17222162011-09-28 22:15:37 +02001314/* Find the maximum code point and count the number of surrogate pairs so a
1315 correct string length can be computed before converting a string to UCS4.
1316 This function counts single surrogates as a character and not as a pair.
1317
1318 Return 0 on success, or -1 on error. */
1319static int
1320find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1321 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001322{
1323 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001324 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001325
Victor Stinnerc53be962011-10-02 21:33:54 +02001326 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001327 *num_surrogates = 0;
1328 *maxchar = 0;
1329
1330 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001331#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001332 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1333 && (iter+1) < end
1334 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001335 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001336 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001337 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001338 iter += 2;
1339 }
1340 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001341#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001342 {
1343 ch = *iter;
1344 iter++;
1345 }
1346 if (ch > *maxchar) {
1347 *maxchar = ch;
1348 if (*maxchar > MAX_UNICODE) {
1349 PyErr_Format(PyExc_ValueError,
1350 "character U+%x is not in range [U+0000; U+10ffff]",
1351 ch);
1352 return -1;
1353 }
1354 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001355 }
1356 return 0;
1357}
1358
#ifdef Py_DEBUG
/* Debug-build counter, incremented on each call to _PyUnicode_Ready(). */
static int unicode_ready_calls = 0;
#endif
1362
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001363int
1364_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001365{
1366 wchar_t *end;
1367 Py_UCS4 maxchar = 0;
1368 Py_ssize_t num_surrogates;
1369#if SIZEOF_WCHAR_T == 2
1370 Py_ssize_t length_wo_surrogates;
1371#endif
1372
Georg Brandl7597add2011-10-05 16:36:47 +02001373 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001374 strings were created using _PyObject_New() and where no canonical
1375 representation (the str field) has been set yet aka strings
1376 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001377 assert(_PyUnicode_CHECK(unicode));
1378 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001379 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001380 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001381 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001382 /* Actually, it should neither be interned nor be anything else: */
1383 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001384
1385#ifdef Py_DEBUG
1386 ++unicode_ready_calls;
1387#endif
1388
1389 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001390 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001391 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001392 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001393
1394 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001395 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1396 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001397 PyErr_NoMemory();
1398 return -1;
1399 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001400 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001401 _PyUnicode_WSTR(unicode), end,
1402 PyUnicode_1BYTE_DATA(unicode));
1403 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1404 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1405 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1406 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001407 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001408 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001409 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001410 }
1411 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001412 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001413 _PyUnicode_UTF8(unicode) = NULL;
1414 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001415 }
1416 PyObject_FREE(_PyUnicode_WSTR(unicode));
1417 _PyUnicode_WSTR(unicode) = NULL;
1418 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1419 }
1420 /* In this case we might have to convert down from 4-byte native
1421 wchar_t to 2-byte unicode. */
1422 else if (maxchar < 65536) {
1423 assert(num_surrogates == 0 &&
1424 "FindMaxCharAndNumSurrogatePairs() messed up");
1425
Victor Stinner506f5922011-09-28 22:34:18 +02001426#if SIZEOF_WCHAR_T == 2
1427 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001428 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001429 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1430 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1431 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001432 _PyUnicode_UTF8(unicode) = NULL;
1433 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001434#else
1435 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001436 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001437 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001438 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001439 PyErr_NoMemory();
1440 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001441 }
Victor Stinner506f5922011-09-28 22:34:18 +02001442 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1443 _PyUnicode_WSTR(unicode), end,
1444 PyUnicode_2BYTE_DATA(unicode));
1445 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1446 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1447 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001448 _PyUnicode_UTF8(unicode) = NULL;
1449 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001450 PyObject_FREE(_PyUnicode_WSTR(unicode));
1451 _PyUnicode_WSTR(unicode) = NULL;
1452 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1453#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001454 }
1455 /* maxchar exceeds 16 bits, we need 4 bytes for unicode characters */
1456 else {
1457#if SIZEOF_WCHAR_T == 2
1458 /* in case the native representation is 2-bytes, we need to allocate a
1459 new normalized 4-byte version. */
1460 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001461 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1462 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001463 PyErr_NoMemory();
1464 return -1;
1465 }
1466 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1467 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001468 _PyUnicode_UTF8(unicode) = NULL;
1469 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001470 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1471 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001472 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001473 PyObject_FREE(_PyUnicode_WSTR(unicode));
1474 _PyUnicode_WSTR(unicode) = NULL;
1475 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1476#else
1477 assert(num_surrogates == 0);
1478
Victor Stinnerc3c74152011-10-02 20:39:55 +02001479 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001480 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001481 _PyUnicode_UTF8(unicode) = NULL;
1482 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001483 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1484#endif
1485 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1486 }
1487 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001488 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001489 return 0;
1490}
1491
Alexander Belopolsky40018472011-02-26 01:02:56 +00001492static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001493unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001494{
Walter Dörwald16807132007-05-25 13:52:07 +00001495 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001496 case SSTATE_NOT_INTERNED:
1497 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001498
Benjamin Peterson29060642009-01-31 22:14:21 +00001499 case SSTATE_INTERNED_MORTAL:
1500 /* revive dead object temporarily for DelItem */
1501 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001502 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001503 Py_FatalError(
1504 "deletion of interned string failed");
1505 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001506
Benjamin Peterson29060642009-01-31 22:14:21 +00001507 case SSTATE_INTERNED_IMMORTAL:
1508 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001509
Benjamin Peterson29060642009-01-31 22:14:21 +00001510 default:
1511 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001512 }
1513
Victor Stinner03490912011-10-03 23:45:12 +02001514 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001515 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001516 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001517 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001518 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1519 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001520
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001521 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001522}
1523
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if `unicode` is one of the shared objects
   kept by this file (the empty string, or the cached one-character string
   for a code point below 256), and 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *header = (PyASCIIObject *)unicode;
    Py_UCS4 first;

    if (unicode == unicode_empty)
        return 1;

    /* Only a non-wchar string of length one can be a cached character. */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1)
        return 0;

    first = PyUnicode_READ_CHAR(unicode, 0);
    if (first < 256 && unicode_latin1[first] == unicode)
        return 1;
    return 0;
}
#endif
1540
Alexander Belopolsky40018472011-02-26 01:02:56 +00001541static int
Victor Stinner488fa492011-12-12 00:01:39 +01001542unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001543{
Victor Stinner488fa492011-12-12 00:01:39 +01001544 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001545 if (Py_REFCNT(unicode) != 1)
1546 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001547 if (_PyUnicode_HASH(unicode) != -1)
1548 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001549 if (PyUnicode_CHECK_INTERNED(unicode))
1550 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001551 if (!PyUnicode_CheckExact(unicode))
1552 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001553#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001554 /* singleton refcount is greater than 1 */
1555 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001556#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001557 return 1;
1558}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001559
Victor Stinnerfe226c02011-10-03 03:52:20 +02001560static int
1561unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1562{
1563 PyObject *unicode;
1564 Py_ssize_t old_length;
1565
1566 assert(p_unicode != NULL);
1567 unicode = *p_unicode;
1568
1569 assert(unicode != NULL);
1570 assert(PyUnicode_Check(unicode));
1571 assert(0 <= length);
1572
Victor Stinner910337b2011-10-03 03:20:16 +02001573 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001574 old_length = PyUnicode_WSTR_LENGTH(unicode);
1575 else
1576 old_length = PyUnicode_GET_LENGTH(unicode);
1577 if (old_length == length)
1578 return 0;
1579
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001580 if (length == 0) {
1581 Py_DECREF(*p_unicode);
1582 *p_unicode = unicode_empty;
1583 Py_INCREF(*p_unicode);
1584 return 0;
1585 }
1586
Victor Stinner488fa492011-12-12 00:01:39 +01001587 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001588 PyObject *copy = resize_copy(unicode, length);
1589 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001590 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001591 Py_DECREF(*p_unicode);
1592 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001593 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001594 }
1595
Victor Stinnerfe226c02011-10-03 03:52:20 +02001596 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001597 PyObject *new_unicode = resize_compact(unicode, length);
1598 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001599 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001600 *p_unicode = new_unicode;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001601 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001602 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001603 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001604 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001605}
1606
Alexander Belopolsky40018472011-02-26 01:02:56 +00001607int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001608PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001609{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001610 PyObject *unicode;
1611 if (p_unicode == NULL) {
1612 PyErr_BadInternalCall();
1613 return -1;
1614 }
1615 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001616 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001617 {
1618 PyErr_BadInternalCall();
1619 return -1;
1620 }
1621 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001622}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001623
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001624static int
Victor Stinner0a045ef2011-11-09 00:02:42 +01001625unicode_widen(PyObject **p_unicode, unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001626{
1627 PyObject *result;
1628 assert(PyUnicode_IS_READY(*p_unicode));
1629 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1630 return 0;
1631 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1632 maxchar);
1633 if (result == NULL)
1634 return -1;
1635 PyUnicode_CopyCharacters(result, 0, *p_unicode, 0,
1636 PyUnicode_GET_LENGTH(*p_unicode));
1637 Py_DECREF(*p_unicode);
1638 *p_unicode = result;
1639 return 0;
1640}
1641
1642static int
1643unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1644 Py_UCS4 ch)
1645{
Victor Stinner15e9ed22012-02-22 13:36:20 +01001646 assert(ch <= MAX_UNICODE);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001647 if (unicode_widen(p_unicode, ch) < 0)
1648 return -1;
1649 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1650 PyUnicode_DATA(*p_unicode),
1651 (*pos)++, ch);
1652 return 0;
1653}
1654
Victor Stinnerc5166102012-02-22 13:55:02 +01001655/* Copy a ASCII or latin1 char* string into a Python Unicode string.
1656 Return the length of the input string.
1657
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001658 WARNING: The function doesn't copy the terminating null character and
1659 doesn't check the maximum character (may write a latin1 character in an
1660 ASCII string). */
Victor Stinnerc5166102012-02-22 13:55:02 +01001661static Py_ssize_t
1662unicode_write_cstr(PyObject *unicode, Py_ssize_t index, const char *str)
1663{
1664 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1665 void *data = PyUnicode_DATA(unicode);
1666
1667 switch (kind) {
1668 case PyUnicode_1BYTE_KIND: {
1669 Py_ssize_t len = strlen(str);
1670 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001671 memcpy((char *) data + index, str, len);
Victor Stinnerc5166102012-02-22 13:55:02 +01001672 return len;
1673 }
1674 case PyUnicode_2BYTE_KIND: {
1675 Py_UCS2 *start = (Py_UCS2 *)data + index;
1676 Py_UCS2 *ucs2 = start;
1677 assert(index <= PyUnicode_GET_LENGTH(unicode));
1678
1679 for (; *str; ++ucs2, ++str)
1680 *ucs2 = (Py_UCS2)*str;
1681
1682 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
1683 return ucs2 - start;
1684 }
1685 default: {
1686 Py_UCS4 *start = (Py_UCS4 *)data + index;
1687 Py_UCS4 *ucs4 = start;
1688 assert(kind == PyUnicode_4BYTE_KIND);
1689 assert(index <= PyUnicode_GET_LENGTH(unicode));
1690
1691 for (; *str; ++ucs4, ++str)
1692 *ucs4 = (Py_UCS4)*str;
1693
1694 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
1695 return ucs4 - start;
1696 }
1697 }
1698}
1699
1700
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001701static PyObject*
1702get_latin1_char(unsigned char ch)
1703{
Victor Stinnera464fc12011-10-02 20:39:30 +02001704 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001705 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001706 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001707 if (!unicode)
1708 return NULL;
1709 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001710 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001711 unicode_latin1[ch] = unicode;
1712 }
1713 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001714 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001715}
1716
Alexander Belopolsky40018472011-02-26 01:02:56 +00001717PyObject *
1718PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001719{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001720 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001721 Py_UCS4 maxchar = 0;
1722 Py_ssize_t num_surrogates;
1723
1724 if (u == NULL)
1725 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001726
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001727 /* If the Unicode data is known at construction time, we can apply
1728 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001729
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001730 /* Optimization for empty strings */
1731 if (size == 0 && unicode_empty != NULL) {
1732 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001733 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001734 }
Tim Petersced69f82003-09-16 20:30:58 +00001735
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001736 /* Single character Unicode objects in the Latin-1 range are
1737 shared when using this constructor */
1738 if (size == 1 && *u < 256)
1739 return get_latin1_char((unsigned char)*u);
1740
1741 /* If not empty and not single character, copy the Unicode data
1742 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001743 if (find_maxchar_surrogates(u, u + size,
1744 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001745 return NULL;
1746
Victor Stinner8faf8212011-12-08 22:14:11 +01001747 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001748 if (!unicode)
1749 return NULL;
1750
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001751 switch (PyUnicode_KIND(unicode)) {
1752 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001753 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001754 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1755 break;
1756 case PyUnicode_2BYTE_KIND:
1757#if Py_UNICODE_SIZE == 2
1758 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1759#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001760 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001761 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1762#endif
1763 break;
1764 case PyUnicode_4BYTE_KIND:
1765#if SIZEOF_WCHAR_T == 2
1766 /* This is the only case which has to process surrogates, thus
1767 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001768 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001769#else
1770 assert(num_surrogates == 0);
1771 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1772#endif
1773 break;
1774 default:
1775 assert(0 && "Impossible state");
1776 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001777
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001778 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001779}
1780
Alexander Belopolsky40018472011-02-26 01:02:56 +00001781PyObject *
1782PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001783{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001784 if (size < 0) {
1785 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001786 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001787 return NULL;
1788 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001789 if (u != NULL)
1790 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1791 else
1792 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001793}
1794
Alexander Belopolsky40018472011-02-26 01:02:56 +00001795PyObject *
1796PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001797{
1798 size_t size = strlen(u);
1799 if (size > PY_SSIZE_T_MAX) {
1800 PyErr_SetString(PyExc_OverflowError, "input too long");
1801 return NULL;
1802 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001803 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001804}
1805
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001806PyObject *
1807_PyUnicode_FromId(_Py_Identifier *id)
1808{
1809 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001810 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1811 strlen(id->string),
1812 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001813 if (!id->object)
1814 return NULL;
1815 PyUnicode_InternInPlace(&id->object);
1816 assert(!id->next);
1817 id->next = static_strings;
1818 static_strings = id;
1819 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001820 return id->object;
1821}
1822
1823void
1824_PyUnicode_ClearStaticStrings()
1825{
1826 _Py_Identifier *i;
1827 for (i = static_strings; i; i = i->next) {
1828 Py_DECREF(i->object);
1829 i->object = NULL;
1830 i->next = NULL;
1831 }
1832}
1833
Benjamin Peterson0df54292012-03-26 14:50:32 -04001834/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001835
Victor Stinnere57b1c02011-09-28 22:20:48 +02001836static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001837unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001838{
Victor Stinner785938e2011-12-11 20:09:03 +01001839 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001840 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001841#ifdef Py_DEBUG
Victor Stinnere6b2d442011-12-11 21:54:30 +01001842 assert(s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001843#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001844 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001845 }
Victor Stinner785938e2011-12-11 20:09:03 +01001846 unicode = PyUnicode_New(size, 127);
1847 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001848 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001849 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1850 assert(_PyUnicode_CheckConsistency(unicode, 1));
1851 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001852}
1853
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001854static Py_UCS4
1855kind_maxchar_limit(unsigned int kind)
1856{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001857 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001858 case PyUnicode_1BYTE_KIND:
1859 return 0x80;
1860 case PyUnicode_2BYTE_KIND:
1861 return 0x100;
1862 case PyUnicode_4BYTE_KIND:
1863 return 0x10000;
1864 default:
1865 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001866 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001867 }
1868}
1869
Victor Stinner702c7342011-10-05 13:50:52 +02001870static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001871_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001872{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001873 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001874 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001875
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001876 if (size == 0) {
1877 Py_INCREF(unicode_empty);
1878 return unicode_empty;
1879 }
1880 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001881 if (size == 1)
1882 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001883
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001884 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001885 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001886 if (!res)
1887 return NULL;
1888 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001889 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001890 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001891}
1892
Victor Stinnere57b1c02011-09-28 22:20:48 +02001893static PyObject*
1894_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001895{
1896 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001897 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001898
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001899 if (size == 0) {
1900 Py_INCREF(unicode_empty);
1901 return unicode_empty;
1902 }
1903 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001904 if (size == 1 && u[0] < 256)
Victor Stinner4e101002011-10-11 23:27:52 +02001905 return get_latin1_char((unsigned char)u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001906
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001907 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001908 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001909 if (!res)
1910 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001911 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001912 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001913 else {
1914 _PyUnicode_CONVERT_BYTES(
1915 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1916 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001917 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001918 return res;
1919}
1920
Victor Stinnere57b1c02011-09-28 22:20:48 +02001921static PyObject*
1922_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001923{
1924 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001925 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001926
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001927 if (size == 0) {
1928 Py_INCREF(unicode_empty);
1929 return unicode_empty;
1930 }
1931 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001932 if (size == 1 && u[0] < 256)
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001933 return get_latin1_char((unsigned char)u[0]);
1934
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001935 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001936 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001937 if (!res)
1938 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001939 if (max_char < 256)
1940 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1941 PyUnicode_1BYTE_DATA(res));
1942 else if (max_char < 0x10000)
1943 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1944 PyUnicode_2BYTE_DATA(res));
1945 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001946 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001947 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001948 return res;
1949}
1950
1951PyObject*
1952PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1953{
Victor Stinnercfed46e2011-11-22 01:29:14 +01001954 if (size < 0) {
1955 PyErr_SetString(PyExc_ValueError, "size must be positive");
1956 return NULL;
1957 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06001958 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001959 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001960 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001961 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001962 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001963 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001964 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001965 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02001966 PyErr_SetString(PyExc_SystemError, "invalid kind");
1967 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001968 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001969}
1970
Victor Stinnerece58de2012-04-23 23:36:38 +02001971Py_UCS4
1972_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
1973{
1974 enum PyUnicode_Kind kind;
1975 void *startptr, *endptr;
1976
1977 assert(PyUnicode_IS_READY(unicode));
1978 assert(0 <= start);
1979 assert(end <= PyUnicode_GET_LENGTH(unicode));
1980 assert(start <= end);
1981
1982 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
1983 return PyUnicode_MAX_CHAR_VALUE(unicode);
1984
1985 if (start == end)
1986 return 127;
1987
Victor Stinner94d558b2012-04-27 22:26:58 +02001988 if (PyUnicode_IS_ASCII(unicode))
1989 return 127;
1990
Victor Stinnerece58de2012-04-23 23:36:38 +02001991 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04001992 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04001993 endptr = (char *)startptr + end * kind;
1994 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04001995 switch(kind) {
1996 case PyUnicode_1BYTE_KIND:
1997 return ucs1lib_find_max_char(startptr, endptr);
1998 case PyUnicode_2BYTE_KIND:
1999 return ucs2lib_find_max_char(startptr, endptr);
2000 case PyUnicode_4BYTE_KIND:
2001 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002002 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002003 assert(0);
2004 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002005 }
2006}
2007
Victor Stinner25a4b292011-10-06 12:31:55 +02002008/* Ensure that a string uses the most efficient storage, if it is not the
2009 case: create a new string with of the right kind. Write NULL into *p_unicode
2010 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002011static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002012unicode_adjust_maxchar(PyObject **p_unicode)
2013{
2014 PyObject *unicode, *copy;
2015 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002016 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002017 unsigned int kind;
2018
2019 assert(p_unicode != NULL);
2020 unicode = *p_unicode;
2021 assert(PyUnicode_IS_READY(unicode));
2022 if (PyUnicode_IS_ASCII(unicode))
2023 return;
2024
2025 len = PyUnicode_GET_LENGTH(unicode);
2026 kind = PyUnicode_KIND(unicode);
2027 if (kind == PyUnicode_1BYTE_KIND) {
2028 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002029 max_char = ucs1lib_find_max_char(u, u + len);
2030 if (max_char >= 128)
2031 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002032 }
2033 else if (kind == PyUnicode_2BYTE_KIND) {
2034 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002035 max_char = ucs2lib_find_max_char(u, u + len);
2036 if (max_char >= 256)
2037 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002038 }
2039 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002040 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002041 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002042 max_char = ucs4lib_find_max_char(u, u + len);
2043 if (max_char >= 0x10000)
2044 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002045 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002046 copy = PyUnicode_New(len, max_char);
2047 copy_characters(copy, 0, unicode, 0, len);
2048 Py_DECREF(unicode);
2049 *p_unicode = copy;
2050}
2051
Victor Stinner034f6cf2011-09-30 02:26:44 +02002052PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002053_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002054{
Victor Stinner87af4f22011-11-21 23:03:47 +01002055 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002056 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002057
Victor Stinner034f6cf2011-09-30 02:26:44 +02002058 if (!PyUnicode_Check(unicode)) {
2059 PyErr_BadInternalCall();
2060 return NULL;
2061 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002062 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002063 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002064
Victor Stinner87af4f22011-11-21 23:03:47 +01002065 length = PyUnicode_GET_LENGTH(unicode);
2066 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002067 if (!copy)
2068 return NULL;
2069 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2070
Victor Stinner87af4f22011-11-21 23:03:47 +01002071 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2072 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002073 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002074 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002075}
2076
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002077
Victor Stinnerbc603d12011-10-02 01:00:40 +02002078/* Widen Unicode objects to larger buffers. Don't write terminating null
2079 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002080
2081void*
2082_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2083{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002084 Py_ssize_t len;
2085 void *result;
2086 unsigned int skind;
2087
Benjamin Petersonbac79492012-01-14 13:34:47 -05002088 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002089 return NULL;
2090
2091 len = PyUnicode_GET_LENGTH(s);
2092 skind = PyUnicode_KIND(s);
2093 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002094 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002095 return NULL;
2096 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002097 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002098 case PyUnicode_2BYTE_KIND:
2099 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2100 if (!result)
2101 return PyErr_NoMemory();
2102 assert(skind == PyUnicode_1BYTE_KIND);
2103 _PyUnicode_CONVERT_BYTES(
2104 Py_UCS1, Py_UCS2,
2105 PyUnicode_1BYTE_DATA(s),
2106 PyUnicode_1BYTE_DATA(s) + len,
2107 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002108 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002109 case PyUnicode_4BYTE_KIND:
2110 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2111 if (!result)
2112 return PyErr_NoMemory();
2113 if (skind == PyUnicode_2BYTE_KIND) {
2114 _PyUnicode_CONVERT_BYTES(
2115 Py_UCS2, Py_UCS4,
2116 PyUnicode_2BYTE_DATA(s),
2117 PyUnicode_2BYTE_DATA(s) + len,
2118 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002119 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002120 else {
2121 assert(skind == PyUnicode_1BYTE_KIND);
2122 _PyUnicode_CONVERT_BYTES(
2123 Py_UCS1, Py_UCS4,
2124 PyUnicode_1BYTE_DATA(s),
2125 PyUnicode_1BYTE_DATA(s) + len,
2126 result);
2127 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002128 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002129 default:
2130 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002131 }
Victor Stinner01698042011-10-04 00:04:26 +02002132 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002133 return NULL;
2134}
2135
2136static Py_UCS4*
2137as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2138 int copy_null)
2139{
2140 int kind;
2141 void *data;
2142 Py_ssize_t len, targetlen;
2143 if (PyUnicode_READY(string) == -1)
2144 return NULL;
2145 kind = PyUnicode_KIND(string);
2146 data = PyUnicode_DATA(string);
2147 len = PyUnicode_GET_LENGTH(string);
2148 targetlen = len;
2149 if (copy_null)
2150 targetlen++;
2151 if (!target) {
2152 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2153 PyErr_NoMemory();
2154 return NULL;
2155 }
2156 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2157 if (!target) {
2158 PyErr_NoMemory();
2159 return NULL;
2160 }
2161 }
2162 else {
2163 if (targetsize < targetlen) {
2164 PyErr_Format(PyExc_SystemError,
2165 "string is longer than the buffer");
2166 if (copy_null && 0 < targetsize)
2167 target[0] = 0;
2168 return NULL;
2169 }
2170 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002171 if (kind == PyUnicode_1BYTE_KIND) {
2172 Py_UCS1 *start = (Py_UCS1 *) data;
2173 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002174 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002175 else if (kind == PyUnicode_2BYTE_KIND) {
2176 Py_UCS2 *start = (Py_UCS2 *) data;
2177 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2178 }
2179 else {
2180 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002181 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002182 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002183 if (copy_null)
2184 target[len] = 0;
2185 return target;
2186}
2187
2188Py_UCS4*
2189PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2190 int copy_null)
2191{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002192 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002193 PyErr_BadInternalCall();
2194 return NULL;
2195 }
2196 return as_ucs4(string, target, targetsize, copy_null);
2197}
2198
2199Py_UCS4*
2200PyUnicode_AsUCS4Copy(PyObject *string)
2201{
2202 return as_ucs4(string, NULL, 0, 1);
2203}
2204
2205#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002206
Alexander Belopolsky40018472011-02-26 01:02:56 +00002207PyObject *
2208PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002209{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002210 if (w == NULL) {
Victor Stinner382955f2011-12-11 21:44:00 +01002211 if (size == 0) {
2212 Py_INCREF(unicode_empty);
2213 return unicode_empty;
2214 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002215 PyErr_BadInternalCall();
2216 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002217 }
2218
Martin v. Löwis790465f2008-04-05 20:41:37 +00002219 if (size == -1) {
2220 size = wcslen(w);
2221 }
2222
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002223 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002224}
2225
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002226#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002227
Walter Dörwald346737f2007-05-31 10:44:43 +00002228static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002229makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2230 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002231{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002232 *fmt++ = '%';
2233 if (width) {
2234 if (zeropad)
2235 *fmt++ = '0';
2236 fmt += sprintf(fmt, "%d", width);
2237 }
2238 if (precision)
2239 fmt += sprintf(fmt, ".%d", precision);
2240 if (longflag)
2241 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002242 else if (longlongflag) {
2243 /* longlongflag should only ever be nonzero on machines with
2244 HAVE_LONG_LONG defined */
2245#ifdef HAVE_LONG_LONG
2246 char *f = PY_FORMAT_LONG_LONG;
2247 while (*f)
2248 *fmt++ = *f++;
2249#else
2250 /* we shouldn't ever get here */
2251 assert(0);
2252 *fmt++ = 'l';
2253#endif
2254 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002255 else if (size_tflag) {
2256 char *f = PY_FORMAT_SIZE_T;
2257 while (*f)
2258 *fmt++ = *f++;
2259 }
2260 *fmt++ = c;
2261 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002262}
2263
/* Helper for PyUnicode_FromFormatV().

   `f` points at the '%' that starts a conversion.  Parse the optional
   "width" / "width.precision" digits and the optional length modifier
   ("l", "ll" when HAVE_LONG_LONG is defined, or "z"), and return a pointer
   to the conversion character.  A length modifier is consumed only when it
   is directly followed by 'd', 'u' or 'i'; otherwise the returned pointer
   stays on the modifier itself.

   Each p_* argument may be NULL; non-NULL ones receive the parsed value
   (0 when the corresponding part is absent). */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    const char *cur = f + 1;            /* step over the '%' */
    int field_width = 0;
    int field_precision = 0;
    int is_long = 0;
    int is_longlong = 0;
    int is_size_t = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    for (; Py_ISDIGIT((unsigned)*cur); cur++)
        field_width = (field_width*10) + *cur - '0';
    if (*cur == '.') {
        cur++;
        for (; Py_ISDIGIT((unsigned)*cur); cur++)
            field_precision = (field_precision*10) + *cur - '0';
        if (*cur == '%') {
            /* "%.3%s" => cur points to "3" */
            cur--;
        }
    }
    if (*cur == '\0') {
        /* bogus format "%.1" => go backward, cur points to "1" */
        cur--;
    }

    /* Handle %ld, %lu, %lld, %llu and the size_t forms %zd, %zu. */
    switch (*cur) {
    case 'l':
        if (cur[1] == 'd' || cur[1] == 'u' || cur[1] == 'i') {
            is_long = 1;
            cur += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (cur[1] == 'l' &&
                 (cur[2] == 'd' || cur[2] == 'u' || cur[2] == 'i')) {
            is_longlong = 1;
            cur += 2;
        }
#endif
        break;
    case 'z':
        if (cur[1] == 'd' || cur[1] == 'u' || cur[1] == 'i') {
            is_size_t = 1;
            cur += 1;
        }
        break;
    default:
        break;
    }

    if (p_width != NULL)
        *p_width = field_width;
    if (p_precision != NULL)
        *p_precision = field_precision;
    if (p_longflag != NULL)
        *p_longflag = is_long;
    if (p_longlongflag != NULL)
        *p_longlongflag = is_longlong;
    if (p_size_tflag != NULL)
        *p_size_tflag = is_size_t;
    return cur;
}
2328
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002329/* maximum number of characters required for output of %ld. 21 characters
2330 allows for 64-bit integers (in decimal) and an optional sign. */
2331#define MAX_LONG_CHARS 21
2332/* maximum number of characters required for output of %lld.
2333 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2334 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2335#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2336
Walter Dörwaldd2034312007-05-18 16:29:38 +00002337PyObject *
2338PyUnicode_FromFormatV(const char *format, va_list vargs)
2339{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002340 va_list count;
2341 Py_ssize_t callcount = 0;
2342 PyObject **callresults = NULL;
2343 PyObject **callresult = NULL;
2344 Py_ssize_t n = 0;
2345 int width = 0;
2346 int precision = 0;
2347 int zeropad;
2348 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002349 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002350 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002351 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002352 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2353 Py_UCS4 argmaxchar;
2354 Py_ssize_t numbersize = 0;
2355 char *numberresults = NULL;
2356 char *numberresult = NULL;
2357 Py_ssize_t i;
2358 int kind;
2359 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002360
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002361 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002362 /* step 1: count the number of %S/%R/%A/%s format specifications
2363 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2364 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002365 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002366 * also estimate a upper bound for all the number formats in the string,
2367 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002368 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002369 for (f = format; *f; f++) {
2370 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002371 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002372 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2373 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2374 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2375 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002376
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002377 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002378#ifdef HAVE_LONG_LONG
2379 if (longlongflag) {
2380 if (width < MAX_LONG_LONG_CHARS)
2381 width = MAX_LONG_LONG_CHARS;
2382 }
2383 else
2384#endif
2385 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2386 including sign. Decimal takes the most space. This
2387 isn't enough for octal. If a width is specified we
2388 need more (which we allocate later). */
2389 if (width < MAX_LONG_CHARS)
2390 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002391
2392 /* account for the size + '\0' to separate numbers
2393 inside of the numberresults buffer */
2394 numbersize += (width + 1);
2395 }
2396 }
2397 else if ((unsigned char)*f > 127) {
2398 PyErr_Format(PyExc_ValueError,
2399 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2400 "string, got a non-ASCII byte: 0x%02x",
2401 (unsigned char)*f);
2402 return NULL;
2403 }
2404 }
2405 /* step 2: allocate memory for the results of
2406 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2407 if (callcount) {
2408 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2409 if (!callresults) {
2410 PyErr_NoMemory();
2411 return NULL;
2412 }
2413 callresult = callresults;
2414 }
2415 /* step 2.5: allocate memory for the results of formating numbers */
2416 if (numbersize) {
2417 numberresults = PyObject_Malloc(numbersize);
2418 if (!numberresults) {
2419 PyErr_NoMemory();
2420 goto fail;
2421 }
2422 numberresult = numberresults;
2423 }
2424
2425 /* step 3: format numbers and figure out how large a buffer we need */
2426 for (f = format; *f; f++) {
2427 if (*f == '%') {
2428 const char* p;
2429 int longflag;
2430 int longlongflag;
2431 int size_tflag;
2432 int numprinted;
2433
2434 p = f;
2435 zeropad = (f[1] == '0');
2436 f = parse_format_flags(f, &width, &precision,
2437 &longflag, &longlongflag, &size_tflag);
2438 switch (*f) {
2439 case 'c':
2440 {
2441 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002442 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002443 n++;
2444 break;
2445 }
2446 case '%':
2447 n++;
2448 break;
2449 case 'i':
2450 case 'd':
2451 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2452 width, precision, *f);
2453 if (longflag)
2454 numprinted = sprintf(numberresult, fmt,
2455 va_arg(count, long));
2456#ifdef HAVE_LONG_LONG
2457 else if (longlongflag)
2458 numprinted = sprintf(numberresult, fmt,
2459 va_arg(count, PY_LONG_LONG));
2460#endif
2461 else if (size_tflag)
2462 numprinted = sprintf(numberresult, fmt,
2463 va_arg(count, Py_ssize_t));
2464 else
2465 numprinted = sprintf(numberresult, fmt,
2466 va_arg(count, int));
2467 n += numprinted;
2468 /* advance by +1 to skip over the '\0' */
2469 numberresult += (numprinted + 1);
2470 assert(*(numberresult - 1) == '\0');
2471 assert(*(numberresult - 2) != '\0');
2472 assert(numprinted >= 0);
2473 assert(numberresult <= numberresults + numbersize);
2474 break;
2475 case 'u':
2476 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2477 width, precision, 'u');
2478 if (longflag)
2479 numprinted = sprintf(numberresult, fmt,
2480 va_arg(count, unsigned long));
2481#ifdef HAVE_LONG_LONG
2482 else if (longlongflag)
2483 numprinted = sprintf(numberresult, fmt,
2484 va_arg(count, unsigned PY_LONG_LONG));
2485#endif
2486 else if (size_tflag)
2487 numprinted = sprintf(numberresult, fmt,
2488 va_arg(count, size_t));
2489 else
2490 numprinted = sprintf(numberresult, fmt,
2491 va_arg(count, unsigned int));
2492 n += numprinted;
2493 numberresult += (numprinted + 1);
2494 assert(*(numberresult - 1) == '\0');
2495 assert(*(numberresult - 2) != '\0');
2496 assert(numprinted >= 0);
2497 assert(numberresult <= numberresults + numbersize);
2498 break;
2499 case 'x':
2500 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2501 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2502 n += numprinted;
2503 numberresult += (numprinted + 1);
2504 assert(*(numberresult - 1) == '\0');
2505 assert(*(numberresult - 2) != '\0');
2506 assert(numprinted >= 0);
2507 assert(numberresult <= numberresults + numbersize);
2508 break;
2509 case 'p':
2510 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2511 /* %p is ill-defined: ensure leading 0x. */
2512 if (numberresult[1] == 'X')
2513 numberresult[1] = 'x';
2514 else if (numberresult[1] != 'x') {
2515 memmove(numberresult + 2, numberresult,
2516 strlen(numberresult) + 1);
2517 numberresult[0] = '0';
2518 numberresult[1] = 'x';
2519 numprinted += 2;
2520 }
2521 n += numprinted;
2522 numberresult += (numprinted + 1);
2523 assert(*(numberresult - 1) == '\0');
2524 assert(*(numberresult - 2) != '\0');
2525 assert(numprinted >= 0);
2526 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002527 break;
2528 case 's':
2529 {
2530 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002531 const char *s = va_arg(count, const char*);
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002532 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002533 if (!str)
2534 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002535 /* since PyUnicode_DecodeUTF8 returns already flexible
2536 unicode objects, there is no need to call ready on them */
2537 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002538 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002539 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002540 /* Remember the str and switch to the next slot */
2541 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002542 break;
2543 }
2544 case 'U':
2545 {
2546 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002547 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002548 if (PyUnicode_READY(obj) == -1)
2549 goto fail;
2550 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002551 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002552 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002553 break;
2554 }
2555 case 'V':
2556 {
2557 PyObject *obj = va_arg(count, PyObject *);
2558 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002559 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002560 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002561 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002562 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002563 if (PyUnicode_READY(obj) == -1)
2564 goto fail;
2565 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002566 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002567 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002568 *callresult++ = NULL;
2569 }
2570 else {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002571 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002572 if (!str_obj)
2573 goto fail;
Benjamin Petersonbac79492012-01-14 13:34:47 -05002574 if (PyUnicode_READY(str_obj) == -1) {
Victor Stinnere1335c72011-10-04 20:53:03 +02002575 Py_DECREF(str_obj);
2576 goto fail;
2577 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002578 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002579 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002580 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002581 *callresult++ = str_obj;
2582 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002583 break;
2584 }
2585 case 'S':
2586 {
2587 PyObject *obj = va_arg(count, PyObject *);
2588 PyObject *str;
2589 assert(obj);
2590 str = PyObject_Str(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002591 if (!str)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002592 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002593 if (PyUnicode_READY(str) == -1) {
2594 Py_DECREF(str);
2595 goto fail;
2596 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002597 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002598 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002599 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002600 /* Remember the str and switch to the next slot */
2601 *callresult++ = str;
2602 break;
2603 }
2604 case 'R':
2605 {
2606 PyObject *obj = va_arg(count, PyObject *);
2607 PyObject *repr;
2608 assert(obj);
2609 repr = PyObject_Repr(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002610 if (!repr)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002611 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002612 if (PyUnicode_READY(repr) == -1) {
2613 Py_DECREF(repr);
2614 goto fail;
2615 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002616 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002617 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002618 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002619 /* Remember the repr and switch to the next slot */
2620 *callresult++ = repr;
2621 break;
2622 }
2623 case 'A':
2624 {
2625 PyObject *obj = va_arg(count, PyObject *);
2626 PyObject *ascii;
2627 assert(obj);
2628 ascii = PyObject_ASCII(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002629 if (!ascii)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002630 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002631 if (PyUnicode_READY(ascii) == -1) {
2632 Py_DECREF(ascii);
2633 goto fail;
2634 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002635 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002636 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002637 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002638 /* Remember the repr and switch to the next slot */
2639 *callresult++ = ascii;
2640 break;
2641 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002642 default:
2643 /* if we stumble upon an unknown
2644 formatting code, copy the rest of
2645 the format string to the output
2646 string. (we cannot just skip the
2647 code, since there's no way to know
2648 what's in the argument list) */
2649 n += strlen(p);
2650 goto expand;
2651 }
2652 } else
2653 n++;
2654 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002655 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002656 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002657 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002658 we don't have to resize the string.
2659 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002660 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002661 if (!string)
2662 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002663 kind = PyUnicode_KIND(string);
2664 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002665 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002666 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002667
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002668 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002669 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002670 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002671
2672 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002673 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2674 /* checking for == because the last argument could be a empty
2675 string, which causes i to point to end, the assert at the end of
2676 the loop */
2677 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002678
Benjamin Peterson14339b62009-01-31 16:36:08 +00002679 switch (*f) {
2680 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002681 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002682 const int ordinal = va_arg(vargs, int);
2683 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002684 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002685 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002686 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002687 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002688 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002689 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002690 case 'p':
Victor Stinnerc5166102012-02-22 13:55:02 +01002691 {
2692 Py_ssize_t written;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002693 /* unused, since we already have the result */
2694 if (*f == 'p')
2695 (void) va_arg(vargs, void *);
2696 else
2697 (void) va_arg(vargs, int);
2698 /* extract the result from numberresults and append. */
Victor Stinnerc5166102012-02-22 13:55:02 +01002699 written = unicode_write_cstr(string, i, numberresult);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002700 /* skip over the separating '\0' */
Victor Stinnerc5166102012-02-22 13:55:02 +01002701 i += written;
2702 numberresult += written;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002703 assert(*numberresult == '\0');
2704 numberresult++;
2705 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002706 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002707 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002708 case 's':
2709 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002710 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002711 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002712 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002713 size = PyUnicode_GET_LENGTH(*callresult);
2714 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002715 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002716 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002717 /* We're done with the unicode()/repr() => forget it */
2718 Py_DECREF(*callresult);
2719 /* switch to next unicode()/repr() result */
2720 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002721 break;
2722 }
2723 case 'U':
2724 {
2725 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002726 Py_ssize_t size;
2727 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2728 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002729 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002730 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002731 break;
2732 }
2733 case 'V':
2734 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002735 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002736 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002737 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002738 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002739 size = PyUnicode_GET_LENGTH(obj);
2740 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002741 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002742 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002743 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002744 size = PyUnicode_GET_LENGTH(*callresult);
2745 assert(PyUnicode_KIND(*callresult) <=
2746 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002747 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002748 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002749 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002750 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002751 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002752 break;
2753 }
2754 case 'S':
2755 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002756 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002757 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002758 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002759 /* unused, since we already have the result */
2760 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002761 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002762 copy_characters(string, i, *callresult, 0, size);
2763 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002764 /* We're done with the unicode()/repr() => forget it */
2765 Py_DECREF(*callresult);
2766 /* switch to next unicode()/repr() result */
2767 ++callresult;
2768 break;
2769 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002770 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002771 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002772 break;
2773 default:
Victor Stinnerc5166102012-02-22 13:55:02 +01002774 i += unicode_write_cstr(string, i, p);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002775 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002776 goto end;
2777 }
Victor Stinner1205f272010-09-11 00:54:47 +00002778 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002779 else {
2780 assert(i < PyUnicode_GET_LENGTH(string));
2781 PyUnicode_WRITE(kind, data, i++, *f);
2782 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002783 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002784 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002785
Benjamin Peterson29060642009-01-31 22:14:21 +00002786 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002787 if (callresults)
2788 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002789 if (numberresults)
2790 PyObject_Free(numberresults);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002791 return unicode_result(string);
Benjamin Peterson29060642009-01-31 22:14:21 +00002792 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002793 if (callresults) {
2794 PyObject **callresult2 = callresults;
2795 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002796 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002797 ++callresult2;
2798 }
2799 PyObject_Free(callresults);
2800 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002801 if (numberresults)
2802 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002803 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002804}
2805
Walter Dörwaldd2034312007-05-18 16:29:38 +00002806PyObject *
2807PyUnicode_FromFormat(const char *format, ...)
2808{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002809 PyObject* ret;
2810 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002811
2812#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002813 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002814#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002815 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002816#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002817 ret = PyUnicode_FromFormatV(format, vargs);
2818 va_end(vargs);
2819 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002820}
2821
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002822#ifdef HAVE_WCHAR_H
2823
Victor Stinner5593d8a2010-10-02 11:11:27 +00002824/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2825 convert a Unicode object to a wide character string.
2826
Victor Stinnerd88d9832011-09-06 02:00:05 +02002827 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002828 character) required to convert the unicode object. Ignore size argument.
2829
Victor Stinnerd88d9832011-09-06 02:00:05 +02002830 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002831 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002832 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002833static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002834unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002835 wchar_t *w,
2836 Py_ssize_t size)
2837{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002838 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002839 const wchar_t *wstr;
2840
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002841 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002842 if (wstr == NULL)
2843 return -1;
2844
Victor Stinner5593d8a2010-10-02 11:11:27 +00002845 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002846 if (size > res)
2847 size = res + 1;
2848 else
2849 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002850 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002851 return res;
2852 }
2853 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002854 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002855}
2856
2857Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002858PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002859 wchar_t *w,
2860 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002861{
2862 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002863 PyErr_BadInternalCall();
2864 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002865 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002866 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002867}
2868
Victor Stinner137c34c2010-09-29 10:25:54 +00002869wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002870PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002871 Py_ssize_t *size)
2872{
2873 wchar_t* buffer;
2874 Py_ssize_t buflen;
2875
2876 if (unicode == NULL) {
2877 PyErr_BadInternalCall();
2878 return NULL;
2879 }
2880
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002881 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002882 if (buflen == -1)
2883 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002884 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002885 PyErr_NoMemory();
2886 return NULL;
2887 }
2888
Victor Stinner137c34c2010-09-29 10:25:54 +00002889 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2890 if (buffer == NULL) {
2891 PyErr_NoMemory();
2892 return NULL;
2893 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002894 buflen = unicode_aswidechar(unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002895 if (buflen == -1)
2896 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002897 if (size != NULL)
2898 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002899 return buffer;
2900}
2901
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002902#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002903
Alexander Belopolsky40018472011-02-26 01:02:56 +00002904PyObject *
2905PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002906{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002907 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002908 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002909 PyErr_SetString(PyExc_ValueError,
2910 "chr() arg not in range(0x110000)");
2911 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002912 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002913
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002914 if (ordinal < 256)
2915 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002916
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002917 v = PyUnicode_New(1, ordinal);
2918 if (v == NULL)
2919 return NULL;
2920 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002921 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002922 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002923}
2924
Alexander Belopolsky40018472011-02-26 01:02:56 +00002925PyObject *
2926PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002927{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002928 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002929 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002930 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002931 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002932 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002933 Py_INCREF(obj);
2934 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002935 }
2936 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002937 /* For a Unicode subtype that's not a Unicode object,
2938 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002939 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002940 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002941 PyErr_Format(PyExc_TypeError,
2942 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002943 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002944 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002945}
2946
Alexander Belopolsky40018472011-02-26 01:02:56 +00002947PyObject *
2948PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002949 const char *encoding,
2950 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002951{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002952 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002953 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002954
Guido van Rossumd57fd912000-03-10 22:53:23 +00002955 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002956 PyErr_BadInternalCall();
2957 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002958 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002959
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002960 /* Decoding bytes objects is the most common case and should be fast */
2961 if (PyBytes_Check(obj)) {
2962 if (PyBytes_GET_SIZE(obj) == 0) {
2963 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002964 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002965 }
2966 else {
2967 v = PyUnicode_Decode(
2968 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2969 encoding, errors);
2970 }
2971 return v;
2972 }
2973
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002974 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002975 PyErr_SetString(PyExc_TypeError,
2976 "decoding str is not supported");
2977 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002978 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002979
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002980 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2981 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2982 PyErr_Format(PyExc_TypeError,
2983 "coercing to str: need bytes, bytearray "
2984 "or buffer-like object, %.80s found",
2985 Py_TYPE(obj)->tp_name);
2986 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002987 }
Tim Petersced69f82003-09-16 20:30:58 +00002988
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002989 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002990 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002991 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002992 }
Tim Petersced69f82003-09-16 20:30:58 +00002993 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002994 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002995
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002996 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002997 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002998}
2999
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   The result, including its terminating null byte, is written to `lower`,
   which has room for `lower_len` bytes.

   Return 1 on success, 0 on error (the normalized name, including the
   "utf-8" default, is longer than lower_len-1; in particular when
   lower_len is 0). */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    /* Without any room there is nothing valid to write, and computing
       &lower[lower_len - 1] below would point before the buffer. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* sizeof counts the terminating null byte of the literal. */
        if (lower_len < sizeof("utf-8"))
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }

    e = encoding;
    l = lower;
    /* Last usable slot: reserved for the terminating null byte. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
3036
Alexander Belopolsky40018472011-02-26 01:02:56 +00003037PyObject *
3038PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003039 Py_ssize_t size,
3040 const char *encoding,
3041 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003042{
3043 PyObject *buffer = NULL, *unicode;
3044 Py_buffer info;
3045 char lower[11]; /* Enough for any encoding shortcut */
3046
Fred Drakee4315f52000-05-09 19:53:39 +00003047 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003048 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003049 if ((strcmp(lower, "utf-8") == 0) ||
3050 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003051 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00003052 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003053 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003054 (strcmp(lower, "iso-8859-1") == 0))
3055 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003056#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003057 else if (strcmp(lower, "mbcs") == 0)
3058 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003059#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003060 else if (strcmp(lower, "ascii") == 0)
3061 return PyUnicode_DecodeASCII(s, size, errors);
3062 else if (strcmp(lower, "utf-16") == 0)
3063 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3064 else if (strcmp(lower, "utf-32") == 0)
3065 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3066 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003067
3068 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003069 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003070 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003071 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003072 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003073 if (buffer == NULL)
3074 goto onError;
3075 unicode = PyCodec_Decode(buffer, encoding, errors);
3076 if (unicode == NULL)
3077 goto onError;
3078 if (!PyUnicode_Check(unicode)) {
3079 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003080 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00003081 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003082 Py_DECREF(unicode);
3083 goto onError;
3084 }
3085 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003086 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003087
Benjamin Peterson29060642009-01-31 22:14:21 +00003088 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003089 Py_XDECREF(buffer);
3090 return NULL;
3091}
3092
Alexander Belopolsky40018472011-02-26 01:02:56 +00003093PyObject *
3094PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003095 const char *encoding,
3096 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003097{
3098 PyObject *v;
3099
3100 if (!PyUnicode_Check(unicode)) {
3101 PyErr_BadArgument();
3102 goto onError;
3103 }
3104
3105 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003106 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003107
3108 /* Decode via the codec registry */
3109 v = PyCodec_Decode(unicode, encoding, errors);
3110 if (v == NULL)
3111 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003112 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003113
Benjamin Peterson29060642009-01-31 22:14:21 +00003114 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003115 return NULL;
3116}
3117
Alexander Belopolsky40018472011-02-26 01:02:56 +00003118PyObject *
3119PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003120 const char *encoding,
3121 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003122{
3123 PyObject *v;
3124
3125 if (!PyUnicode_Check(unicode)) {
3126 PyErr_BadArgument();
3127 goto onError;
3128 }
3129
3130 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003131 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003132
3133 /* Decode via the codec registry */
3134 v = PyCodec_Decode(unicode, encoding, errors);
3135 if (v == NULL)
3136 goto onError;
3137 if (!PyUnicode_Check(v)) {
3138 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003139 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003140 Py_TYPE(v)->tp_name);
3141 Py_DECREF(v);
3142 goto onError;
3143 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003144 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003145
Benjamin Peterson29060642009-01-31 22:14:21 +00003146 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003147 return NULL;
3148}
3149
Alexander Belopolsky40018472011-02-26 01:02:56 +00003150PyObject *
3151PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003152 Py_ssize_t size,
3153 const char *encoding,
3154 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003155{
3156 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003157
Guido van Rossumd57fd912000-03-10 22:53:23 +00003158 unicode = PyUnicode_FromUnicode(s, size);
3159 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003160 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003161 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3162 Py_DECREF(unicode);
3163 return v;
3164}
3165
Alexander Belopolsky40018472011-02-26 01:02:56 +00003166PyObject *
3167PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003168 const char *encoding,
3169 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003170{
3171 PyObject *v;
3172
3173 if (!PyUnicode_Check(unicode)) {
3174 PyErr_BadArgument();
3175 goto onError;
3176 }
3177
3178 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003179 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003180
3181 /* Encode via the codec registry */
3182 v = PyCodec_Encode(unicode, encoding, errors);
3183 if (v == NULL)
3184 goto onError;
3185 return v;
3186
Benjamin Peterson29060642009-01-31 22:14:21 +00003187 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003188 return NULL;
3189}
3190
/* Locate the first wide character of `wstr` that wcstombs() refuses to
   convert.

   The string is fed to wcstombs() one character at a time (one surrogate
   pair at a time when wchar_t is 2 bytes wide).  Return the index, in
   wchar_t units, of the first chunk that fails, or 0 if every chunk
   converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* One character (or one surrogate pair) plus the terminating null. */
#if SIZEOF_WCHAR_T == 2
    wchar_t chunk[3];
#else
    wchar_t chunk[2];
#endif
    char encoded[MB_LEN_MAX];
    const wchar_t *const begin = wstr;
    const wchar_t *current;
    size_t converted;

#if SIZEOF_WCHAR_T == 2
    chunk[2] = 0;
#else
    chunk[1] = 0;
#endif
    while (*wstr != L'\0') {
        /* Remember where this chunk starts before advancing. */
        current = wstr;
#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(wstr[0])
            && Py_UNICODE_IS_LOW_SURROGATE(wstr[1]))
        {
            chunk[0] = wstr[0];
            chunk[1] = wstr[1];
            wstr += 2;
        }
        else {
            chunk[0] = *wstr;
            chunk[1] = 0;
            wstr++;
        }
#else
        chunk[0] = *wstr;
        wstr++;
#endif
        converted = wcstombs(encoded, chunk, sizeof(encoded));
        if (converted == (size_t)-1)
            return current - begin;
    }

    /* failed to find the unencodable character */
    return 0;
}
3237
Victor Stinner1b579672011-12-17 05:47:23 +01003238static int
3239locale_error_handler(const char *errors, int *surrogateescape)
3240{
3241 if (errors == NULL) {
3242 *surrogateescape = 0;
3243 return 0;
3244 }
3245
3246 if (strcmp(errors, "strict") == 0) {
3247 *surrogateescape = 0;
3248 return 0;
3249 }
3250 if (strcmp(errors, "surrogateescape") == 0) {
3251 *surrogateescape = 1;
3252 return 0;
3253 }
3254 PyErr_Format(PyExc_ValueError,
3255 "only 'strict' and 'surrogateescape' error handlers "
3256 "are supported, not '%s'",
3257 errors);
3258 return -1;
3259}
3260
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003261PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003262PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003263{
3264 Py_ssize_t wlen, wlen2;
3265 wchar_t *wstr;
3266 PyObject *bytes = NULL;
3267 char *errmsg;
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003268 PyObject *reason;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003269 PyObject *exc;
3270 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003271 int surrogateescape;
3272
3273 if (locale_error_handler(errors, &surrogateescape) < 0)
3274 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003275
3276 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3277 if (wstr == NULL)
3278 return NULL;
3279
3280 wlen2 = wcslen(wstr);
3281 if (wlen2 != wlen) {
3282 PyMem_Free(wstr);
3283 PyErr_SetString(PyExc_TypeError, "embedded null character");
3284 return NULL;
3285 }
3286
3287 if (surrogateescape) {
3288 /* locale encoding with surrogateescape */
3289 char *str;
3290
3291 str = _Py_wchar2char(wstr, &error_pos);
3292 if (str == NULL) {
3293 if (error_pos == (size_t)-1) {
3294 PyErr_NoMemory();
3295 PyMem_Free(wstr);
3296 return NULL;
3297 }
3298 else {
3299 goto encode_error;
3300 }
3301 }
3302 PyMem_Free(wstr);
3303
3304 bytes = PyBytes_FromString(str);
3305 PyMem_Free(str);
3306 }
3307 else {
3308 size_t len, len2;
3309
3310 len = wcstombs(NULL, wstr, 0);
3311 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003312 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003313 goto encode_error;
3314 }
3315
3316 bytes = PyBytes_FromStringAndSize(NULL, len);
3317 if (bytes == NULL) {
3318 PyMem_Free(wstr);
3319 return NULL;
3320 }
3321
3322 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3323 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003324 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003325 goto encode_error;
3326 }
3327 PyMem_Free(wstr);
3328 }
3329 return bytes;
3330
3331encode_error:
3332 errmsg = strerror(errno);
3333 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003334
3335 if (error_pos == (size_t)-1)
3336 error_pos = wcstombs_errorpos(wstr);
3337
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003338 PyMem_Free(wstr);
3339 Py_XDECREF(bytes);
3340
Victor Stinner2f197072011-12-17 07:08:30 +01003341 if (errmsg != NULL) {
3342 size_t errlen;
3343 wstr = _Py_char2wchar(errmsg, &errlen);
3344 if (wstr != NULL) {
3345 reason = PyUnicode_FromWideChar(wstr, errlen);
3346 PyMem_Free(wstr);
3347 } else
3348 errmsg = NULL;
3349 }
3350 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003351 reason = PyUnicode_FromString(
3352 "wcstombs() encountered an unencodable "
3353 "wide character");
3354 if (reason == NULL)
3355 return NULL;
3356
3357 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3358 "locale", unicode,
3359 (Py_ssize_t)error_pos,
3360 (Py_ssize_t)(error_pos+1),
3361 reason);
3362 Py_DECREF(reason);
3363 if (exc != NULL) {
3364 PyCodec_StrictErrors(exc);
3365 Py_XDECREF(exc);
3366 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003367 return NULL;
3368}
3369
Victor Stinnerad158722010-10-27 00:25:46 +00003370PyObject *
3371PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003372{
Victor Stinner99b95382011-07-04 14:23:54 +02003373#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003374 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003375#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003376 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003377#else
Victor Stinner793b5312011-04-27 00:24:21 +02003378 PyInterpreterState *interp = PyThreadState_GET()->interp;
3379 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3380 cannot use it to encode and decode filenames before it is loaded. Load
3381 the Python codec requires to encode at least its own filename. Use the C
3382 version of the locale codec until the codec registry is initialized and
3383 the Python codec is loaded.
3384
3385 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3386 cannot only rely on it: check also interp->fscodec_initialized for
3387 subinterpreters. */
3388 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003389 return PyUnicode_AsEncodedString(unicode,
3390 Py_FileSystemDefaultEncoding,
3391 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003392 }
3393 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003394 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003395 }
Victor Stinnerad158722010-10-27 00:25:46 +00003396#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003397}
3398
Alexander Belopolsky40018472011-02-26 01:02:56 +00003399PyObject *
3400PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003401 const char *encoding,
3402 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003403{
3404 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003405 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003406
Guido van Rossumd57fd912000-03-10 22:53:23 +00003407 if (!PyUnicode_Check(unicode)) {
3408 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003409 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003410 }
Fred Drakee4315f52000-05-09 19:53:39 +00003411
Fred Drakee4315f52000-05-09 19:53:39 +00003412 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003413 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003414 if ((strcmp(lower, "utf-8") == 0) ||
3415 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003416 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003417 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003418 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003419 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003420 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003421 }
Victor Stinner37296e82010-06-10 13:36:23 +00003422 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003423 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003424 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003425 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003426#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003427 else if (strcmp(lower, "mbcs") == 0)
3428 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003429#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003430 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003431 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003432 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003433
3434 /* Encode via the codec registry */
3435 v = PyCodec_Encode(unicode, encoding, errors);
3436 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003437 return NULL;
3438
3439 /* The normal path */
3440 if (PyBytes_Check(v))
3441 return v;
3442
3443 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003444 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003445 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003446 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003447
3448 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3449 "encoder %s returned bytearray instead of bytes",
3450 encoding);
3451 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003452 Py_DECREF(v);
3453 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003454 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003455
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003456 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3457 Py_DECREF(v);
3458 return b;
3459 }
3460
3461 PyErr_Format(PyExc_TypeError,
3462 "encoder did not return a bytes object (type=%.400s)",
3463 Py_TYPE(v)->tp_name);
3464 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003465 return NULL;
3466}
3467
Alexander Belopolsky40018472011-02-26 01:02:56 +00003468PyObject *
3469PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003470 const char *encoding,
3471 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003472{
3473 PyObject *v;
3474
3475 if (!PyUnicode_Check(unicode)) {
3476 PyErr_BadArgument();
3477 goto onError;
3478 }
3479
3480 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003481 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003482
3483 /* Encode via the codec registry */
3484 v = PyCodec_Encode(unicode, encoding, errors);
3485 if (v == NULL)
3486 goto onError;
3487 if (!PyUnicode_Check(v)) {
3488 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003489 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003490 Py_TYPE(v)->tp_name);
3491 Py_DECREF(v);
3492 goto onError;
3493 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003494 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003495
Benjamin Peterson29060642009-01-31 22:14:21 +00003496 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003497 return NULL;
3498}
3499
/* Locate the first byte sequence in the `len` bytes at `str` that cannot be
   converted to a wide character.

   When mbrtowc() is available the input is decoded one character at a time;
   the byte offset of the first invalid or incomplete sequence is returned.
   Return 0 if no such sequence is found, or if mbrtowc() is unavailable. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *const begin = str;
    mbstate_t state;
    wchar_t wc;
    size_t consumed;

    /* Start from the initial conversion state. */
    memset(&state, 0, sizeof state);
    for (; len != 0; str += consumed, len -= consumed) {
        consumed = mbrtowc(&wc, (char*)str, len, &state);
        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return str - begin;
        }
    }
    /* failed to find the undecodable byte sequence */
#endif
    return 0;
}
3530
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003531PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003532PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003533 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003534{
3535 wchar_t smallbuf[256];
3536 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3537 wchar_t *wstr;
3538 size_t wlen, wlen2;
3539 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003540 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003541 size_t error_pos;
3542 char *errmsg;
3543 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003544
3545 if (locale_error_handler(errors, &surrogateescape) < 0)
3546 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003547
3548 if (str[len] != '\0' || len != strlen(str)) {
3549 PyErr_SetString(PyExc_TypeError, "embedded null character");
3550 return NULL;
3551 }
3552
3553 if (surrogateescape)
3554 {
3555 wstr = _Py_char2wchar(str, &wlen);
3556 if (wstr == NULL) {
3557 if (wlen == (size_t)-1)
3558 PyErr_NoMemory();
3559 else
3560 PyErr_SetFromErrno(PyExc_OSError);
3561 return NULL;
3562 }
3563
3564 unicode = PyUnicode_FromWideChar(wstr, wlen);
3565 PyMem_Free(wstr);
3566 }
3567 else {
3568#ifndef HAVE_BROKEN_MBSTOWCS
3569 wlen = mbstowcs(NULL, str, 0);
3570#else
3571 wlen = len;
3572#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003573 if (wlen == (size_t)-1)
3574 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003575 if (wlen+1 <= smallbuf_len) {
3576 wstr = smallbuf;
3577 }
3578 else {
3579 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3580 return PyErr_NoMemory();
3581
3582 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3583 if (!wstr)
3584 return PyErr_NoMemory();
3585 }
3586
3587 /* This shouldn't fail now */
3588 wlen2 = mbstowcs(wstr, str, wlen+1);
3589 if (wlen2 == (size_t)-1) {
3590 if (wstr != smallbuf)
3591 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003592 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003593 }
3594#ifdef HAVE_BROKEN_MBSTOWCS
3595 assert(wlen2 == wlen);
3596#endif
3597 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3598 if (wstr != smallbuf)
3599 PyMem_Free(wstr);
3600 }
3601 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003602
3603decode_error:
3604 errmsg = strerror(errno);
3605 assert(errmsg != NULL);
3606
3607 error_pos = mbstowcs_errorpos(str, len);
3608 if (errmsg != NULL) {
3609 size_t errlen;
3610 wstr = _Py_char2wchar(errmsg, &errlen);
3611 if (wstr != NULL) {
3612 reason = PyUnicode_FromWideChar(wstr, errlen);
3613 PyMem_Free(wstr);
3614 } else
3615 errmsg = NULL;
3616 }
3617 if (errmsg == NULL)
3618 reason = PyUnicode_FromString(
3619 "mbstowcs() encountered an invalid multibyte sequence");
3620 if (reason == NULL)
3621 return NULL;
3622
3623 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3624 "locale", str, len,
3625 (Py_ssize_t)error_pos,
3626 (Py_ssize_t)(error_pos+1),
3627 reason);
3628 Py_DECREF(reason);
3629 if (exc != NULL) {
3630 PyCodec_StrictErrors(exc);
3631 Py_XDECREF(exc);
3632 }
3633 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003634}
3635
3636PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003637PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003638{
3639 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003640 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003641}
3642
3643
3644PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003645PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003646 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003647 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3648}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003649
Christian Heimes5894ba72007-11-04 11:43:14 +00003650PyObject*
3651PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3652{
Victor Stinner99b95382011-07-04 14:23:54 +02003653#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003654 return PyUnicode_DecodeMBCS(s, size, NULL);
3655#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003656 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003657#else
Victor Stinner793b5312011-04-27 00:24:21 +02003658 PyInterpreterState *interp = PyThreadState_GET()->interp;
3659 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3660 cannot use it to encode and decode filenames before it is loaded. Load
3661 the Python codec requires to encode at least its own filename. Use the C
3662 version of the locale codec until the codec registry is initialized and
3663 the Python codec is loaded.
3664
3665 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3666 cannot only rely on it: check also interp->fscodec_initialized for
3667 subinterpreters. */
3668 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003669 return PyUnicode_Decode(s, size,
3670 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003671 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003672 }
3673 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003674 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003675 }
Victor Stinnerad158722010-10-27 00:25:46 +00003676#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003677}
3678
Martin v. Löwis011e8422009-05-05 04:43:17 +00003679
3680int
Antoine Pitrou13348842012-01-29 18:36:34 +01003681_PyUnicode_HasNULChars(PyObject* s)
3682{
3683 static PyObject *nul = NULL;
3684
3685 if (nul == NULL)
3686 nul = PyUnicode_FromStringAndSize("\0", 1);
3687 if (nul == NULL)
3688 return -1;
3689 return PyUnicode_Contains(s, nul);
3690}
3691
3692
3693int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003694PyUnicode_FSConverter(PyObject* arg, void* addr)
3695{
3696 PyObject *output = NULL;
3697 Py_ssize_t size;
3698 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003699 if (arg == NULL) {
3700 Py_DECREF(*(PyObject**)addr);
3701 return 1;
3702 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003703 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003704 output = arg;
3705 Py_INCREF(output);
3706 }
3707 else {
3708 arg = PyUnicode_FromObject(arg);
3709 if (!arg)
3710 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003711 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003712 Py_DECREF(arg);
3713 if (!output)
3714 return 0;
3715 if (!PyBytes_Check(output)) {
3716 Py_DECREF(output);
3717 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3718 return 0;
3719 }
3720 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003721 size = PyBytes_GET_SIZE(output);
3722 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003723 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003724 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003725 Py_DECREF(output);
3726 return 0;
3727 }
3728 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003729 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003730}
3731
3732
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003733int
3734PyUnicode_FSDecoder(PyObject* arg, void* addr)
3735{
3736 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003737 if (arg == NULL) {
3738 Py_DECREF(*(PyObject**)addr);
3739 return 1;
3740 }
3741 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003742 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003743 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003744 output = arg;
3745 Py_INCREF(output);
3746 }
3747 else {
3748 arg = PyBytes_FromObject(arg);
3749 if (!arg)
3750 return 0;
3751 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3752 PyBytes_GET_SIZE(arg));
3753 Py_DECREF(arg);
3754 if (!output)
3755 return 0;
3756 if (!PyUnicode_Check(output)) {
3757 Py_DECREF(output);
3758 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3759 return 0;
3760 }
3761 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003762 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003763 Py_DECREF(output);
3764 return 0;
3765 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003766 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003767 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003768 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3769 Py_DECREF(output);
3770 return 0;
3771 }
3772 *(PyObject**)addr = output;
3773 return Py_CLEANUP_SUPPORTED;
3774}
3775
3776
Martin v. Löwis5b222132007-06-10 09:51:05 +00003777char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003778PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003779{
Christian Heimesf3863112007-11-22 07:46:41 +00003780 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003781
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003782 if (!PyUnicode_Check(unicode)) {
3783 PyErr_BadArgument();
3784 return NULL;
3785 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003786 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003787 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003788
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003789 if (PyUnicode_UTF8(unicode) == NULL) {
3790 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003791 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3792 if (bytes == NULL)
3793 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003794 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3795 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003796 Py_DECREF(bytes);
3797 return NULL;
3798 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003799 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3800 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3801 PyBytes_AS_STRING(bytes),
3802 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003803 Py_DECREF(bytes);
3804 }
3805
3806 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003807 *psize = PyUnicode_UTF8_LENGTH(unicode);
3808 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003809}
3810
3811char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003812PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003813{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003814 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3815}
3816
3817#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02003818static int unicode_as_unicode_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003819#endif
3820
3821
3822Py_UNICODE *
3823PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3824{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003825 const unsigned char *one_byte;
3826#if SIZEOF_WCHAR_T == 4
3827 const Py_UCS2 *two_bytes;
3828#else
3829 const Py_UCS4 *four_bytes;
3830 const Py_UCS4 *ucs4_end;
3831 Py_ssize_t num_surrogates;
3832#endif
3833 wchar_t *w;
3834 wchar_t *wchar_end;
3835
3836 if (!PyUnicode_Check(unicode)) {
3837 PyErr_BadArgument();
3838 return NULL;
3839 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003840 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003841 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003842 assert(_PyUnicode_KIND(unicode) != 0);
3843 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003844
3845#ifdef Py_DEBUG
3846 ++unicode_as_unicode_calls;
3847#endif
3848
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003849 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003850#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003851 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3852 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003853 num_surrogates = 0;
3854
3855 for (; four_bytes < ucs4_end; ++four_bytes) {
3856 if (*four_bytes > 0xFFFF)
3857 ++num_surrogates;
3858 }
3859
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003860 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3861 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3862 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003863 PyErr_NoMemory();
3864 return NULL;
3865 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003866 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003867
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003868 w = _PyUnicode_WSTR(unicode);
3869 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3870 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003871 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3872 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003873 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003874 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003875 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3876 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003877 }
3878 else
3879 *w = *four_bytes;
3880
3881 if (w > wchar_end) {
3882 assert(0 && "Miscalculated string end");
3883 }
3884 }
3885 *w = 0;
3886#else
3887 /* sizeof(wchar_t) == 4 */
3888 Py_FatalError("Impossible unicode object state, wstr and str "
3889 "should share memory already.");
3890 return NULL;
3891#endif
3892 }
3893 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003894 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3895 (_PyUnicode_LENGTH(unicode) + 1));
3896 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003897 PyErr_NoMemory();
3898 return NULL;
3899 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003900 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3901 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3902 w = _PyUnicode_WSTR(unicode);
3903 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003904
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003905 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3906 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003907 for (; w < wchar_end; ++one_byte, ++w)
3908 *w = *one_byte;
3909 /* null-terminate the wstr */
3910 *w = 0;
3911 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003912 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003913#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003914 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003915 for (; w < wchar_end; ++two_bytes, ++w)
3916 *w = *two_bytes;
3917 /* null-terminate the wstr */
3918 *w = 0;
3919#else
3920 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003921 PyObject_FREE(_PyUnicode_WSTR(unicode));
3922 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003923 Py_FatalError("Impossible unicode object state, wstr "
3924 "and str should share memory already.");
3925 return NULL;
3926#endif
3927 }
3928 else {
3929 assert(0 && "This should never happen.");
3930 }
3931 }
3932 }
3933 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003934 *size = PyUnicode_WSTR_LENGTH(unicode);
3935 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003936}
3937
Alexander Belopolsky40018472011-02-26 01:02:56 +00003938Py_UNICODE *
3939PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003940{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003941 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003942}
3943
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003944
Alexander Belopolsky40018472011-02-26 01:02:56 +00003945Py_ssize_t
3946PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003947{
3948 if (!PyUnicode_Check(unicode)) {
3949 PyErr_BadArgument();
3950 goto onError;
3951 }
3952 return PyUnicode_GET_SIZE(unicode);
3953
Benjamin Peterson29060642009-01-31 22:14:21 +00003954 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003955 return -1;
3956}
3957
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003958Py_ssize_t
3959PyUnicode_GetLength(PyObject *unicode)
3960{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003961 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003962 PyErr_BadArgument();
3963 return -1;
3964 }
3965
3966 return PyUnicode_GET_LENGTH(unicode);
3967}
3968
3969Py_UCS4
3970PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3971{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003972 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3973 PyErr_BadArgument();
3974 return (Py_UCS4)-1;
3975 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003976 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003977 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003978 return (Py_UCS4)-1;
3979 }
3980 return PyUnicode_READ_CHAR(unicode, index);
3981}
3982
3983int
3984PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3985{
3986 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003987 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003988 return -1;
3989 }
Victor Stinner488fa492011-12-12 00:01:39 +01003990 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003991 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003992 PyErr_SetString(PyExc_IndexError, "string index out of range");
3993 return -1;
3994 }
Victor Stinner488fa492011-12-12 00:01:39 +01003995 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003996 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01003997 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3998 PyErr_SetString(PyExc_ValueError, "character out of range");
3999 return -1;
4000 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004001 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4002 index, ch);
4003 return 0;
4004}
4005
/* Return the name of the default encoding, which is always "utf-8".
   The returned string is static storage and must not be modified. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4011
Victor Stinner554f3f02010-06-16 23:33:54 +00004012/* create or adjust a UnicodeDecodeError */
4013static void
4014make_decode_exception(PyObject **exceptionObject,
4015 const char *encoding,
4016 const char *input, Py_ssize_t length,
4017 Py_ssize_t startpos, Py_ssize_t endpos,
4018 const char *reason)
4019{
4020 if (*exceptionObject == NULL) {
4021 *exceptionObject = PyUnicodeDecodeError_Create(
4022 encoding, input, length, startpos, endpos, reason);
4023 }
4024 else {
4025 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4026 goto onError;
4027 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4028 goto onError;
4029 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4030 goto onError;
4031 }
4032 return;
4033
4034onError:
4035 Py_DECREF(*exceptionObject);
4036 *exceptionObject = NULL;
4037}
4038
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004039/* error handling callback helper:
4040 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004041 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004042 and adjust various state variables.
4043 return 0 on success, -1 on error
4044*/
4045
Alexander Belopolsky40018472011-02-26 01:02:56 +00004046static int
4047unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004048 const char *encoding, const char *reason,
4049 const char **input, const char **inend, Py_ssize_t *startinpos,
4050 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004051 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004052{
Benjamin Peterson142957c2008-07-04 19:55:29 +00004053 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004054
4055 PyObject *restuple = NULL;
4056 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004057 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004058 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004059 Py_ssize_t requiredsize;
4060 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004061 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004062 int res = -1;
4063
Victor Stinner596a6c42011-11-09 00:02:18 +01004064 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
4065 outsize = PyUnicode_GET_LENGTH(*output);
4066 else
4067 outsize = _PyUnicode_WSTR_LENGTH(*output);
4068
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004069 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004070 *errorHandler = PyCodec_LookupError(errors);
4071 if (*errorHandler == NULL)
4072 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004073 }
4074
Victor Stinner554f3f02010-06-16 23:33:54 +00004075 make_decode_exception(exceptionObject,
4076 encoding,
4077 *input, *inend - *input,
4078 *startinpos, *endinpos,
4079 reason);
4080 if (*exceptionObject == NULL)
4081 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004082
4083 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4084 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004085 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004086 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00004087 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004088 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004089 }
4090 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004091 goto onError;
Benjamin Petersonbac79492012-01-14 13:34:47 -05004092 if (PyUnicode_READY(repunicode) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004093 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004094
4095 /* Copy back the bytes variables, which might have been modified by the
4096 callback */
4097 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4098 if (!inputobj)
4099 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004100 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004101 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004102 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004103 *input = PyBytes_AS_STRING(inputobj);
4104 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004105 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004106 /* we can DECREF safely, as the exception has another reference,
4107 so the object won't go away. */
4108 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004109
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004110 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004111 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004112 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004113 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4114 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004115 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004116
Victor Stinner596a6c42011-11-09 00:02:18 +01004117 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
4118 /* need more space? (at least enough for what we
4119 have+the replacement+the rest of the string (starting
4120 at the new input position), so we won't have to check space
4121 when there are no errors in the rest of the string) */
4122 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
4123 requiredsize = *outpos + replen + insize-newpos;
4124 if (requiredsize > outsize) {
4125 if (requiredsize<2*outsize)
4126 requiredsize = 2*outsize;
4127 if (unicode_resize(output, requiredsize) < 0)
4128 goto onError;
4129 }
4130 if (unicode_widen(output, PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004131 goto onError;
Victor Stinner596a6c42011-11-09 00:02:18 +01004132 copy_characters(*output, *outpos, repunicode, 0, replen);
4133 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004134 }
Victor Stinner596a6c42011-11-09 00:02:18 +01004135 else {
4136 wchar_t *repwstr;
4137 Py_ssize_t repwlen;
4138 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4139 if (repwstr == NULL)
4140 goto onError;
4141 /* need more space? (at least enough for what we
4142 have+the replacement+the rest of the string (starting
4143 at the new input position), so we won't have to check space
4144 when there are no errors in the rest of the string) */
4145 requiredsize = *outpos + repwlen + insize-newpos;
4146 if (requiredsize > outsize) {
4147 if (requiredsize < 2*outsize)
4148 requiredsize = 2*outsize;
4149 if (unicode_resize(output, requiredsize) < 0)
4150 goto onError;
4151 }
4152 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4153 *outpos += repwlen;
4154 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004155 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004156 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004157
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004158 /* we made it! */
4159 res = 0;
4160
Benjamin Peterson29060642009-01-31 22:14:21 +00004161 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004162 Py_XDECREF(restuple);
4163 return res;
4164}
4165
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004166/* --- UTF-7 Codec -------------------------------------------------------- */
4167
Antoine Pitrou244651a2009-05-04 18:56:13 +00004168/* See RFC2152 for details. We encode conservatively and decode liberally. */
4169
/* Three simple macros defining base-64.  Each one may evaluate its
   argument more than once, so pass expressions without side effects. */

/* Is c one of the 64 base-64 characters (A-Z, a-z, 0-9, '+', '/')? */

#define IS_BASE64(c) \
    ((c) == '+' || (c) == '/' ||     \
     ((c) >= '0' && (c) <= '9') ||   \
     ((c) >= 'a' && (c) <= 'z') ||   \
     ((c) >= 'A' && (c) <= 'Z'))

/* Given that c is a base-64 character, what is its base-64 value?
   A-Z map to 0-25, a-z to 26-51, 0-9 to 52-61, '+' to 62; anything else
   (in particular '/') yields 63. */

#define FROM_BASE64(c) \
    (((c) == '+') ? 62 :                               \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :     \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :     \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
4192
/* DECODE_DIRECT: should this byte, found in a UTF-7 string outside a
 * base-64 section, be decoded as itself?  Decoding is permissive: every
 * value up to 127 qualifies except '+', which starts a base-64 section.
 * The argument is evaluated twice. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) <= 127)
4200
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c must be in 1..127 to qualify at all; the range test comes first so
 * the table is never indexed out of bounds.  directO and directWS are
 * parenthesized in the expansion so that compound flag expressions such
 * as (a || b) keep their meaning.  c is evaluated several times. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004246
Alexander Belopolsky40018472011-02-26 01:02:56 +00004247PyObject *
4248PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004249 Py_ssize_t size,
4250 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004251{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004252 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4253}
4254
Antoine Pitrou244651a2009-05-04 18:56:13 +00004255/* The decoder. The only state we preserve is our read position,
4256 * i.e. how many characters we have consumed. So if we end in the
4257 * middle of a shift sequence we have to back off the read position
4258 * and the output to the beginning of the sequence, otherwise we lose
4259 * all the shift state (seen bits, number of bits seen, high
4260 * surrogate). */
4261
Alexander Belopolsky40018472011-02-26 01:02:56 +00004262PyObject *
4263PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004264 Py_ssize_t size,
4265 const char *errors,
4266 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004267{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004268 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004269 Py_ssize_t startinpos;
4270 Py_ssize_t endinpos;
4271 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004272 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004273 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004274 const char *errmsg = "";
4275 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004276 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004277 unsigned int base64bits = 0;
4278 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004279 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004280 PyObject *errorHandler = NULL;
4281 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004282
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004283 /* Start off assuming it's all ASCII. Widen later as necessary. */
4284 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004285 if (!unicode)
4286 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004287 if (size == 0) {
4288 if (consumed)
4289 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004290 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004291 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004292
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004293 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004294 e = s + size;
4295
4296 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004297 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004298 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004299 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004300
Antoine Pitrou244651a2009-05-04 18:56:13 +00004301 if (inShift) { /* in a base-64 section */
4302 if (IS_BASE64(ch)) { /* consume a base-64 character */
4303 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4304 base64bits += 6;
4305 s++;
4306 if (base64bits >= 16) {
4307 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004308 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004309 base64bits -= 16;
4310 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4311 if (surrogate) {
4312 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004313 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4314 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004315 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
4316 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004317 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004318 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004319 }
4320 else {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004321 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4322 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004323 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004324 }
4325 }
Victor Stinner551ac952011-11-29 22:58:13 +01004326 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004327 /* first surrogate */
4328 surrogate = outCh;
4329 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004330 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004331 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
4332 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004333 }
4334 }
4335 }
4336 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004337 inShift = 0;
4338 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004339 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004340 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4341 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004342 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004343 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004344 if (base64bits > 0) { /* left-over bits */
4345 if (base64bits >= 6) {
4346 /* We've seen at least one base-64 character */
4347 errmsg = "partial character in shift sequence";
4348 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004349 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004350 else {
4351 /* Some bits remain; they should be zero */
4352 if (base64buffer != 0) {
4353 errmsg = "non-zero padding bits in shift sequence";
4354 goto utf7Error;
4355 }
4356 }
4357 }
4358 if (ch != '-') {
4359 /* '-' is absorbed; other terminating
4360 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004361 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4362 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004363 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004364 }
4365 }
4366 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004367 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004368 s++; /* consume '+' */
4369 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004370 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004371 if (unicode_putchar(&unicode, &outpos, '+') < 0)
4372 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004373 }
4374 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004375 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004376 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004377 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004378 }
4379 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004380 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004381 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4382 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004383 s++;
4384 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004385 else {
4386 startinpos = s-starts;
4387 s++;
4388 errmsg = "unexpected special character";
4389 goto utf7Error;
4390 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004391 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004392utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004393 endinpos = s-starts;
4394 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00004395 errors, &errorHandler,
4396 "utf7", errmsg,
4397 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004398 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004399 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004400 }
4401
Antoine Pitrou244651a2009-05-04 18:56:13 +00004402 /* end of string */
4403
4404 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4405 /* if we're in an inconsistent state, that's an error */
4406 if (surrogate ||
4407 (base64bits >= 6) ||
4408 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004409 endinpos = size;
4410 if (unicode_decode_call_errorhandler(
4411 errors, &errorHandler,
4412 "utf7", "unterminated shift sequence",
4413 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004414 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004415 goto onError;
4416 if (s < e)
4417 goto restart;
4418 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004419 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004420
4421 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004422 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004423 if (inShift) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004424 outpos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004425 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004426 }
4427 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004428 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004429 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004430 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004431
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004432 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004433 goto onError;
4434
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004435 Py_XDECREF(errorHandler);
4436 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004437 return unicode_result(unicode);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004438
Benjamin Peterson29060642009-01-31 22:14:21 +00004439 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004440 Py_XDECREF(errorHandler);
4441 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004442 Py_DECREF(unicode);
4443 return NULL;
4444}
4445
4446
Alexander Belopolsky40018472011-02-26 01:02:56 +00004447PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004448_PyUnicode_EncodeUTF7(PyObject *str,
4449 int base64SetO,
4450 int base64WhiteSpace,
4451 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004452{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004453 int kind;
4454 void *data;
4455 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004456 PyObject *v;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004457 Py_ssize_t allocated;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004458 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004459 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004460 unsigned int base64bits = 0;
4461 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004462 char * out;
4463 char * start;
4464
Benjamin Petersonbac79492012-01-14 13:34:47 -05004465 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004466 return NULL;
4467 kind = PyUnicode_KIND(str);
4468 data = PyUnicode_DATA(str);
4469 len = PyUnicode_GET_LENGTH(str);
4470
4471 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004472 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004473
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004474 /* It might be possible to tighten this worst case */
4475 allocated = 8 * len;
4476 if (allocated / 8 != len)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004477 return PyErr_NoMemory();
4478
Antoine Pitrou244651a2009-05-04 18:56:13 +00004479 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004480 if (v == NULL)
4481 return NULL;
4482
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004483 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004484 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004485 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004486
Antoine Pitrou244651a2009-05-04 18:56:13 +00004487 if (inShift) {
4488 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4489 /* shifting out */
4490 if (base64bits) { /* output remaining bits */
4491 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4492 base64buffer = 0;
4493 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004494 }
4495 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004496 /* Characters not in the BASE64 set implicitly unshift the sequence
4497 so no '-' is required, except if the character is itself a '-' */
4498 if (IS_BASE64(ch) || ch == '-') {
4499 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004500 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004501 *out++ = (char) ch;
4502 }
4503 else {
4504 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004505 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004506 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004507 else { /* not in a shift sequence */
4508 if (ch == '+') {
4509 *out++ = '+';
4510 *out++ = '-';
4511 }
4512 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4513 *out++ = (char) ch;
4514 }
4515 else {
4516 *out++ = '+';
4517 inShift = 1;
4518 goto encode_char;
4519 }
4520 }
4521 continue;
4522encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004523 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004524 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004525
Antoine Pitrou244651a2009-05-04 18:56:13 +00004526 /* code first surrogate */
4527 base64bits += 16;
4528 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4529 while (base64bits >= 6) {
4530 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4531 base64bits -= 6;
4532 }
4533 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004534 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004535 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004536 base64bits += 16;
4537 base64buffer = (base64buffer << 16) | ch;
4538 while (base64bits >= 6) {
4539 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4540 base64bits -= 6;
4541 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004542 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004543 if (base64bits)
4544 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4545 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004546 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004547 if (_PyBytes_Resize(&v, out - start) < 0)
4548 return NULL;
4549 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004550}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004551PyObject *
4552PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4553 Py_ssize_t size,
4554 int base64SetO,
4555 int base64WhiteSpace,
4556 const char *errors)
4557{
4558 PyObject *result;
4559 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4560 if (tmp == NULL)
4561 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004562 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004563 base64WhiteSpace, errors);
4564 Py_DECREF(tmp);
4565 return result;
4566}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004567
Antoine Pitrou244651a2009-05-04 18:56:13 +00004568#undef IS_BASE64
4569#undef FROM_BASE64
4570#undef TO_BASE64
4571#undef DECODE_DIRECT
4572#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004573
Guido van Rossumd57fd912000-03-10 22:53:23 +00004574/* --- UTF-8 Codec -------------------------------------------------------- */
4575
/* Map a UTF-8 lead (prefix) byte to the total length in bytes of the
   sequence it starts.  Zero marks a byte that cannot start a sequence:
   continuation bytes (80-BF), the overlong prefixes C0-C1, and F5-FF.
   See RFC 3629 for details.  The table is read-only, hence const. */
static const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4597
Alexander Belopolsky40018472011-02-26 01:02:56 +00004598PyObject *
4599PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004600 Py_ssize_t size,
4601 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004602{
Walter Dörwald69652032004-09-07 20:24:22 +00004603 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4604}
4605
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004606#include "stringlib/ucs1lib.h"
4607#include "stringlib/codecs.h"
4608#include "stringlib/undef.h"
4609
4610#include "stringlib/ucs2lib.h"
4611#include "stringlib/codecs.h"
4612#include "stringlib/undef.h"
4613
4614#include "stringlib/ucs4lib.h"
4615#include "stringlib/codecs.h"
4616#include "stringlib/undef.h"
4617
Antoine Pitrouab868312009-01-10 15:40:25 +00004618/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4619#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4620
4621/* Mask to quickly check whether a C 'long' contains a
4622 non-ASCII, UTF8-encoded char. */
4623#if (SIZEOF_LONG == 8)
4624# define ASCII_CHAR_MASK 0x8080808080808080L
4625#elif (SIZEOF_LONG == 4)
4626# define ASCII_CHAR_MASK 0x80808080L
4627#else
4628# error C 'long' size should be either 4 or 8!
4629#endif
4630
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004631/* Scans a UTF-8 string and returns the maximum character to be expected
4632 and the size of the decoded unicode string.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004633
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004634 This function doesn't check for errors, these checks are performed in
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004635 PyUnicode_DecodeUTF8Stateful.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004636 */
4637static Py_UCS4
Victor Stinnera1d12bb2011-12-11 21:53:09 +01004638utf8_scanner(const unsigned char *p, Py_ssize_t string_size, Py_ssize_t *unicode_size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004639{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004640 Py_ssize_t char_count = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004641 const unsigned char *end = p + string_size;
4642 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004643
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004644 assert(unicode_size != NULL);
4645
4646 /* By having a cascade of independent loops which fallback onto each
4647 other, we minimize the amount of work done in the average loop
4648 iteration, and we also maximize the CPU's ability to predict
4649 branches correctly (because a given condition will have always the
4650 same boolean outcome except perhaps in the last iteration of the
4651 corresponding loop).
4652 In the general case this brings us rather close to decoding
4653 performance pre-PEP 393, despite the two-pass decoding.
4654
4655 Note that the pure ASCII loop is not duplicated once a non-ASCII
4656 character has been encountered. It is actually a pessimization (by
4657 a significant factor) to use this loop on text with many non-ASCII
4658 characters, and it is important to avoid bad performance on valid
4659 utf-8 data (invalid utf-8 being a different can of worms).
4660 */
4661
4662 /* ASCII */
4663 for (; p < end; ++p) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004664 /* Only check value if it's not a ASCII char... */
4665 if (*p < 0x80) {
4666 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4667 an explanation. */
4668 if (!((size_t) p & LONG_PTR_MASK)) {
4669 /* Help register allocation */
4670 register const unsigned char *_p = p;
4671 while (_p < aligned_end) {
4672 unsigned long value = *(unsigned long *) _p;
4673 if (value & ASCII_CHAR_MASK)
4674 break;
4675 _p += SIZEOF_LONG;
4676 char_count += SIZEOF_LONG;
4677 }
4678 p = _p;
4679 if (p == end)
4680 break;
4681 }
4682 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004683 if (*p < 0x80)
4684 ++char_count;
4685 else
4686 goto _ucs1loop;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004687 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004688 *unicode_size = char_count;
4689 return 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004690
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004691_ucs1loop:
4692 for (; p < end; ++p) {
4693 if (*p < 0xc4)
4694 char_count += ((*p & 0xc0) != 0x80);
4695 else
4696 goto _ucs2loop;
4697 }
4698 *unicode_size = char_count;
4699 return 255;
4700
4701_ucs2loop:
4702 for (; p < end; ++p) {
4703 if (*p < 0xf0)
4704 char_count += ((*p & 0xc0) != 0x80);
4705 else
4706 goto _ucs4loop;
4707 }
4708 *unicode_size = char_count;
4709 return 65535;
4710
4711_ucs4loop:
4712 for (; p < end; ++p) {
4713 char_count += ((*p & 0xc0) != 0x80);
4714 }
4715 *unicode_size = char_count;
4716 return 65537;
4717}
4718
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004719/* Similar to PyUnicode_WRITE but may attempt to widen and resize the string
Victor Stinner785938e2011-12-11 20:09:03 +01004720 in case of errors. Implicit parameters: unicode, kind, data, onError.
4721 Potential resizing overallocates, so the result needs to shrink at the end.
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004722*/
Victor Stinner785938e2011-12-11 20:09:03 +01004723#define WRITE_MAYBE_FAIL(index, value) \
4724 do { \
4725 Py_ssize_t pos = index; \
4726 if (pos > PyUnicode_GET_LENGTH(unicode) && \
4727 unicode_resize(&unicode, pos + pos/8) < 0) \
4728 goto onError; \
4729 if (unicode_putchar(&unicode, &pos, value) < 0) \
4730 goto onError; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004731 } while (0)
4732
Victor Stinnerbf6e5602011-12-12 01:53:47 +01004733static PyObject *
Victor Stinner785938e2011-12-11 20:09:03 +01004734decode_utf8_errors(const char *starts,
4735 Py_ssize_t size,
4736 const char *errors,
4737 Py_ssize_t *consumed,
4738 const char *s,
4739 PyObject *unicode,
4740 Py_ssize_t i)
Walter Dörwald69652032004-09-07 20:24:22 +00004741{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004742 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004743 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004744 Py_ssize_t startinpos;
4745 Py_ssize_t endinpos;
Victor Stinner785938e2011-12-11 20:09:03 +01004746 const char *e = starts + size;
4747 const char *aligned_end;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004748 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004749 PyObject *errorHandler = NULL;
4750 PyObject *exc = NULL;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004751
Antoine Pitrouab868312009-01-10 15:40:25 +00004752 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004753
4754 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004755 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004756
4757 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004758 /* Fast path for runs of ASCII characters. Given that common UTF-8
4759 input will consist of an overwhelming majority of ASCII
4760 characters, we try to optimize for this case by checking
4761 as many characters as a C 'long' can contain.
4762 First, check if we can do an aligned read, as most CPUs have
4763 a penalty for unaligned reads.
4764 */
4765 if (!((size_t) s & LONG_PTR_MASK)) {
4766 /* Help register allocation */
4767 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004768 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004769 while (_s < aligned_end) {
4770 /* Read a whole long at a time (either 4 or 8 bytes),
4771 and do a fast unrolled copy if it only contains ASCII
4772 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004773 unsigned long value = *(unsigned long *) _s;
4774 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004775 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004776 WRITE_MAYBE_FAIL(_i+0, _s[0]);
4777 WRITE_MAYBE_FAIL(_i+1, _s[1]);
4778 WRITE_MAYBE_FAIL(_i+2, _s[2]);
4779 WRITE_MAYBE_FAIL(_i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004780#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004781 WRITE_MAYBE_FAIL(_i+4, _s[4]);
4782 WRITE_MAYBE_FAIL(_i+5, _s[5]);
4783 WRITE_MAYBE_FAIL(_i+6, _s[6]);
4784 WRITE_MAYBE_FAIL(_i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004785#endif
4786 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004787 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004788 }
4789 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004790 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004791 if (s == e)
4792 break;
4793 ch = (unsigned char)*s;
4794 }
4795 }
4796
4797 if (ch < 0x80) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004798 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004799 s++;
4800 continue;
4801 }
4802
4803 n = utf8_code_length[ch];
4804
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004805 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004806 if (consumed)
4807 break;
4808 else {
4809 errmsg = "unexpected end of data";
4810 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004811 endinpos = startinpos+1;
4812 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4813 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004814 goto utf8Error;
4815 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004816 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004817
4818 switch (n) {
4819
4820 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004821 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004822 startinpos = s-starts;
4823 endinpos = startinpos+1;
4824 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004825
4826 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004827 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004828 startinpos = s-starts;
4829 endinpos = startinpos+1;
4830 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004831
4832 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004833 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004834 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004835 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004836 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004837 goto utf8Error;
4838 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004839 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004840 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004841 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004842 break;
4843
4844 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004845 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4846 will result in surrogates in range d800-dfff. Surrogates are
4847 not valid UTF-8 so they are rejected.
4848 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4849 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004850 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004851 (s[2] & 0xc0) != 0x80 ||
4852 ((unsigned char)s[0] == 0xE0 &&
4853 (unsigned char)s[1] < 0xA0) ||
4854 ((unsigned char)s[0] == 0xED &&
4855 (unsigned char)s[1] > 0x9F)) {
4856 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004857 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004858 endinpos = startinpos + 1;
4859
4860 /* if s[1] first two bits are 1 and 0, then the invalid
4861 continuation byte is s[2], so increment endinpos by 1,
4862 if not, s[1] is invalid and endinpos doesn't need to
4863 be incremented. */
4864 if ((s[1] & 0xC0) == 0x80)
4865 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004866 goto utf8Error;
4867 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004868 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004869 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004870 WRITE_MAYBE_FAIL(i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004871 break;
4872
4873 case 4:
4874 if ((s[1] & 0xc0) != 0x80 ||
4875 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004876 (s[3] & 0xc0) != 0x80 ||
4877 ((unsigned char)s[0] == 0xF0 &&
4878 (unsigned char)s[1] < 0x90) ||
4879 ((unsigned char)s[0] == 0xF4 &&
4880 (unsigned char)s[1] > 0x8F)) {
4881 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004882 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004883 endinpos = startinpos + 1;
4884 if ((s[1] & 0xC0) == 0x80) {
4885 endinpos++;
4886 if ((s[2] & 0xC0) == 0x80)
4887 endinpos++;
4888 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004889 goto utf8Error;
4890 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004891 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004892 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Victor Stinner8faf8212011-12-08 22:14:11 +01004893 assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
Ezio Melotti57221d02010-07-01 07:32:02 +00004894
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004895 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004896 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004897 }
4898 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004899 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004900
Benjamin Peterson29060642009-01-31 22:14:21 +00004901 utf8Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00004902 if (unicode_decode_call_errorhandler(
4903 errors, &errorHandler,
Victor Stinnercbe01342012-02-14 01:17:45 +01004904 "utf-8", errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00004905 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004906 &unicode, &i))
Benjamin Peterson29060642009-01-31 22:14:21 +00004907 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004908 /* Update data because unicode_decode_call_errorhandler might have
4909 re-created or resized the unicode object. */
Benjamin Peterson29060642009-01-31 22:14:21 +00004910 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004911 }
Walter Dörwald69652032004-09-07 20:24:22 +00004912 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004913 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004914
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004915 /* Adjust length and ready string when it contained errors and
4916 is of the old resizable kind. */
Victor Stinner785938e2011-12-11 20:09:03 +01004917 if (unicode_resize(&unicode, i) < 0)
4918 goto onError;
4919 unicode_adjust_maxchar(&unicode);
4920 if (unicode == NULL)
4921 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004922
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004923 Py_XDECREF(errorHandler);
4924 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004925 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01004926 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004927
Benjamin Peterson29060642009-01-31 22:14:21 +00004928 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004929 Py_XDECREF(errorHandler);
4930 Py_XDECREF(exc);
Victor Stinner785938e2011-12-11 20:09:03 +01004931 Py_XDECREF(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004932 return NULL;
4933}
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004934#undef WRITE_MAYBE_FAIL
Antoine Pitrouab868312009-01-10 15:40:25 +00004935
Victor Stinner785938e2011-12-11 20:09:03 +01004936PyObject *
4937PyUnicode_DecodeUTF8Stateful(const char *s,
4938 Py_ssize_t size,
4939 const char *errors,
4940 Py_ssize_t *consumed)
4941{
4942 Py_UCS4 maxchar = 0;
4943 Py_ssize_t unicode_size;
4944 int has_errors = 0;
4945 PyObject *unicode;
4946 int kind;
4947 void *data;
4948 const char *starts = s;
4949 const char *e;
4950 Py_ssize_t i;
4951
4952 if (size == 0) {
4953 if (consumed)
4954 *consumed = 0;
Victor Stinner382955f2011-12-11 21:44:00 +01004955 Py_INCREF(unicode_empty);
4956 return unicode_empty;
Victor Stinner785938e2011-12-11 20:09:03 +01004957 }
4958
Victor Stinnera1d12bb2011-12-11 21:53:09 +01004959 maxchar = utf8_scanner((const unsigned char *)s, size, &unicode_size);
Victor Stinner785938e2011-12-11 20:09:03 +01004960
4961 /* When the string is ASCII only, just use memcpy and return.
4962 unicode_size may be != size if there is an incomplete UTF-8
4963 sequence at the end of the ASCII block. */
4964 if (maxchar < 128 && size == unicode_size) {
4965 if (consumed)
4966 *consumed = size;
Victor Stinnerab870212011-12-17 22:39:43 +01004967 return unicode_fromascii((const unsigned char *)s, size);
Victor Stinner785938e2011-12-11 20:09:03 +01004968 }
4969
4970 unicode = PyUnicode_New(unicode_size, maxchar);
4971 if (!unicode)
4972 return NULL;
4973 kind = PyUnicode_KIND(unicode);
4974 data = PyUnicode_DATA(unicode);
4975
4976 /* Unpack UTF-8 encoded data */
4977 i = 0;
4978 e = starts + size;
4979 switch (kind) {
4980 case PyUnicode_1BYTE_KIND:
4981 has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
4982 break;
4983 case PyUnicode_2BYTE_KIND:
4984 has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
4985 break;
4986 case PyUnicode_4BYTE_KIND:
4987 has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
4988 break;
4989 }
4990 if (!has_errors) {
4991 /* Ensure the unicode size calculation was correct */
4992 assert(i == unicode_size);
4993 assert(s == e);
4994 if (consumed)
4995 *consumed = size;
4996 return unicode;
4997 }
4998
4999 /* In case of errors, maxchar and size computation might be incorrect;
5000 code below refits and resizes as necessary. */
5001 return decode_utf8_errors(starts, size, errors, consumed, s, unicode, i);
5002}
5003
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005004#ifdef __APPLE__
5005
5006/* Simplified UTF-8 decoder using surrogateescape error handler,
5007 used to decode the command line arguments on Mac OS X. */
5008
5009wchar_t*
5010_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
5011{
5012 int n;
5013 const char *e;
5014 wchar_t *unicode, *p;
5015
5016 /* Note: size will always be longer than the resulting Unicode
5017 character count */
5018 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
5019 PyErr_NoMemory();
5020 return NULL;
5021 }
5022 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
5023 if (!unicode)
5024 return NULL;
5025
5026 /* Unpack UTF-8 encoded data */
5027 p = unicode;
5028 e = s + size;
5029 while (s < e) {
5030 Py_UCS4 ch = (unsigned char)*s;
5031
5032 if (ch < 0x80) {
5033 *p++ = (wchar_t)ch;
5034 s++;
5035 continue;
5036 }
5037
5038 n = utf8_code_length[ch];
5039 if (s + n > e) {
5040 goto surrogateescape;
5041 }
5042
5043 switch (n) {
5044 case 0:
5045 case 1:
5046 goto surrogateescape;
5047
5048 case 2:
5049 if ((s[1] & 0xc0) != 0x80)
5050 goto surrogateescape;
5051 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
5052 assert ((ch > 0x007F) && (ch <= 0x07FF));
5053 *p++ = (wchar_t)ch;
5054 break;
5055
5056 case 3:
5057 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
5058 will result in surrogates in range d800-dfff. Surrogates are
5059 not valid UTF-8 so they are rejected.
5060 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
5061 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
5062 if ((s[1] & 0xc0) != 0x80 ||
5063 (s[2] & 0xc0) != 0x80 ||
5064 ((unsigned char)s[0] == 0xE0 &&
5065 (unsigned char)s[1] < 0xA0) ||
5066 ((unsigned char)s[0] == 0xED &&
5067 (unsigned char)s[1] > 0x9F)) {
5068
5069 goto surrogateescape;
5070 }
5071 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5072 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005073 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005074 break;
5075
5076 case 4:
5077 if ((s[1] & 0xc0) != 0x80 ||
5078 (s[2] & 0xc0) != 0x80 ||
5079 (s[3] & 0xc0) != 0x80 ||
5080 ((unsigned char)s[0] == 0xF0 &&
5081 (unsigned char)s[1] < 0x90) ||
5082 ((unsigned char)s[0] == 0xF4 &&
5083 (unsigned char)s[1] > 0x8F)) {
5084 goto surrogateescape;
5085 }
5086 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
5087 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Victor Stinner8faf8212011-12-08 22:14:11 +01005088 assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005089
5090#if SIZEOF_WCHAR_T == 4
5091 *p++ = (wchar_t)ch;
5092#else
5093 /* compute and append the two surrogates: */
Victor Stinner551ac952011-11-29 22:58:13 +01005094 *p++ = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5095 *p++ = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005096#endif
5097 break;
5098 }
5099 s += n;
5100 continue;
5101
5102 surrogateescape:
5103 *p++ = 0xDC00 + ch;
5104 s++;
5105 }
5106 *p = L'\0';
5107 return unicode;
5108}
5109
5110#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00005111
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005112/* Primary internal function which creates utf8 encoded bytes objects.
5113
5114 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005115 and allocate exactly as much space needed at the end. Else allocate the
5116 maximum possible needed (4 result bytes per Unicode character), and return
5117 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005118*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005119PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005120_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005121{
Victor Stinner6099a032011-12-18 14:22:26 +01005122 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005123 void *data;
5124 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005125
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005126 if (!PyUnicode_Check(unicode)) {
5127 PyErr_BadArgument();
5128 return NULL;
5129 }
5130
5131 if (PyUnicode_READY(unicode) == -1)
5132 return NULL;
5133
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005134 if (PyUnicode_UTF8(unicode))
5135 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5136 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005137
5138 kind = PyUnicode_KIND(unicode);
5139 data = PyUnicode_DATA(unicode);
5140 size = PyUnicode_GET_LENGTH(unicode);
5141
Benjamin Petersonead6b532011-12-20 17:23:42 -06005142 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005143 default:
5144 assert(0);
5145 case PyUnicode_1BYTE_KIND:
5146 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5147 assert(!PyUnicode_IS_ASCII(unicode));
5148 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5149 case PyUnicode_2BYTE_KIND:
5150 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5151 case PyUnicode_4BYTE_KIND:
5152 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005153 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005154}
5155
Alexander Belopolsky40018472011-02-26 01:02:56 +00005156PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005157PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5158 Py_ssize_t size,
5159 const char *errors)
5160{
5161 PyObject *v, *unicode;
5162
5163 unicode = PyUnicode_FromUnicode(s, size);
5164 if (unicode == NULL)
5165 return NULL;
5166 v = _PyUnicode_AsUTF8String(unicode, errors);
5167 Py_DECREF(unicode);
5168 return v;
5169}
5170
5171PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005172PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005173{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005174 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005175}
5176
Walter Dörwald41980ca2007-08-16 21:55:45 +00005177/* --- UTF-32 Codec ------------------------------------------------------- */
5178
5179PyObject *
5180PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005181 Py_ssize_t size,
5182 const char *errors,
5183 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005184{
5185 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5186}
5187
5188PyObject *
5189PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005190 Py_ssize_t size,
5191 const char *errors,
5192 int *byteorder,
5193 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005194{
5195 const char *starts = s;
5196 Py_ssize_t startinpos;
5197 Py_ssize_t endinpos;
5198 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005199 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005200 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005201 int bo = 0; /* assume native ordering by default */
5202 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005203 /* Offsets from q for retrieving bytes in the right order. */
5204#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5205 int iorder[] = {0, 1, 2, 3};
5206#else
5207 int iorder[] = {3, 2, 1, 0};
5208#endif
5209 PyObject *errorHandler = NULL;
5210 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005211
Walter Dörwald41980ca2007-08-16 21:55:45 +00005212 q = (unsigned char *)s;
5213 e = q + size;
5214
5215 if (byteorder)
5216 bo = *byteorder;
5217
5218 /* Check for BOM marks (U+FEFF) in the input and adjust current
5219 byte order setting accordingly. In native mode, the leading BOM
5220 mark is skipped, in all other modes, it is copied to the output
5221 stream as-is (giving a ZWNBSP character). */
5222 if (bo == 0) {
5223 if (size >= 4) {
5224 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00005225 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005226#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005227 if (bom == 0x0000FEFF) {
5228 q += 4;
5229 bo = -1;
5230 }
5231 else if (bom == 0xFFFE0000) {
5232 q += 4;
5233 bo = 1;
5234 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005235#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005236 if (bom == 0x0000FEFF) {
5237 q += 4;
5238 bo = 1;
5239 }
5240 else if (bom == 0xFFFE0000) {
5241 q += 4;
5242 bo = -1;
5243 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005244#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005245 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005246 }
5247
5248 if (bo == -1) {
5249 /* force LE */
5250 iorder[0] = 0;
5251 iorder[1] = 1;
5252 iorder[2] = 2;
5253 iorder[3] = 3;
5254 }
5255 else if (bo == 1) {
5256 /* force BE */
5257 iorder[0] = 3;
5258 iorder[1] = 2;
5259 iorder[2] = 1;
5260 iorder[3] = 0;
5261 }
5262
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005263 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005264 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005265 if (!unicode)
5266 return NULL;
5267 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005268 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005269 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005270
Walter Dörwald41980ca2007-08-16 21:55:45 +00005271 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005272 Py_UCS4 ch;
5273 /* remaining bytes at the end? (size should be divisible by 4) */
5274 if (e-q<4) {
5275 if (consumed)
5276 break;
5277 errmsg = "truncated data";
5278 startinpos = ((const char *)q)-starts;
5279 endinpos = ((const char *)e)-starts;
5280 goto utf32Error;
5281 /* The remaining input chars are ignored if the callback
5282 chooses to skip the input */
5283 }
5284 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5285 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005286
Benjamin Peterson29060642009-01-31 22:14:21 +00005287 if (ch >= 0x110000)
5288 {
5289 errmsg = "codepoint not in range(0x110000)";
5290 startinpos = ((const char *)q)-starts;
5291 endinpos = startinpos+4;
5292 goto utf32Error;
5293 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005294 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5295 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005296 q += 4;
5297 continue;
5298 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005299 if (unicode_decode_call_errorhandler(
5300 errors, &errorHandler,
5301 "utf32", errmsg,
5302 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005303 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005304 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005305 }
5306
5307 if (byteorder)
5308 *byteorder = bo;
5309
5310 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005311 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005312
5313 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005314 if (unicode_resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005315 goto onError;
5316
5317 Py_XDECREF(errorHandler);
5318 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005319 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005320
Benjamin Peterson29060642009-01-31 22:14:21 +00005321 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005322 Py_DECREF(unicode);
5323 Py_XDECREF(errorHandler);
5324 Py_XDECREF(exc);
5325 return NULL;
5326}
5327
5328PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005329_PyUnicode_EncodeUTF32(PyObject *str,
5330 const char *errors,
5331 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005332{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005333 int kind;
5334 void *data;
5335 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005336 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005337 unsigned char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005338 Py_ssize_t nsize, bytesize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005339 /* Offsets from p for storing byte pairs in the right order. */
5340#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5341 int iorder[] = {0, 1, 2, 3};
5342#else
5343 int iorder[] = {3, 2, 1, 0};
5344#endif
5345
Benjamin Peterson29060642009-01-31 22:14:21 +00005346#define STORECHAR(CH) \
5347 do { \
5348 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5349 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5350 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5351 p[iorder[0]] = (CH) & 0xff; \
5352 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005353 } while(0)
5354
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005355 if (!PyUnicode_Check(str)) {
5356 PyErr_BadArgument();
5357 return NULL;
5358 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005359 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005360 return NULL;
5361 kind = PyUnicode_KIND(str);
5362 data = PyUnicode_DATA(str);
5363 len = PyUnicode_GET_LENGTH(str);
5364
5365 nsize = len + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005366 bytesize = nsize * 4;
5367 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005368 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005369 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005370 if (v == NULL)
5371 return NULL;
5372
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005373 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005374 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005375 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005376 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005377 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005378
5379 if (byteorder == -1) {
5380 /* force LE */
5381 iorder[0] = 0;
5382 iorder[1] = 1;
5383 iorder[2] = 2;
5384 iorder[3] = 3;
5385 }
5386 else if (byteorder == 1) {
5387 /* force BE */
5388 iorder[0] = 3;
5389 iorder[1] = 2;
5390 iorder[2] = 1;
5391 iorder[3] = 0;
5392 }
5393
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005394 for (i = 0; i < len; i++)
5395 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005396
5397 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005398 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005399#undef STORECHAR
5400}
5401
Alexander Belopolsky40018472011-02-26 01:02:56 +00005402PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005403PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5404 Py_ssize_t size,
5405 const char *errors,
5406 int byteorder)
5407{
5408 PyObject *result;
5409 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5410 if (tmp == NULL)
5411 return NULL;
5412 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5413 Py_DECREF(tmp);
5414 return result;
5415}
5416
5417PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005418PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005419{
Victor Stinnerb960b342011-11-20 19:12:52 +01005420 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005421}
5422
Guido van Rossumd57fd912000-03-10 22:53:23 +00005423/* --- UTF-16 Codec ------------------------------------------------------- */
5424
Tim Peters772747b2001-08-09 22:21:55 +00005425PyObject *
5426PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005427 Py_ssize_t size,
5428 const char *errors,
5429 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005430{
Walter Dörwald69652032004-09-07 20:24:22 +00005431 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5432}
5433
Antoine Pitrouab868312009-01-10 15:40:25 +00005434/* Two masks for fast checking of whether a C 'long' may contain
5435 UTF16-encoded surrogate characters. This is an efficient heuristic,
5436 assuming that non-surrogate characters with a code point >= 0x8000 are
5437 rare in most input.
5438 FAST_CHAR_MASK is used when the input is in native byte ordering,
5439 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00005440*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005441#if (SIZEOF_LONG == 8)
5442# define FAST_CHAR_MASK 0x8000800080008000L
5443# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
Victor Stinnerafb52052012-04-05 22:54:49 +02005444# define STRIPPED_MASK 0x00FF00FF00FF00FFL
Antoine Pitrouab868312009-01-10 15:40:25 +00005445#elif (SIZEOF_LONG == 4)
5446# define FAST_CHAR_MASK 0x80008000L
5447# define SWAPPED_FAST_CHAR_MASK 0x00800080L
Victor Stinnerafb52052012-04-05 22:54:49 +02005448# define STRIPPED_MASK 0x00FF00FFL
Antoine Pitrouab868312009-01-10 15:40:25 +00005449#else
5450# error C 'long' size should be either 4 or 8!
5451#endif
5452
Walter Dörwald69652032004-09-07 20:24:22 +00005453PyObject *
5454PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005455 Py_ssize_t size,
5456 const char *errors,
5457 int *byteorder,
5458 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005459{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005460 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005461 Py_ssize_t startinpos;
5462 Py_ssize_t endinpos;
5463 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005464 PyObject *unicode;
Antoine Pitrouab868312009-01-10 15:40:25 +00005465 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005466 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005467 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005468 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005469 /* Offsets from q for retrieving byte pairs in the right order. */
5470#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5471 int ihi = 1, ilo = 0;
5472#else
5473 int ihi = 0, ilo = 1;
5474#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005475 PyObject *errorHandler = NULL;
5476 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005477
5478 /* Note: size will always be longer than the resulting Unicode
5479 character count */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005480 unicode = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005481 if (!unicode)
5482 return NULL;
5483 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005484 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005485 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005486
Tim Peters772747b2001-08-09 22:21:55 +00005487 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005488 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005489
5490 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005491 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005492
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005493 /* Check for BOM marks (U+FEFF) in the input and adjust current
5494 byte order setting accordingly. In native mode, the leading BOM
5495 mark is skipped, in all other modes, it is copied to the output
5496 stream as-is (giving a ZWNBSP character). */
5497 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005498 if (size >= 2) {
Victor Stinner24729f32011-11-10 20:31:37 +01005499 const Py_UCS4 bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005500#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005501 if (bom == 0xFEFF) {
5502 q += 2;
5503 bo = -1;
5504 }
5505 else if (bom == 0xFFFE) {
5506 q += 2;
5507 bo = 1;
5508 }
Tim Petersced69f82003-09-16 20:30:58 +00005509#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005510 if (bom == 0xFEFF) {
5511 q += 2;
5512 bo = 1;
5513 }
5514 else if (bom == 0xFFFE) {
5515 q += 2;
5516 bo = -1;
5517 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005518#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005519 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005520 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005521
Tim Peters772747b2001-08-09 22:21:55 +00005522 if (bo == -1) {
5523 /* force LE */
5524 ihi = 1;
5525 ilo = 0;
5526 }
5527 else if (bo == 1) {
5528 /* force BE */
5529 ihi = 0;
5530 ilo = 1;
5531 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005532#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5533 native_ordering = ilo < ihi;
5534#else
5535 native_ordering = ilo > ihi;
5536#endif
Tim Peters772747b2001-08-09 22:21:55 +00005537
Antoine Pitrouab868312009-01-10 15:40:25 +00005538 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005539 while (q < e) {
Victor Stinner24729f32011-11-10 20:31:37 +01005540 Py_UCS4 ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005541 /* First check for possible aligned read of a C 'long'. Unaligned
5542 reads are more expensive, better to defer to another iteration. */
5543 if (!((size_t) q & LONG_PTR_MASK)) {
5544 /* Fast path for runs of non-surrogate chars. */
5545 register const unsigned char *_q = q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005546 int kind = PyUnicode_KIND(unicode);
5547 void *data = PyUnicode_DATA(unicode);
5548 while (_q < aligned_end) {
Victor Stinnerafb52052012-04-05 22:54:49 +02005549 unsigned long block = * (unsigned long *) _q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005550 Py_UCS4 maxch;
5551 if (native_ordering) {
5552 /* Can use buffer directly */
Victor Stinnerafb52052012-04-05 22:54:49 +02005553 if (block & FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005554 break;
Antoine Pitrouab868312009-01-10 15:40:25 +00005555 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005556 else {
5557 /* Need to byte-swap */
Victor Stinnerafb52052012-04-05 22:54:49 +02005558 if (block & SWAPPED_FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005559 break;
Victor Stinnerafb52052012-04-05 22:54:49 +02005560 block = ((block >> 8) & STRIPPED_MASK) |
5561 ((block & STRIPPED_MASK) << 8);
Antoine Pitrouab868312009-01-10 15:40:25 +00005562 }
Victor Stinnerafb52052012-04-05 22:54:49 +02005563 maxch = (Py_UCS2)(block & 0xFFFF);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005564#if SIZEOF_LONG == 8
Victor Stinnerafb52052012-04-05 22:54:49 +02005565 ch = (Py_UCS2)((block >> 16) & 0xFFFF);
5566 maxch = Py_MAX(maxch, ch);
5567 ch = (Py_UCS2)((block >> 32) & 0xFFFF);
5568 maxch = Py_MAX(maxch, ch);
5569 ch = (Py_UCS2)(block >> 48);
5570 maxch = Py_MAX(maxch, ch);
5571#else
5572 ch = (Py_UCS2)(block >> 16);
5573 maxch = Py_MAX(maxch, ch);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005574#endif
5575 if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
5576 if (unicode_widen(&unicode, maxch) < 0)
5577 goto onError;
5578 kind = PyUnicode_KIND(unicode);
5579 data = PyUnicode_DATA(unicode);
5580 }
Victor Stinnerafb52052012-04-05 22:54:49 +02005581#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5582 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block & 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005583#if SIZEOF_LONG == 8
Victor Stinnerafb52052012-04-05 22:54:49 +02005584 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 16) & 0xFFFF));
5585 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 32) & 0xFFFF));
5586 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 48)));
5587#else
5588 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block >> 16));
5589#endif
5590#else
5591#if SIZEOF_LONG == 8
5592 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 48)));
5593 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 32) & 0xFFFF));
5594 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 16) & 0xFFFF));
5595#else
5596 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block >> 16));
5597#endif
5598 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block & 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005599#endif
5600 _q += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00005601 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005602 q = _q;
5603 if (q >= e)
5604 break;
5605 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005606 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005607
Benjamin Peterson14339b62009-01-31 16:36:08 +00005608 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005609
Victor Stinner551ac952011-11-29 22:58:13 +01005610 if (!Py_UNICODE_IS_SURROGATE(ch)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005611 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5612 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005613 continue;
5614 }
5615
5616 /* UTF-16 code pair: */
5617 if (q > e) {
5618 errmsg = "unexpected end of data";
5619 startinpos = (((const char *)q) - 2) - starts;
5620 endinpos = ((const char *)e) + 1 - starts;
5621 goto utf16Error;
5622 }
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005623 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
5624 Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
Benjamin Peterson29060642009-01-31 22:14:21 +00005625 q += 2;
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005626 if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
Victor Stinner62aa4d02011-11-09 00:03:45 +01005627 if (unicode_putchar(&unicode, &outpos,
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005628 Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005629 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005630 continue;
5631 }
5632 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005633 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005634 startinpos = (((const char *)q)-4)-starts;
5635 endinpos = startinpos+2;
5636 goto utf16Error;
5637 }
5638
Benjamin Peterson14339b62009-01-31 16:36:08 +00005639 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005640 errmsg = "illegal encoding";
5641 startinpos = (((const char *)q)-2)-starts;
5642 endinpos = startinpos+2;
5643 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005644
Benjamin Peterson29060642009-01-31 22:14:21 +00005645 utf16Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005646 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005647 errors,
5648 &errorHandler,
5649 "utf16", errmsg,
5650 &starts,
5651 (const char **)&e,
5652 &startinpos,
5653 &endinpos,
5654 &exc,
5655 (const char **)&q,
5656 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005657 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005658 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005659 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005660 /* remaining byte at the end? (size should be even) */
5661 if (e == q) {
5662 if (!consumed) {
5663 errmsg = "truncated data";
5664 startinpos = ((const char *)q) - starts;
5665 endinpos = ((const char *)e) + 1 - starts;
Antoine Pitrouab868312009-01-10 15:40:25 +00005666 if (unicode_decode_call_errorhandler(
5667 errors,
5668 &errorHandler,
5669 "utf16", errmsg,
5670 &starts,
5671 (const char **)&e,
5672 &startinpos,
5673 &endinpos,
5674 &exc,
5675 (const char **)&q,
5676 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005677 &outpos))
Antoine Pitrouab868312009-01-10 15:40:25 +00005678 goto onError;
5679 /* The remaining input chars are ignored if the callback
5680 chooses to skip the input */
5681 }
5682 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005683
5684 if (byteorder)
5685 *byteorder = bo;
5686
Walter Dörwald69652032004-09-07 20:24:22 +00005687 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005688 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005689
Guido van Rossumd57fd912000-03-10 22:53:23 +00005690 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005691 if (unicode_resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005692 goto onError;
5693
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005694 Py_XDECREF(errorHandler);
5695 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005696 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005697
Benjamin Peterson29060642009-01-31 22:14:21 +00005698 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005700 Py_XDECREF(errorHandler);
5701 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005702 return NULL;
5703}
5704
Antoine Pitrouab868312009-01-10 15:40:25 +00005705#undef FAST_CHAR_MASK
5706#undef SWAPPED_FAST_CHAR_MASK
5707
Tim Peters772747b2001-08-09 22:21:55 +00005708PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005709_PyUnicode_EncodeUTF16(PyObject *str,
5710 const char *errors,
5711 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005712{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005713 int kind;
5714 void *data;
5715 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005716 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005717 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005718 Py_ssize_t nsize, bytesize;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005719 Py_ssize_t i, pairs;
Tim Peters772747b2001-08-09 22:21:55 +00005720 /* Offsets from p for storing byte pairs in the right order. */
5721#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5722 int ihi = 1, ilo = 0;
5723#else
5724 int ihi = 0, ilo = 1;
5725#endif
5726
Benjamin Peterson29060642009-01-31 22:14:21 +00005727#define STORECHAR(CH) \
5728 do { \
5729 p[ihi] = ((CH) >> 8) & 0xff; \
5730 p[ilo] = (CH) & 0xff; \
5731 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005732 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005733
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005734 if (!PyUnicode_Check(str)) {
5735 PyErr_BadArgument();
5736 return NULL;
5737 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005738 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005739 return NULL;
5740 kind = PyUnicode_KIND(str);
5741 data = PyUnicode_DATA(str);
5742 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005743
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005744 pairs = 0;
5745 if (kind == PyUnicode_4BYTE_KIND)
5746 for (i = 0; i < len; i++)
5747 if (PyUnicode_READ(kind, data, i) >= 0x10000)
5748 pairs++;
5749 /* 2 * (len + pairs + (byteorder == 0)) */
5750 if (len > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005751 return PyErr_NoMemory();
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005752 nsize = len + pairs + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005753 bytesize = nsize * 2;
5754 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005755 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005756 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005757 if (v == NULL)
5758 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005759
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005760 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005761 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005762 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005763 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005764 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005765
5766 if (byteorder == -1) {
5767 /* force LE */
5768 ihi = 1;
5769 ilo = 0;
5770 }
5771 else if (byteorder == 1) {
5772 /* force BE */
5773 ihi = 0;
5774 ilo = 1;
5775 }
5776
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005777 for (i = 0; i < len; i++) {
5778 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5779 Py_UCS4 ch2 = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +00005780 if (ch >= 0x10000) {
Victor Stinner551ac952011-11-29 22:58:13 +01005781 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
5782 ch = Py_UNICODE_HIGH_SURROGATE(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00005783 }
Tim Peters772747b2001-08-09 22:21:55 +00005784 STORECHAR(ch);
5785 if (ch2)
5786 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005787 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005788
5789 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005790 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005791#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005792}
5793
Alexander Belopolsky40018472011-02-26 01:02:56 +00005794PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005795PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5796 Py_ssize_t size,
5797 const char *errors,
5798 int byteorder)
5799{
5800 PyObject *result;
5801 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5802 if (tmp == NULL)
5803 return NULL;
5804 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5805 Py_DECREF(tmp);
5806 return result;
5807}
5808
5809PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005810PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005811{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005812 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005813}
5814
5815/* --- Unicode Escape Codec ----------------------------------------------- */
5816
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005817/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5818 if all the escapes in the string make it still a valid ASCII string.
5819 Returns -1 if any escapes were found which cause the string to
5820 pop out of ASCII range. Otherwise returns the length of the
5821 required buffer to hold the string.
5822 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005823static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005824length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5825{
5826 const unsigned char *p = (const unsigned char *)s;
5827 const unsigned char *end = p + size;
5828 Py_ssize_t length = 0;
5829
5830 if (size < 0)
5831 return -1;
5832
5833 for (; p < end; ++p) {
5834 if (*p > 127) {
5835 /* Non-ASCII */
5836 return -1;
5837 }
5838 else if (*p != '\\') {
5839 /* Normal character */
5840 ++length;
5841 }
5842 else {
5843 /* Backslash-escape, check next char */
5844 ++p;
5845 /* Escape sequence reaches till end of string or
5846 non-ASCII follow-up. */
5847 if (p >= end || *p > 127)
5848 return -1;
5849 switch (*p) {
5850 case '\n':
5851 /* backslash + \n result in zero characters */
5852 break;
5853 case '\\': case '\'': case '\"':
5854 case 'b': case 'f': case 't':
5855 case 'n': case 'r': case 'v': case 'a':
5856 ++length;
5857 break;
5858 case '0': case '1': case '2': case '3':
5859 case '4': case '5': case '6': case '7':
5860 case 'x': case 'u': case 'U': case 'N':
5861 /* these do not guarantee ASCII characters */
5862 return -1;
5863 default:
5864 /* count the backslash + the other character */
5865 length += 2;
5866 }
5867 }
5868 }
5869 return length;
5870}
5871
Fredrik Lundh06d12682001-01-24 07:59:11 +00005872static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005873
Alexander Belopolsky40018472011-02-26 01:02:56 +00005874PyObject *
5875PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005876 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005877 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005878{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005879 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005880 Py_ssize_t startinpos;
5881 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005882 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005883 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005884 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005885 char* message;
5886 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005887 PyObject *errorHandler = NULL;
5888 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005889 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005890 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005891
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005892 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005893
5894 /* After length_of_escaped_ascii_string() there are two alternatives,
5895 either the string is pure ASCII with named escapes like \n, etc.
5896 and we determined it's exact size (common case)
5897 or it contains \x, \u, ... escape sequences. then we create a
5898 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005899 if (len >= 0) {
5900 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005901 if (!v)
5902 goto onError;
5903 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005904 }
5905 else {
5906 /* Escaped strings will always be longer than the resulting
5907 Unicode string, so we start with size here and then reduce the
5908 length after conversion to the true value.
5909 (but if the error callback returns a long replacement string
5910 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005911 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005912 if (!v)
5913 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005914 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005915 }
5916
Guido van Rossumd57fd912000-03-10 22:53:23 +00005917 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005918 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005919 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005920 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005921
Guido van Rossumd57fd912000-03-10 22:53:23 +00005922 while (s < end) {
5923 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005924 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005925 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005926
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005927 /* The only case in which i == ascii_length is a backslash
5928 followed by a newline. */
5929 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005930
Guido van Rossumd57fd912000-03-10 22:53:23 +00005931 /* Non-escape characters are interpreted as Unicode ordinals */
5932 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005933 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5934 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005935 continue;
5936 }
5937
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005938 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005939 /* \ - Escapes */
5940 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005941 c = *s++;
5942 if (s > end)
5943 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005944
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005945 /* The only case in which i == ascii_length is a backslash
5946 followed by a newline. */
5947 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005948
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005949 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005950
Benjamin Peterson29060642009-01-31 22:14:21 +00005951 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005952#define WRITECHAR(ch) \
5953 do { \
5954 if (unicode_putchar(&v, &i, ch) < 0) \
5955 goto onError; \
5956 }while(0)
5957
Guido van Rossumd57fd912000-03-10 22:53:23 +00005958 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005959 case '\\': WRITECHAR('\\'); break;
5960 case '\'': WRITECHAR('\''); break;
5961 case '\"': WRITECHAR('\"'); break;
5962 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005963 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005964 case 'f': WRITECHAR('\014'); break;
5965 case 't': WRITECHAR('\t'); break;
5966 case 'n': WRITECHAR('\n'); break;
5967 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005968 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005969 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005970 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005971 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005972
Benjamin Peterson29060642009-01-31 22:14:21 +00005973 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005974 case '0': case '1': case '2': case '3':
5975 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005976 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005977 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005978 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005979 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005980 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005981 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005982 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005983 break;
5984
Benjamin Peterson29060642009-01-31 22:14:21 +00005985 /* hex escapes */
5986 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005988 digits = 2;
5989 message = "truncated \\xXX escape";
5990 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005991
Benjamin Peterson29060642009-01-31 22:14:21 +00005992 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005993 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005994 digits = 4;
5995 message = "truncated \\uXXXX escape";
5996 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005997
Benjamin Peterson29060642009-01-31 22:14:21 +00005998 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005999 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006000 digits = 8;
6001 message = "truncated \\UXXXXXXXX escape";
6002 hexescape:
6003 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006004 if (s+digits>end) {
6005 endinpos = size;
6006 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006007 errors, &errorHandler,
6008 "unicodeescape", "end of string in escape sequence",
6009 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006010 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006011 goto onError;
6012 goto nextByte;
6013 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006014 for (j = 0; j < digits; ++j) {
6015 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00006016 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006017 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006018 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006019 errors, &errorHandler,
6020 "unicodeescape", message,
6021 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006022 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00006023 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006024 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006025 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00006026 }
6027 chr = (chr<<4) & ~0xF;
6028 if (c >= '0' && c <= '9')
6029 chr += c - '0';
6030 else if (c >= 'a' && c <= 'f')
6031 chr += 10 + c - 'a';
6032 else
6033 chr += 10 + c - 'A';
6034 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006035 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00006036 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006037 /* _decoding_error will have already written into the
6038 target buffer. */
6039 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006040 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00006041 /* when we get here, chr is a 32-bit unicode character */
Victor Stinner8faf8212011-12-08 22:14:11 +01006042 if (chr <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006043 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00006044 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006045 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006046 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006047 errors, &errorHandler,
6048 "unicodeescape", "illegal Unicode character",
6049 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006050 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00006051 goto onError;
6052 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006053 break;
6054
Benjamin Peterson29060642009-01-31 22:14:21 +00006055 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006056 case 'N':
6057 message = "malformed \\N character escape";
6058 if (ucnhash_CAPI == NULL) {
6059 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006060 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6061 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00006062 if (ucnhash_CAPI == NULL)
6063 goto ucnhashError;
6064 }
6065 if (*s == '{') {
6066 const char *start = s+1;
6067 /* look for the closing brace */
6068 while (*s != '}' && s < end)
6069 s++;
6070 if (s > start && s < end && *s == '}') {
6071 /* found a name. look it up in the unicode database */
6072 message = "unknown Unicode character name";
6073 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006074 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03006075 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00006076 goto store;
6077 }
6078 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006079 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006080 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006081 errors, &errorHandler,
6082 "unicodeescape", message,
6083 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006084 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00006085 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006086 break;
6087
6088 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00006089 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006090 message = "\\ at end of string";
6091 s--;
6092 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006093 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006094 errors, &errorHandler,
6095 "unicodeescape", message,
6096 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006097 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00006098 goto onError;
6099 }
6100 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006101 WRITECHAR('\\');
6102 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00006103 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006104 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006105 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006106 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006107 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006108 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006109#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006110
Victor Stinner16e6a802011-12-12 13:24:15 +01006111 if (unicode_resize(&v, i) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006112 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006113 Py_XDECREF(errorHandler);
6114 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006115 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00006116
Benjamin Peterson29060642009-01-31 22:14:21 +00006117 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00006118 PyErr_SetString(
6119 PyExc_UnicodeError,
6120 "\\N escapes not supported (can't load unicodedata module)"
6121 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006122 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006123 Py_XDECREF(errorHandler);
6124 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00006125 return NULL;
6126
Benjamin Peterson29060642009-01-31 22:14:21 +00006127 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006128 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006129 Py_XDECREF(errorHandler);
6130 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006131 return NULL;
6132}
6133
6134/* Return a Unicode-Escape string version of the Unicode object.
6135
6136 If quotes is true, the string is enclosed in u"" or u'' quotes as
6137 appropriate.
6138
6139*/
6140
Alexander Belopolsky40018472011-02-26 01:02:56 +00006141PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006142PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006143{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006144 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006145 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006146 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006147 int kind;
6148 void *data;
6149 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006150
Thomas Wouters89f507f2006-12-13 04:49:30 +00006151 /* Initial allocation is based on the longest-possible unichr
6152 escape.
6153
6154 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
6155 unichr, so in this case it's the longest unichr escape. In
6156 narrow (UTF-16) builds this is five chars per source unichr
6157 since there are two unichrs in the surrogate pair, so in narrow
6158 (UTF-16) builds it's not the longest unichr escape.
6159
6160 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
6161 so in the narrow (UTF-16) build case it's the longest unichr
6162 escape.
6163 */
6164
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006165 if (!PyUnicode_Check(unicode)) {
6166 PyErr_BadArgument();
6167 return NULL;
6168 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006169 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006170 return NULL;
6171 len = PyUnicode_GET_LENGTH(unicode);
6172 kind = PyUnicode_KIND(unicode);
6173 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06006174 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006175 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
6176 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
6177 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
6178 }
6179
6180 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006181 return PyBytes_FromStringAndSize(NULL, 0);
6182
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006183 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006184 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006185
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006186 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00006187 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006188 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00006189 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006190 if (repr == NULL)
6191 return NULL;
6192
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006193 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006194
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006195 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006196 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006197
Walter Dörwald79e913e2007-05-12 11:08:06 +00006198 /* Escape backslashes */
6199 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006200 *p++ = '\\';
6201 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00006202 continue;
Tim Petersced69f82003-09-16 20:30:58 +00006203 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006204
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006205 /* Map 21-bit characters to '\U00xxxxxx' */
6206 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006207 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006208 *p++ = '\\';
6209 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006210 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
6211 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
6212 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6213 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6214 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6215 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6216 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6217 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00006218 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006219 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006220
Guido van Rossumd57fd912000-03-10 22:53:23 +00006221 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006222 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006223 *p++ = '\\';
6224 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006225 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6226 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6227 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6228 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006229 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006230
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006231 /* Map special whitespace to '\t', \n', '\r' */
6232 else if (ch == '\t') {
6233 *p++ = '\\';
6234 *p++ = 't';
6235 }
6236 else if (ch == '\n') {
6237 *p++ = '\\';
6238 *p++ = 'n';
6239 }
6240 else if (ch == '\r') {
6241 *p++ = '\\';
6242 *p++ = 'r';
6243 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006244
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006245 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006246 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006247 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006248 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006249 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6250 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006251 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006252
Guido van Rossumd57fd912000-03-10 22:53:23 +00006253 /* Copy everything else as-is */
6254 else
6255 *p++ = (char) ch;
6256 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006257
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006258 assert(p - PyBytes_AS_STRING(repr) > 0);
6259 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6260 return NULL;
6261 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006262}
6263
Alexander Belopolsky40018472011-02-26 01:02:56 +00006264PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006265PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6266 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006267{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006268 PyObject *result;
6269 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6270 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006271 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006272 result = PyUnicode_AsUnicodeEscapeString(tmp);
6273 Py_DECREF(tmp);
6274 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006275}
6276
6277/* --- Raw Unicode Escape Codec ------------------------------------------- */
6278
Alexander Belopolsky40018472011-02-26 01:02:56 +00006279PyObject *
6280PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006281 Py_ssize_t size,
6282 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006283{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006284 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006285 Py_ssize_t startinpos;
6286 Py_ssize_t endinpos;
6287 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006288 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006289 const char *end;
6290 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006291 PyObject *errorHandler = NULL;
6292 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006293
Guido van Rossumd57fd912000-03-10 22:53:23 +00006294 /* Escaped strings will always be longer than the resulting
6295 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006296 length after conversion to the true value. (But decoding error
6297 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006298 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006299 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006300 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006301 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006302 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006303 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006304 end = s + size;
6305 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006306 unsigned char c;
6307 Py_UCS4 x;
6308 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006309 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006310
Benjamin Peterson29060642009-01-31 22:14:21 +00006311 /* Non-escape characters are interpreted as Unicode ordinals */
6312 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006313 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6314 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006315 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006316 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006317 startinpos = s-starts;
6318
6319 /* \u-escapes are only interpreted iff the number of leading
6320 backslashes if odd */
6321 bs = s;
6322 for (;s < end;) {
6323 if (*s != '\\')
6324 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006325 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6326 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006327 }
6328 if (((s - bs) & 1) == 0 ||
6329 s >= end ||
6330 (*s != 'u' && *s != 'U')) {
6331 continue;
6332 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006333 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006334 count = *s=='u' ? 4 : 8;
6335 s++;
6336
6337 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006338 for (x = 0, i = 0; i < count; ++i, ++s) {
6339 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006340 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006341 endinpos = s-starts;
6342 if (unicode_decode_call_errorhandler(
6343 errors, &errorHandler,
6344 "rawunicodeescape", "truncated \\uXXXX",
6345 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006346 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006347 goto onError;
6348 goto nextByte;
6349 }
6350 x = (x<<4) & ~0xF;
6351 if (c >= '0' && c <= '9')
6352 x += c - '0';
6353 else if (c >= 'a' && c <= 'f')
6354 x += 10 + c - 'a';
6355 else
6356 x += 10 + c - 'A';
6357 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006358 if (x <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006359 if (unicode_putchar(&v, &outpos, x) < 0)
6360 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006361 } else {
6362 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006363 if (unicode_decode_call_errorhandler(
6364 errors, &errorHandler,
6365 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006366 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006367 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006368 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006369 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006370 nextByte:
6371 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006372 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006373 if (unicode_resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006374 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006375 Py_XDECREF(errorHandler);
6376 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006377 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00006378
Benjamin Peterson29060642009-01-31 22:14:21 +00006379 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006380 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006381 Py_XDECREF(errorHandler);
6382 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006383 return NULL;
6384}
6385
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006386
Alexander Belopolsky40018472011-02-26 01:02:56 +00006387PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006388PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006389{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006390 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006391 char *p;
6392 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006393 Py_ssize_t expandsize, pos;
6394 int kind;
6395 void *data;
6396 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006397
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006398 if (!PyUnicode_Check(unicode)) {
6399 PyErr_BadArgument();
6400 return NULL;
6401 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006402 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006403 return NULL;
6404 kind = PyUnicode_KIND(unicode);
6405 data = PyUnicode_DATA(unicode);
6406 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006407 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6408 bytes, and 1 byte characters 4. */
6409 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006410
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006411 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006412 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006413
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006414 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006415 if (repr == NULL)
6416 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006417 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006418 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006419
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006420 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006421 for (pos = 0; pos < len; pos++) {
6422 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006423 /* Map 32-bit characters to '\Uxxxxxxxx' */
6424 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006425 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006426 *p++ = '\\';
6427 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006428 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6429 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6430 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6431 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6432 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6433 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6434 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6435 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006436 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006437 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006438 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006439 *p++ = '\\';
6440 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006441 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6442 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6443 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6444 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006445 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006446 /* Copy everything else as-is */
6447 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006448 *p++ = (char) ch;
6449 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006450
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006451 assert(p > q);
6452 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006453 return NULL;
6454 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006455}
6456
Alexander Belopolsky40018472011-02-26 01:02:56 +00006457PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006458PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6459 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006460{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006461 PyObject *result;
6462 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6463 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006464 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006465 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6466 Py_DECREF(tmp);
6467 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006468}
6469
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006470/* --- Unicode Internal Codec ------------------------------------------- */
6471
Alexander Belopolsky40018472011-02-26 01:02:56 +00006472PyObject *
6473_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006474 Py_ssize_t size,
6475 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006476{
6477 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006478 Py_ssize_t startinpos;
6479 Py_ssize_t endinpos;
6480 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006481 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006482 const char *end;
6483 const char *reason;
6484 PyObject *errorHandler = NULL;
6485 PyObject *exc = NULL;
6486
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006487 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006488 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006489 1))
6490 return NULL;
6491
Thomas Wouters89f507f2006-12-13 04:49:30 +00006492 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006493 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006494 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006495 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006496 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006497 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006498 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006499 end = s + size;
6500
6501 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006502 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006503 Py_UCS4 ch;
6504 /* We copy the raw representation one byte at a time because the
6505 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006506 ((char *) &uch)[0] = s[0];
6507 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006508#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006509 ((char *) &uch)[2] = s[2];
6510 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006511#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006512 ch = uch;
6513
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006514 /* We have to sanity check the raw data, otherwise doom looms for
6515 some malformed UCS-4 data. */
6516 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006517#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006518 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006519#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006520 end-s < Py_UNICODE_SIZE
6521 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006522 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006523 startinpos = s - starts;
6524 if (end-s < Py_UNICODE_SIZE) {
6525 endinpos = end-starts;
6526 reason = "truncated input";
6527 }
6528 else {
6529 endinpos = s - starts + Py_UNICODE_SIZE;
6530 reason = "illegal code point (> 0x10FFFF)";
6531 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006532 if (unicode_decode_call_errorhandler(
6533 errors, &errorHandler,
6534 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006535 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006536 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006537 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006538 continue;
6539 }
6540
6541 s += Py_UNICODE_SIZE;
6542#ifndef Py_UNICODE_WIDE
Victor Stinner551ac952011-11-29 22:58:13 +01006543 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006544 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006545 Py_UNICODE uch2;
6546 ((char *) &uch2)[0] = s[0];
6547 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006548 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006549 {
Victor Stinner551ac952011-11-29 22:58:13 +01006550 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006551 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006552 }
6553 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006554#endif
6555
6556 if (unicode_putchar(&v, &outpos, ch) < 0)
6557 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006558 }
6559
Victor Stinner16e6a802011-12-12 13:24:15 +01006560 if (unicode_resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006561 goto onError;
6562 Py_XDECREF(errorHandler);
6563 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006564 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006565
Benjamin Peterson29060642009-01-31 22:14:21 +00006566 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006567 Py_XDECREF(v);
6568 Py_XDECREF(errorHandler);
6569 Py_XDECREF(exc);
6570 return NULL;
6571}
6572
Guido van Rossumd57fd912000-03-10 22:53:23 +00006573/* --- Latin-1 Codec ------------------------------------------------------ */
6574
Alexander Belopolsky40018472011-02-26 01:02:56 +00006575PyObject *
6576PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006577 Py_ssize_t size,
6578 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006579{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006580 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006581 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006582}
6583
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006584/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006585static void
6586make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006587 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006588 PyObject *unicode,
6589 Py_ssize_t startpos, Py_ssize_t endpos,
6590 const char *reason)
6591{
6592 if (*exceptionObject == NULL) {
6593 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006594 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006595 encoding, unicode, startpos, endpos, reason);
6596 }
6597 else {
6598 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6599 goto onError;
6600 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6601 goto onError;
6602 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6603 goto onError;
6604 return;
6605 onError:
6606 Py_DECREF(*exceptionObject);
6607 *exceptionObject = NULL;
6608 }
6609}
6610
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006611/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006612static void
6613raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006614 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006615 PyObject *unicode,
6616 Py_ssize_t startpos, Py_ssize_t endpos,
6617 const char *reason)
6618{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006619 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006620 encoding, unicode, startpos, endpos, reason);
6621 if (*exceptionObject != NULL)
6622 PyCodec_StrictErrors(*exceptionObject);
6623}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006624
6625/* error handling callback helper:
6626 build arguments, call the callback and check the arguments,
6627 put the result into newpos and return the replacement string, which
6628 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006629static PyObject *
6630unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006631 PyObject **errorHandler,
6632 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006633 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006634 Py_ssize_t startpos, Py_ssize_t endpos,
6635 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006636{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006637 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006638 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006639 PyObject *restuple;
6640 PyObject *resunicode;
6641
6642 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006643 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006644 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006645 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006646 }
6647
Benjamin Petersonbac79492012-01-14 13:34:47 -05006648 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006649 return NULL;
6650 len = PyUnicode_GET_LENGTH(unicode);
6651
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006652 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006653 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006654 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006655 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006656
6657 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006658 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006659 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006660 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006661 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006662 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006663 Py_DECREF(restuple);
6664 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006665 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006666 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006667 &resunicode, newpos)) {
6668 Py_DECREF(restuple);
6669 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006670 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006671 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6672 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6673 Py_DECREF(restuple);
6674 return NULL;
6675 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006676 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006677 *newpos = len + *newpos;
6678 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006679 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6680 Py_DECREF(restuple);
6681 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006682 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006683 Py_INCREF(resunicode);
6684 Py_DECREF(restuple);
6685 return resunicode;
6686}
6687
Alexander Belopolsky40018472011-02-26 01:02:56 +00006688static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006689unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006690 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006691 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006692{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006693 /* input state */
6694 Py_ssize_t pos=0, size;
6695 int kind;
6696 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006697 /* output object */
6698 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006699 /* pointer into the output */
6700 char *str;
6701 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006702 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006703 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6704 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006705 PyObject *errorHandler = NULL;
6706 PyObject *exc = NULL;
6707 /* the following variable is used for caching string comparisons
6708 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6709 int known_errorHandler = -1;
6710
Benjamin Petersonbac79492012-01-14 13:34:47 -05006711 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006712 return NULL;
6713 size = PyUnicode_GET_LENGTH(unicode);
6714 kind = PyUnicode_KIND(unicode);
6715 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006716 /* allocate enough for a simple encoding without
6717 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006718 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006719 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006720 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006721 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006722 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006723 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006724 ressize = size;
6725
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006726 while (pos < size) {
6727 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006728
Benjamin Peterson29060642009-01-31 22:14:21 +00006729 /* can we encode this? */
6730 if (c<limit) {
6731 /* no overflow check, because we know that the space is enough */
6732 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006733 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006734 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006735 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006736 Py_ssize_t requiredsize;
6737 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006738 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006739 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006740 Py_ssize_t collstart = pos;
6741 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006742 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006743 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006744 ++collend;
6745 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6746 if (known_errorHandler==-1) {
6747 if ((errors==NULL) || (!strcmp(errors, "strict")))
6748 known_errorHandler = 1;
6749 else if (!strcmp(errors, "replace"))
6750 known_errorHandler = 2;
6751 else if (!strcmp(errors, "ignore"))
6752 known_errorHandler = 3;
6753 else if (!strcmp(errors, "xmlcharrefreplace"))
6754 known_errorHandler = 4;
6755 else
6756 known_errorHandler = 0;
6757 }
6758 switch (known_errorHandler) {
6759 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006760 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006761 goto onError;
6762 case 2: /* replace */
6763 while (collstart++<collend)
6764 *str++ = '?'; /* fall through */
6765 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006766 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006767 break;
6768 case 4: /* xmlcharrefreplace */
6769 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006770 /* determine replacement size */
6771 for (i = collstart, repsize = 0; i < collend; ++i) {
6772 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6773 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006774 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006775 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006776 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006777 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006778 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006779 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006780 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006781 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006782 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006783 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006784 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006785 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006786 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006787 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006788 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006789 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006790 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006791 if (requiredsize > ressize) {
6792 if (requiredsize<2*ressize)
6793 requiredsize = 2*ressize;
6794 if (_PyBytes_Resize(&res, requiredsize))
6795 goto onError;
6796 str = PyBytes_AS_STRING(res) + respos;
6797 ressize = requiredsize;
6798 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006799 /* generate replacement */
6800 for (i = collstart; i < collend; ++i) {
6801 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006802 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006803 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006804 break;
6805 default:
6806 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006807 encoding, reason, unicode, &exc,
6808 collstart, collend, &newpos);
6809 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006810 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006811 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006812 if (PyBytes_Check(repunicode)) {
6813 /* Directly copy bytes result to output. */
6814 repsize = PyBytes_Size(repunicode);
6815 if (repsize > 1) {
6816 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006817 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006818 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6819 Py_DECREF(repunicode);
6820 goto onError;
6821 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006822 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006823 ressize += repsize-1;
6824 }
6825 memcpy(str, PyBytes_AsString(repunicode), repsize);
6826 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006827 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006828 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006829 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006830 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006831 /* need more space? (at least enough for what we
6832 have+the replacement+the rest of the string, so
6833 we won't have to check space for encodable characters) */
6834 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006835 repsize = PyUnicode_GET_LENGTH(repunicode);
6836 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006837 if (requiredsize > ressize) {
6838 if (requiredsize<2*ressize)
6839 requiredsize = 2*ressize;
6840 if (_PyBytes_Resize(&res, requiredsize)) {
6841 Py_DECREF(repunicode);
6842 goto onError;
6843 }
6844 str = PyBytes_AS_STRING(res) + respos;
6845 ressize = requiredsize;
6846 }
6847 /* check if there is anything unencodable in the replacement
6848 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006849 for (i = 0; repsize-->0; ++i, ++str) {
6850 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006851 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006852 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006853 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006854 Py_DECREF(repunicode);
6855 goto onError;
6856 }
6857 *str = (char)c;
6858 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006859 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006860 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006861 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006862 }
6863 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006864 /* Resize if we allocated to much */
6865 size = str - PyBytes_AS_STRING(res);
6866 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006867 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006868 if (_PyBytes_Resize(&res, size) < 0)
6869 goto onError;
6870 }
6871
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006872 Py_XDECREF(errorHandler);
6873 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006874 return res;
6875
6876 onError:
6877 Py_XDECREF(res);
6878 Py_XDECREF(errorHandler);
6879 Py_XDECREF(exc);
6880 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006881}
6882
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006883/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006884PyObject *
6885PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006886 Py_ssize_t size,
6887 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006888{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006889 PyObject *result;
6890 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6891 if (unicode == NULL)
6892 return NULL;
6893 result = unicode_encode_ucs1(unicode, errors, 256);
6894 Py_DECREF(unicode);
6895 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006896}
6897
Alexander Belopolsky40018472011-02-26 01:02:56 +00006898PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006899_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006900{
6901 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006902 PyErr_BadArgument();
6903 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006904 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006905 if (PyUnicode_READY(unicode) == -1)
6906 return NULL;
6907 /* Fast path: if it is a one-byte string, construct
6908 bytes object directly. */
6909 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6910 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6911 PyUnicode_GET_LENGTH(unicode));
6912 /* Non-Latin-1 characters present. Defer to above function to
6913 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006914 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006915}
6916
6917PyObject*
6918PyUnicode_AsLatin1String(PyObject *unicode)
6919{
6920 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006921}
6922
6923/* --- 7-bit ASCII Codec -------------------------------------------------- */
6924
Alexander Belopolsky40018472011-02-26 01:02:56 +00006925PyObject *
6926PyUnicode_DecodeASCII(const char *s,
6927 Py_ssize_t size,
6928 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006929{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006930 const char *starts = s;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006931 PyObject *v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006932 int kind;
6933 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006934 Py_ssize_t startinpos;
6935 Py_ssize_t endinpos;
6936 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006937 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006938 int has_error;
6939 const unsigned char *p = (const unsigned char *)s;
6940 const unsigned char *end = p + size;
6941 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006942 PyObject *errorHandler = NULL;
6943 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006944
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006945 if (size == 0) {
6946 Py_INCREF(unicode_empty);
6947 return unicode_empty;
6948 }
6949
Guido van Rossumd57fd912000-03-10 22:53:23 +00006950 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006951 if (size == 1 && (unsigned char)s[0] < 128)
6952 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006953
Victor Stinner702c7342011-10-05 13:50:52 +02006954 has_error = 0;
6955 while (p < end && !has_error) {
6956 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6957 an explanation. */
6958 if (!((size_t) p & LONG_PTR_MASK)) {
6959 /* Help register allocation */
6960 register const unsigned char *_p = p;
6961 while (_p < aligned_end) {
6962 unsigned long value = *(unsigned long *) _p;
6963 if (value & ASCII_CHAR_MASK) {
6964 has_error = 1;
6965 break;
6966 }
6967 _p += SIZEOF_LONG;
6968 }
6969 if (_p == end)
6970 break;
6971 if (has_error)
6972 break;
6973 p = _p;
6974 }
6975 if (*p & 0x80) {
6976 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006977 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006978 }
6979 else {
6980 ++p;
6981 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006982 }
Victor Stinner702c7342011-10-05 13:50:52 +02006983 if (!has_error)
6984 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006985
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006986 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006987 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006988 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006989 kind = PyUnicode_KIND(v);
6990 data = PyUnicode_DATA(v);
6991 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006992 e = s + size;
6993 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006994 register unsigned char c = (unsigned char)*s;
6995 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006996 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006997 ++s;
6998 }
6999 else {
7000 startinpos = s-starts;
7001 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00007002 if (unicode_decode_call_errorhandler(
7003 errors, &errorHandler,
7004 "ascii", "ordinal not in range(128)",
7005 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007006 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00007007 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007008 kind = PyUnicode_KIND(v);
7009 data = PyUnicode_DATA(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00007010 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007011 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007012 if (unicode_resize(&v, outpos) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007013 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007014 Py_XDECREF(errorHandler);
7015 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007016 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01007017 return v;
Tim Petersced69f82003-09-16 20:30:58 +00007018
Benjamin Peterson29060642009-01-31 22:14:21 +00007019 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007020 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007021 Py_XDECREF(errorHandler);
7022 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007023 return NULL;
7024}
7025
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007026/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007027PyObject *
7028PyUnicode_EncodeASCII(const Py_UNICODE *p,
7029 Py_ssize_t size,
7030 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007031{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007032 PyObject *result;
7033 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7034 if (unicode == NULL)
7035 return NULL;
7036 result = unicode_encode_ucs1(unicode, errors, 128);
7037 Py_DECREF(unicode);
7038 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007039}
7040
Alexander Belopolsky40018472011-02-26 01:02:56 +00007041PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007042_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007043{
7044 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007045 PyErr_BadArgument();
7046 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007047 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007048 if (PyUnicode_READY(unicode) == -1)
7049 return NULL;
7050 /* Fast path: if it is an ASCII-only string, construct bytes object
7051 directly. Else defer to above function to raise the exception. */
7052 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
7053 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7054 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007055 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007056}
7057
7058PyObject *
7059PyUnicode_AsASCIIString(PyObject *unicode)
7060{
7061 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007062}
7063
Victor Stinner99b95382011-07-04 14:23:54 +02007064#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007065
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007066/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007067
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007068#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007069#define NEED_RETRY
7070#endif
7071
Victor Stinner3a50e702011-10-18 21:21:00 +02007072#ifndef WC_ERR_INVALID_CHARS
7073# define WC_ERR_INVALID_CHARS 0x0080
7074#endif
7075
7076static char*
7077code_page_name(UINT code_page, PyObject **obj)
7078{
7079 *obj = NULL;
7080 if (code_page == CP_ACP)
7081 return "mbcs";
7082 if (code_page == CP_UTF7)
7083 return "CP_UTF7";
7084 if (code_page == CP_UTF8)
7085 return "CP_UTF8";
7086
7087 *obj = PyBytes_FromFormat("cp%u", code_page);
7088 if (*obj == NULL)
7089 return NULL;
7090 return PyBytes_AS_STRING(*obj);
7091}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007092
Alexander Belopolsky40018472011-02-26 01:02:56 +00007093static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007094is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007095{
7096 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02007097 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007098
Victor Stinner3a50e702011-10-18 21:21:00 +02007099 if (!IsDBCSLeadByteEx(code_page, *curr))
7100 return 0;
7101
7102 prev = CharPrevExA(code_page, s, curr, 0);
7103 if (prev == curr)
7104 return 1;
7105 /* FIXME: This code is limited to "true" double-byte encodings,
7106 as it assumes an incomplete character consists of a single
7107 byte. */
7108 if (curr - prev == 2)
7109 return 1;
7110 if (!IsDBCSLeadByteEx(code_page, *prev))
7111 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007112 return 0;
7113}
7114
Victor Stinner3a50e702011-10-18 21:21:00 +02007115static DWORD
7116decode_code_page_flags(UINT code_page)
7117{
7118 if (code_page == CP_UTF7) {
7119 /* The CP_UTF7 decoder only supports flags=0 */
7120 return 0;
7121 }
7122 else
7123 return MB_ERR_INVALID_CHARS;
7124}
7125
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007126/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007127 * Decode a byte string from a Windows code page into unicode object in strict
7128 * mode.
7129 *
7130 * Returns consumed size if succeed, returns -2 on decode error, or raise a
7131 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007132 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007133static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007134decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007135 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007136 const char *in,
7137 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007138{
Victor Stinner3a50e702011-10-18 21:21:00 +02007139 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007140 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007141 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007142
7143 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007144 assert(insize > 0);
7145 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7146 if (outsize <= 0)
7147 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007148
7149 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007150 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007151 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007152 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007153 if (*v == NULL)
7154 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007155 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007156 }
7157 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007158 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007159 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007160 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007161 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007162 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007163 }
7164
7165 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007166 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7167 if (outsize <= 0)
7168 goto error;
7169 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007170
Victor Stinner3a50e702011-10-18 21:21:00 +02007171error:
7172 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7173 return -2;
7174 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007175 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007176}
7177
Victor Stinner3a50e702011-10-18 21:21:00 +02007178/*
7179 * Decode a byte string from a code page into unicode object with an error
7180 * handler.
7181 *
7182 * Returns consumed size if succeed, or raise a WindowsError or
7183 * UnicodeDecodeError exception and returns -1 on error.
7184 */
7185static int
7186decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007187 PyObject **v,
7188 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007189 const char *errors)
7190{
7191 const char *startin = in;
7192 const char *endin = in + size;
7193 const DWORD flags = decode_code_page_flags(code_page);
7194 /* Ideally, we should get reason from FormatMessage. This is the Windows
7195 2000 English version of the message. */
7196 const char *reason = "No mapping for the Unicode character exists "
7197 "in the target code page.";
7198 /* each step cannot decode more than 1 character, but a character can be
7199 represented as a surrogate pair */
7200 wchar_t buffer[2], *startout, *out;
7201 int insize, outsize;
7202 PyObject *errorHandler = NULL;
7203 PyObject *exc = NULL;
7204 PyObject *encoding_obj = NULL;
7205 char *encoding;
7206 DWORD err;
7207 int ret = -1;
7208
7209 assert(size > 0);
7210
7211 encoding = code_page_name(code_page, &encoding_obj);
7212 if (encoding == NULL)
7213 return -1;
7214
7215 if (errors == NULL || strcmp(errors, "strict") == 0) {
7216 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7217 UnicodeDecodeError. */
7218 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7219 if (exc != NULL) {
7220 PyCodec_StrictErrors(exc);
7221 Py_CLEAR(exc);
7222 }
7223 goto error;
7224 }
7225
7226 if (*v == NULL) {
7227 /* Create unicode object */
7228 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7229 PyErr_NoMemory();
7230 goto error;
7231 }
Victor Stinnerab595942011-12-17 04:59:06 +01007232 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007233 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007234 if (*v == NULL)
7235 goto error;
7236 startout = PyUnicode_AS_UNICODE(*v);
7237 }
7238 else {
7239 /* Extend unicode object */
7240 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7241 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7242 PyErr_NoMemory();
7243 goto error;
7244 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007245 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007246 goto error;
7247 startout = PyUnicode_AS_UNICODE(*v) + n;
7248 }
7249
7250 /* Decode the byte string character per character */
7251 out = startout;
7252 while (in < endin)
7253 {
7254 /* Decode a character */
7255 insize = 1;
7256 do
7257 {
7258 outsize = MultiByteToWideChar(code_page, flags,
7259 in, insize,
7260 buffer, Py_ARRAY_LENGTH(buffer));
7261 if (outsize > 0)
7262 break;
7263 err = GetLastError();
7264 if (err != ERROR_NO_UNICODE_TRANSLATION
7265 && err != ERROR_INSUFFICIENT_BUFFER)
7266 {
7267 PyErr_SetFromWindowsErr(0);
7268 goto error;
7269 }
7270 insize++;
7271 }
7272 /* 4=maximum length of a UTF-8 sequence */
7273 while (insize <= 4 && (in + insize) <= endin);
7274
7275 if (outsize <= 0) {
7276 Py_ssize_t startinpos, endinpos, outpos;
7277
7278 startinpos = in - startin;
7279 endinpos = startinpos + 1;
7280 outpos = out - PyUnicode_AS_UNICODE(*v);
7281 if (unicode_decode_call_errorhandler(
7282 errors, &errorHandler,
7283 encoding, reason,
7284 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007285 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007286 {
7287 goto error;
7288 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007289 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007290 }
7291 else {
7292 in += insize;
7293 memcpy(out, buffer, outsize * sizeof(wchar_t));
7294 out += outsize;
7295 }
7296 }
7297
7298 /* write a NUL character at the end */
7299 *out = 0;
7300
7301 /* Extend unicode object */
7302 outsize = out - startout;
7303 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007304 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007305 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01007306 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007307
7308error:
7309 Py_XDECREF(encoding_obj);
7310 Py_XDECREF(errorHandler);
7311 Py_XDECREF(exc);
7312 return ret;
7313}
7314
Victor Stinner3a50e702011-10-18 21:21:00 +02007315static PyObject *
7316decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007317 const char *s, Py_ssize_t size,
7318 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007319{
Victor Stinner76a31a62011-11-04 00:05:13 +01007320 PyObject *v = NULL;
7321 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007322
Victor Stinner3a50e702011-10-18 21:21:00 +02007323 if (code_page < 0) {
7324 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7325 return NULL;
7326 }
7327
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007328 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007329 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007330
Victor Stinner76a31a62011-11-04 00:05:13 +01007331 do
7332 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007333#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007334 if (size > INT_MAX) {
7335 chunk_size = INT_MAX;
7336 final = 0;
7337 done = 0;
7338 }
7339 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007340#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007341 {
7342 chunk_size = (int)size;
7343 final = (consumed == NULL);
7344 done = 1;
7345 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007346
Victor Stinner76a31a62011-11-04 00:05:13 +01007347 /* Skip trailing lead-byte unless 'final' is set */
7348 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7349 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007350
Victor Stinner76a31a62011-11-04 00:05:13 +01007351 if (chunk_size == 0 && done) {
7352 if (v != NULL)
7353 break;
7354 Py_INCREF(unicode_empty);
7355 return unicode_empty;
7356 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007357
Victor Stinner76a31a62011-11-04 00:05:13 +01007358
7359 converted = decode_code_page_strict(code_page, &v,
7360 s, chunk_size);
7361 if (converted == -2)
7362 converted = decode_code_page_errors(code_page, &v,
7363 s, chunk_size,
7364 errors);
7365 assert(converted != 0);
7366
7367 if (converted < 0) {
7368 Py_XDECREF(v);
7369 return NULL;
7370 }
7371
7372 if (consumed)
7373 *consumed += converted;
7374
7375 s += converted;
7376 size -= converted;
7377 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007378
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007379 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007380}
7381
Alexander Belopolsky40018472011-02-26 01:02:56 +00007382PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007383PyUnicode_DecodeCodePageStateful(int code_page,
7384 const char *s,
7385 Py_ssize_t size,
7386 const char *errors,
7387 Py_ssize_t *consumed)
7388{
7389 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7390}
7391
7392PyObject *
7393PyUnicode_DecodeMBCSStateful(const char *s,
7394 Py_ssize_t size,
7395 const char *errors,
7396 Py_ssize_t *consumed)
7397{
7398 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7399}
7400
7401PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007402PyUnicode_DecodeMBCS(const char *s,
7403 Py_ssize_t size,
7404 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007405{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007406 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7407}
7408
Victor Stinner3a50e702011-10-18 21:21:00 +02007409static DWORD
7410encode_code_page_flags(UINT code_page, const char *errors)
7411{
7412 if (code_page == CP_UTF8) {
7413 if (winver.dwMajorVersion >= 6)
7414 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7415 and later */
7416 return WC_ERR_INVALID_CHARS;
7417 else
7418 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7419 return 0;
7420 }
7421 else if (code_page == CP_UTF7) {
7422 /* CP_UTF7 only supports flags=0 */
7423 return 0;
7424 }
7425 else {
7426 if (errors != NULL && strcmp(errors, "replace") == 0)
7427 return 0;
7428 else
7429 return WC_NO_BEST_FIT_CHARS;
7430 }
7431}
7432
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007433/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007434 * Encode a Unicode string to a Windows code page into a byte string in strict
7435 * mode.
7436 *
7437 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7438 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007439 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007440static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007441encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007442 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007443 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007444{
Victor Stinner554f3f02010-06-16 23:33:54 +00007445 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007446 BOOL *pusedDefaultChar = &usedDefaultChar;
7447 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007448 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007449 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007450 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007451 const DWORD flags = encode_code_page_flags(code_page, NULL);
7452 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007453 /* Create a substring so that we can get the UTF-16 representation
7454 of just the slice under consideration. */
7455 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007456
Martin v. Löwis3d325192011-11-04 18:23:06 +01007457 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007458
Victor Stinner3a50e702011-10-18 21:21:00 +02007459 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007460 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007461 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007462 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007463
Victor Stinner2fc507f2011-11-04 20:06:39 +01007464 substring = PyUnicode_Substring(unicode, offset, offset+len);
7465 if (substring == NULL)
7466 return -1;
7467 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7468 if (p == NULL) {
7469 Py_DECREF(substring);
7470 return -1;
7471 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007472
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007473 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007474 outsize = WideCharToMultiByte(code_page, flags,
7475 p, size,
7476 NULL, 0,
7477 NULL, pusedDefaultChar);
7478 if (outsize <= 0)
7479 goto error;
7480 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007481 if (pusedDefaultChar && *pusedDefaultChar) {
7482 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007483 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007484 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007485
Victor Stinner3a50e702011-10-18 21:21:00 +02007486 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007487 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007488 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007489 if (*outbytes == NULL) {
7490 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007491 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007492 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007493 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007494 }
7495 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007496 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007497 const Py_ssize_t n = PyBytes_Size(*outbytes);
7498 if (outsize > PY_SSIZE_T_MAX - n) {
7499 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007500 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007501 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007502 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007503 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7504 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007505 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007506 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007507 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007508 }
7509
7510 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007511 outsize = WideCharToMultiByte(code_page, flags,
7512 p, size,
7513 out, outsize,
7514 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007515 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007516 if (outsize <= 0)
7517 goto error;
7518 if (pusedDefaultChar && *pusedDefaultChar)
7519 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007520 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007521
Victor Stinner3a50e702011-10-18 21:21:00 +02007522error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007523 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007524 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7525 return -2;
7526 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007527 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007528}
7529
Victor Stinner3a50e702011-10-18 21:21:00 +02007530/*
7531 * Encode a Unicode string to a Windows code page into a byte string using a
7532 * error handler.
7533 *
7534 * Returns consumed characters if succeed, or raise a WindowsError and returns
7535 * -1 on other error.
7536 */
7537static int
7538encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007539 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007540 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007541{
Victor Stinner3a50e702011-10-18 21:21:00 +02007542 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007543 Py_ssize_t pos = unicode_offset;
7544 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007545 /* Ideally, we should get reason from FormatMessage. This is the Windows
7546 2000 English version of the message. */
7547 const char *reason = "invalid character";
7548 /* 4=maximum length of a UTF-8 sequence */
7549 char buffer[4];
7550 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7551 Py_ssize_t outsize;
7552 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007553 PyObject *errorHandler = NULL;
7554 PyObject *exc = NULL;
7555 PyObject *encoding_obj = NULL;
7556 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007557 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007558 PyObject *rep;
7559 int ret = -1;
7560
7561 assert(insize > 0);
7562
7563 encoding = code_page_name(code_page, &encoding_obj);
7564 if (encoding == NULL)
7565 return -1;
7566
7567 if (errors == NULL || strcmp(errors, "strict") == 0) {
7568 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7569 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007570 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007571 if (exc != NULL) {
7572 PyCodec_StrictErrors(exc);
7573 Py_DECREF(exc);
7574 }
7575 Py_XDECREF(encoding_obj);
7576 return -1;
7577 }
7578
7579 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7580 pusedDefaultChar = &usedDefaultChar;
7581 else
7582 pusedDefaultChar = NULL;
7583
7584 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7585 PyErr_NoMemory();
7586 goto error;
7587 }
7588 outsize = insize * Py_ARRAY_LENGTH(buffer);
7589
7590 if (*outbytes == NULL) {
7591 /* Create string object */
7592 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7593 if (*outbytes == NULL)
7594 goto error;
7595 out = PyBytes_AS_STRING(*outbytes);
7596 }
7597 else {
7598 /* Extend string object */
7599 Py_ssize_t n = PyBytes_Size(*outbytes);
7600 if (n > PY_SSIZE_T_MAX - outsize) {
7601 PyErr_NoMemory();
7602 goto error;
7603 }
7604 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7605 goto error;
7606 out = PyBytes_AS_STRING(*outbytes) + n;
7607 }
7608
7609 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007610 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007611 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007612 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7613 wchar_t chars[2];
7614 int charsize;
7615 if (ch < 0x10000) {
7616 chars[0] = (wchar_t)ch;
7617 charsize = 1;
7618 }
7619 else {
7620 ch -= 0x10000;
7621 chars[0] = 0xd800 + (ch >> 10);
7622 chars[1] = 0xdc00 + (ch & 0x3ff);
7623 charsize = 2;
7624 }
7625
Victor Stinner3a50e702011-10-18 21:21:00 +02007626 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007627 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007628 buffer, Py_ARRAY_LENGTH(buffer),
7629 NULL, pusedDefaultChar);
7630 if (outsize > 0) {
7631 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7632 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007633 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007634 memcpy(out, buffer, outsize);
7635 out += outsize;
7636 continue;
7637 }
7638 }
7639 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7640 PyErr_SetFromWindowsErr(0);
7641 goto error;
7642 }
7643
Victor Stinner3a50e702011-10-18 21:21:00 +02007644 rep = unicode_encode_call_errorhandler(
7645 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007646 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007647 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007648 if (rep == NULL)
7649 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007650 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007651
7652 if (PyBytes_Check(rep)) {
7653 outsize = PyBytes_GET_SIZE(rep);
7654 if (outsize != 1) {
7655 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7656 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7657 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7658 Py_DECREF(rep);
7659 goto error;
7660 }
7661 out = PyBytes_AS_STRING(*outbytes) + offset;
7662 }
7663 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7664 out += outsize;
7665 }
7666 else {
7667 Py_ssize_t i;
7668 enum PyUnicode_Kind kind;
7669 void *data;
7670
Benjamin Petersonbac79492012-01-14 13:34:47 -05007671 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007672 Py_DECREF(rep);
7673 goto error;
7674 }
7675
7676 outsize = PyUnicode_GET_LENGTH(rep);
7677 if (outsize != 1) {
7678 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7679 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7680 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7681 Py_DECREF(rep);
7682 goto error;
7683 }
7684 out = PyBytes_AS_STRING(*outbytes) + offset;
7685 }
7686 kind = PyUnicode_KIND(rep);
7687 data = PyUnicode_DATA(rep);
7688 for (i=0; i < outsize; i++) {
7689 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7690 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007691 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007692 encoding, unicode,
7693 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007694 "unable to encode error handler result to ASCII");
7695 Py_DECREF(rep);
7696 goto error;
7697 }
7698 *out = (unsigned char)ch;
7699 out++;
7700 }
7701 }
7702 Py_DECREF(rep);
7703 }
7704 /* write a NUL byte */
7705 *out = 0;
7706 outsize = out - PyBytes_AS_STRING(*outbytes);
7707 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7708 if (_PyBytes_Resize(outbytes, outsize) < 0)
7709 goto error;
7710 ret = 0;
7711
7712error:
7713 Py_XDECREF(encoding_obj);
7714 Py_XDECREF(errorHandler);
7715 Py_XDECREF(exc);
7716 return ret;
7717}
7718
Victor Stinner3a50e702011-10-18 21:21:00 +02007719static PyObject *
7720encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007721 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007722 const char *errors)
7723{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007724 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007725 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007726 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007727 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007728
Benjamin Petersonbac79492012-01-14 13:34:47 -05007729 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007730 return NULL;
7731 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007732
Victor Stinner3a50e702011-10-18 21:21:00 +02007733 if (code_page < 0) {
7734 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7735 return NULL;
7736 }
7737
Martin v. Löwis3d325192011-11-04 18:23:06 +01007738 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007739 return PyBytes_FromStringAndSize(NULL, 0);
7740
Victor Stinner7581cef2011-11-03 22:32:33 +01007741 offset = 0;
7742 do
7743 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007744#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007745 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007746 chunks. */
7747 if (len > INT_MAX/2) {
7748 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007749 done = 0;
7750 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007751 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007752#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007753 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007754 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007755 done = 1;
7756 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007757
Victor Stinner76a31a62011-11-04 00:05:13 +01007758 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007759 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007760 errors);
7761 if (ret == -2)
7762 ret = encode_code_page_errors(code_page, &outbytes,
7763 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007764 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007765 if (ret < 0) {
7766 Py_XDECREF(outbytes);
7767 return NULL;
7768 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007769
Victor Stinner7581cef2011-11-03 22:32:33 +01007770 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007771 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007772 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007773
Victor Stinner3a50e702011-10-18 21:21:00 +02007774 return outbytes;
7775}
7776
7777PyObject *
7778PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7779 Py_ssize_t size,
7780 const char *errors)
7781{
Victor Stinner7581cef2011-11-03 22:32:33 +01007782 PyObject *unicode, *res;
7783 unicode = PyUnicode_FromUnicode(p, size);
7784 if (unicode == NULL)
7785 return NULL;
7786 res = encode_code_page(CP_ACP, unicode, errors);
7787 Py_DECREF(unicode);
7788 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007789}
7790
7791PyObject *
7792PyUnicode_EncodeCodePage(int code_page,
7793 PyObject *unicode,
7794 const char *errors)
7795{
Victor Stinner7581cef2011-11-03 22:32:33 +01007796 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007797}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007798
Alexander Belopolsky40018472011-02-26 01:02:56 +00007799PyObject *
7800PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007801{
7802 if (!PyUnicode_Check(unicode)) {
7803 PyErr_BadArgument();
7804 return NULL;
7805 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007806 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007807}
7808
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007809#undef NEED_RETRY
7810
Victor Stinner99b95382011-07-04 14:23:54 +02007811#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007812
Guido van Rossumd57fd912000-03-10 22:53:23 +00007813/* --- Character Mapping Codec -------------------------------------------- */
7814
Alexander Belopolsky40018472011-02-26 01:02:56 +00007815PyObject *
7816PyUnicode_DecodeCharmap(const char *s,
7817 Py_ssize_t size,
7818 PyObject *mapping,
7819 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007820{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007821 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007822 Py_ssize_t startinpos;
7823 Py_ssize_t endinpos;
7824 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007825 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007826 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007827 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007828 PyObject *errorHandler = NULL;
7829 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007830
Guido van Rossumd57fd912000-03-10 22:53:23 +00007831 /* Default to Latin-1 */
7832 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007833 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007834
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007835 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007836 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007837 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007838 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007839 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007840 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007841 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007842 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007843 Py_ssize_t maplen;
7844 enum PyUnicode_Kind kind;
7845 void *data;
7846 Py_UCS4 x;
7847
Benjamin Petersonbac79492012-01-14 13:34:47 -05007848 if (PyUnicode_READY(mapping) == -1)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007849 return NULL;
7850
7851 maplen = PyUnicode_GET_LENGTH(mapping);
7852 data = PyUnicode_DATA(mapping);
7853 kind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007854 while (s < e) {
7855 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007856
Benjamin Peterson29060642009-01-31 22:14:21 +00007857 if (ch < maplen)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007858 x = PyUnicode_READ(kind, data, ch);
7859 else
7860 x = 0xfffe; /* invalid value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007861
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007862 if (x == 0xfffe)
7863 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007864 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007865 startinpos = s-starts;
7866 endinpos = startinpos+1;
7867 if (unicode_decode_call_errorhandler(
7868 errors, &errorHandler,
7869 "charmap", "character maps to <undefined>",
7870 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007871 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007872 goto onError;
7873 }
7874 continue;
7875 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007876
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007877 if (unicode_putchar(&v, &outpos, x) < 0)
7878 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007879 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007880 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007881 }
7882 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007883 while (s < e) {
7884 unsigned char ch = *s;
7885 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007886
Benjamin Peterson29060642009-01-31 22:14:21 +00007887 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7888 w = PyLong_FromLong((long)ch);
7889 if (w == NULL)
7890 goto onError;
7891 x = PyObject_GetItem(mapping, w);
7892 Py_DECREF(w);
7893 if (x == NULL) {
7894 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7895 /* No mapping found means: mapping is undefined. */
7896 PyErr_Clear();
7897 x = Py_None;
7898 Py_INCREF(x);
7899 } else
7900 goto onError;
7901 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007902
Benjamin Peterson29060642009-01-31 22:14:21 +00007903 /* Apply mapping */
7904 if (PyLong_Check(x)) {
7905 long value = PyLong_AS_LONG(x);
7906 if (value < 0 || value > 65535) {
7907 PyErr_SetString(PyExc_TypeError,
7908 "character mapping must be in range(65536)");
7909 Py_DECREF(x);
7910 goto onError;
7911 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007912 if (unicode_putchar(&v, &outpos, value) < 0)
7913 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007914 }
7915 else if (x == Py_None) {
7916 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007917 startinpos = s-starts;
7918 endinpos = startinpos+1;
7919 if (unicode_decode_call_errorhandler(
7920 errors, &errorHandler,
7921 "charmap", "character maps to <undefined>",
7922 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007923 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007924 Py_DECREF(x);
7925 goto onError;
7926 }
7927 Py_DECREF(x);
7928 continue;
7929 }
7930 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007931 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007932
Benjamin Petersonbac79492012-01-14 13:34:47 -05007933 if (PyUnicode_READY(x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007934 goto onError;
7935 targetsize = PyUnicode_GET_LENGTH(x);
7936
7937 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007938 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007939 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007940 PyUnicode_READ_CHAR(x, 0)) < 0)
7941 goto onError;
7942 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007943 else if (targetsize > 1) {
7944 /* 1-n mapping */
7945 if (targetsize > extrachars) {
7946 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007947 Py_ssize_t needed = (targetsize - extrachars) + \
7948 (targetsize << 2);
7949 extrachars += needed;
7950 /* XXX overflow detection missing */
Victor Stinner16e6a802011-12-12 13:24:15 +01007951 if (unicode_resize(&v,
7952 PyUnicode_GET_LENGTH(v) + needed) < 0)
7953 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007954 Py_DECREF(x);
7955 goto onError;
7956 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007957 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007958 if (unicode_widen(&v, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
7959 goto onError;
7960 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7961 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007962 extrachars -= targetsize;
7963 }
7964 /* 1-0 mapping: skip the character */
7965 }
7966 else {
7967 /* wrong return value */
7968 PyErr_SetString(PyExc_TypeError,
7969 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007970 Py_DECREF(x);
7971 goto onError;
7972 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007973 Py_DECREF(x);
7974 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007975 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007976 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007977 if (unicode_resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007978 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007979 Py_XDECREF(errorHandler);
7980 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007981 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00007982
Benjamin Peterson29060642009-01-31 22:14:21 +00007983 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007984 Py_XDECREF(errorHandler);
7985 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007986 Py_XDECREF(v);
7987 return NULL;
7988}
7989
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007990/* Charmap encoding: the lookup table */
7991
Alexander Belopolsky40018472011-02-26 01:02:56 +00007992struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007993 PyObject_HEAD
7994 unsigned char level1[32];
7995 int count2, count3;
7996 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007997};
7998
7999static PyObject*
8000encoding_map_size(PyObject *obj, PyObject* args)
8001{
8002 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008003 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008004 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008005}
8006
8007static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008008 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008009 PyDoc_STR("Return the size (in bytes) of this object") },
8010 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008011};
8012
8013static void
8014encoding_map_dealloc(PyObject* o)
8015{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008016 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008017}
8018
8019static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008020 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008021 "EncodingMap", /*tp_name*/
8022 sizeof(struct encoding_map), /*tp_basicsize*/
8023 0, /*tp_itemsize*/
8024 /* methods */
8025 encoding_map_dealloc, /*tp_dealloc*/
8026 0, /*tp_print*/
8027 0, /*tp_getattr*/
8028 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00008029 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008030 0, /*tp_repr*/
8031 0, /*tp_as_number*/
8032 0, /*tp_as_sequence*/
8033 0, /*tp_as_mapping*/
8034 0, /*tp_hash*/
8035 0, /*tp_call*/
8036 0, /*tp_str*/
8037 0, /*tp_getattro*/
8038 0, /*tp_setattro*/
8039 0, /*tp_as_buffer*/
8040 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8041 0, /*tp_doc*/
8042 0, /*tp_traverse*/
8043 0, /*tp_clear*/
8044 0, /*tp_richcompare*/
8045 0, /*tp_weaklistoffset*/
8046 0, /*tp_iter*/
8047 0, /*tp_iternext*/
8048 encoding_map_methods, /*tp_methods*/
8049 0, /*tp_members*/
8050 0, /*tp_getset*/
8051 0, /*tp_base*/
8052 0, /*tp_dict*/
8053 0, /*tp_descr_get*/
8054 0, /*tp_descr_set*/
8055 0, /*tp_dictoffset*/
8056 0, /*tp_init*/
8057 0, /*tp_alloc*/
8058 0, /*tp_new*/
8059 0, /*tp_free*/
8060 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008061};
8062
8063PyObject*
8064PyUnicode_BuildEncodingMap(PyObject* string)
8065{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008066 PyObject *result;
8067 struct encoding_map *mresult;
8068 int i;
8069 int need_dict = 0;
8070 unsigned char level1[32];
8071 unsigned char level2[512];
8072 unsigned char *mlevel1, *mlevel2, *mlevel3;
8073 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008074 int kind;
8075 void *data;
8076 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008077
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008078 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008079 PyErr_BadArgument();
8080 return NULL;
8081 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008082 kind = PyUnicode_KIND(string);
8083 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008084 memset(level1, 0xFF, sizeof level1);
8085 memset(level2, 0xFF, sizeof level2);
8086
8087 /* If there isn't a one-to-one mapping of NULL to \0,
8088 or if there are non-BMP characters, we need to use
8089 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008090 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008091 need_dict = 1;
8092 for (i = 1; i < 256; i++) {
8093 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008094 ch = PyUnicode_READ(kind, data, i);
8095 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008096 need_dict = 1;
8097 break;
8098 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008099 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008100 /* unmapped character */
8101 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008102 l1 = ch >> 11;
8103 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008104 if (level1[l1] == 0xFF)
8105 level1[l1] = count2++;
8106 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008107 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008108 }
8109
8110 if (count2 >= 0xFF || count3 >= 0xFF)
8111 need_dict = 1;
8112
8113 if (need_dict) {
8114 PyObject *result = PyDict_New();
8115 PyObject *key, *value;
8116 if (!result)
8117 return NULL;
8118 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008119 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008120 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008121 if (!key || !value)
8122 goto failed1;
8123 if (PyDict_SetItem(result, key, value) == -1)
8124 goto failed1;
8125 Py_DECREF(key);
8126 Py_DECREF(value);
8127 }
8128 return result;
8129 failed1:
8130 Py_XDECREF(key);
8131 Py_XDECREF(value);
8132 Py_DECREF(result);
8133 return NULL;
8134 }
8135
8136 /* Create a three-level trie */
8137 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8138 16*count2 + 128*count3 - 1);
8139 if (!result)
8140 return PyErr_NoMemory();
8141 PyObject_Init(result, &EncodingMapType);
8142 mresult = (struct encoding_map*)result;
8143 mresult->count2 = count2;
8144 mresult->count3 = count3;
8145 mlevel1 = mresult->level1;
8146 mlevel2 = mresult->level23;
8147 mlevel3 = mresult->level23 + 16*count2;
8148 memcpy(mlevel1, level1, 32);
8149 memset(mlevel2, 0xFF, 16*count2);
8150 memset(mlevel3, 0, 128*count3);
8151 count3 = 0;
8152 for (i = 1; i < 256; i++) {
8153 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008154 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008155 /* unmapped character */
8156 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008157 o1 = PyUnicode_READ(kind, data, i)>>11;
8158 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008159 i2 = 16*mlevel1[o1] + o2;
8160 if (mlevel2[i2] == 0xFF)
8161 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008162 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008163 i3 = 128*mlevel2[i2] + o3;
8164 mlevel3[i3] = i;
8165 }
8166 return result;
8167}
8168
8169static int
Victor Stinner22168992011-11-20 17:09:18 +01008170encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008171{
8172 struct encoding_map *map = (struct encoding_map*)mapping;
8173 int l1 = c>>11;
8174 int l2 = (c>>7) & 0xF;
8175 int l3 = c & 0x7F;
8176 int i;
8177
Victor Stinner22168992011-11-20 17:09:18 +01008178 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008179 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008180 if (c == 0)
8181 return 0;
8182 /* level 1*/
8183 i = map->level1[l1];
8184 if (i == 0xFF) {
8185 return -1;
8186 }
8187 /* level 2*/
8188 i = map->level23[16*i+l2];
8189 if (i == 0xFF) {
8190 return -1;
8191 }
8192 /* level 3 */
8193 i = map->level23[16*map->count2 + 128*i + l3];
8194 if (i == 0) {
8195 return -1;
8196 }
8197 return i;
8198}
8199
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008200/* Lookup the character ch in the mapping. If the character
8201 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008202 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008203static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008204charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008205{
Christian Heimes217cfd12007-12-02 14:31:20 +00008206 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008207 PyObject *x;
8208
8209 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008210 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008211 x = PyObject_GetItem(mapping, w);
8212 Py_DECREF(w);
8213 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008214 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8215 /* No mapping found means: mapping is undefined. */
8216 PyErr_Clear();
8217 x = Py_None;
8218 Py_INCREF(x);
8219 return x;
8220 } else
8221 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008222 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008223 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008224 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008225 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008226 long value = PyLong_AS_LONG(x);
8227 if (value < 0 || value > 255) {
8228 PyErr_SetString(PyExc_TypeError,
8229 "character mapping must be in range(256)");
8230 Py_DECREF(x);
8231 return NULL;
8232 }
8233 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008234 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008235 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008236 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008237 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008238 /* wrong return value */
8239 PyErr_Format(PyExc_TypeError,
8240 "character mapping must return integer, bytes or None, not %.400s",
8241 x->ob_type->tp_name);
8242 Py_DECREF(x);
8243 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008244 }
8245}
8246
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008247static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008248charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008249{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008250 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8251 /* exponentially overallocate to minimize reallocations */
8252 if (requiredsize < 2*outsize)
8253 requiredsize = 2*outsize;
8254 if (_PyBytes_Resize(outobj, requiredsize))
8255 return -1;
8256 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008257}
8258
/* Outcome of trying to encode one character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an exception was raised */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008262/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008263 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008264 space is available. Return a new reference to the object that
8265 was put in the output buffer, or Py_None, if the mapping was undefined
8266 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008267 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008268static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008269charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008270 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008271{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008272 PyObject *rep;
8273 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008274 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008275
Christian Heimes90aa7642007-12-19 02:45:37 +00008276 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008277 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008278 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008279 if (res == -1)
8280 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008281 if (outsize<requiredsize)
8282 if (charmapencode_resize(outobj, outpos, requiredsize))
8283 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008284 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008285 outstart[(*outpos)++] = (char)res;
8286 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008287 }
8288
8289 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008290 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008291 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008292 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008293 Py_DECREF(rep);
8294 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008295 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008296 if (PyLong_Check(rep)) {
8297 Py_ssize_t requiredsize = *outpos+1;
8298 if (outsize<requiredsize)
8299 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8300 Py_DECREF(rep);
8301 return enc_EXCEPTION;
8302 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008303 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008304 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008305 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008306 else {
8307 const char *repchars = PyBytes_AS_STRING(rep);
8308 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8309 Py_ssize_t requiredsize = *outpos+repsize;
8310 if (outsize<requiredsize)
8311 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8312 Py_DECREF(rep);
8313 return enc_EXCEPTION;
8314 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008315 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008316 memcpy(outstart + *outpos, repchars, repsize);
8317 *outpos += repsize;
8318 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008319 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008320 Py_DECREF(rep);
8321 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008322}
8323
8324/* handle an error in PyUnicode_EncodeCharmap
8325 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008326static int
8327charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008328 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008329 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008330 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008331 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008332{
8333 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008334 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008335 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008336 enum PyUnicode_Kind kind;
8337 void *data;
8338 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008339 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008340 Py_ssize_t collstartpos = *inpos;
8341 Py_ssize_t collendpos = *inpos+1;
8342 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008343 char *encoding = "charmap";
8344 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008345 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008346 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008347 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008348
Benjamin Petersonbac79492012-01-14 13:34:47 -05008349 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008350 return -1;
8351 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008352 /* find all unencodable characters */
8353 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008354 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008355 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008356 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008357 val = encoding_map_lookup(ch, mapping);
8358 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008359 break;
8360 ++collendpos;
8361 continue;
8362 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008363
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008364 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8365 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008366 if (rep==NULL)
8367 return -1;
8368 else if (rep!=Py_None) {
8369 Py_DECREF(rep);
8370 break;
8371 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008372 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008373 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008374 }
8375 /* cache callback name lookup
8376 * (if not done yet, i.e. it's the first error) */
8377 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008378 if ((errors==NULL) || (!strcmp(errors, "strict")))
8379 *known_errorHandler = 1;
8380 else if (!strcmp(errors, "replace"))
8381 *known_errorHandler = 2;
8382 else if (!strcmp(errors, "ignore"))
8383 *known_errorHandler = 3;
8384 else if (!strcmp(errors, "xmlcharrefreplace"))
8385 *known_errorHandler = 4;
8386 else
8387 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008388 }
8389 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008390 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008391 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008392 return -1;
8393 case 2: /* replace */
8394 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008395 x = charmapencode_output('?', mapping, res, respos);
8396 if (x==enc_EXCEPTION) {
8397 return -1;
8398 }
8399 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008400 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008401 return -1;
8402 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008403 }
8404 /* fall through */
8405 case 3: /* ignore */
8406 *inpos = collendpos;
8407 break;
8408 case 4: /* xmlcharrefreplace */
8409 /* generate replacement (temporarily (mis)uses p) */
8410 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008411 char buffer[2+29+1+1];
8412 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008413 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008414 for (cp = buffer; *cp; ++cp) {
8415 x = charmapencode_output(*cp, mapping, res, respos);
8416 if (x==enc_EXCEPTION)
8417 return -1;
8418 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008419 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008420 return -1;
8421 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008422 }
8423 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008424 *inpos = collendpos;
8425 break;
8426 default:
8427 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008428 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008429 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008430 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008431 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008432 if (PyBytes_Check(repunicode)) {
8433 /* Directly copy bytes result to output. */
8434 Py_ssize_t outsize = PyBytes_Size(*res);
8435 Py_ssize_t requiredsize;
8436 repsize = PyBytes_Size(repunicode);
8437 requiredsize = *respos + repsize;
8438 if (requiredsize > outsize)
8439 /* Make room for all additional bytes. */
8440 if (charmapencode_resize(res, respos, requiredsize)) {
8441 Py_DECREF(repunicode);
8442 return -1;
8443 }
8444 memcpy(PyBytes_AsString(*res) + *respos,
8445 PyBytes_AsString(repunicode), repsize);
8446 *respos += repsize;
8447 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008448 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008449 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008450 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008451 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008452 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008453 Py_DECREF(repunicode);
8454 return -1;
8455 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008456 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008457 data = PyUnicode_DATA(repunicode);
8458 kind = PyUnicode_KIND(repunicode);
8459 for (index = 0; index < repsize; index++) {
8460 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8461 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008462 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008463 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008464 return -1;
8465 }
8466 else if (x==enc_FAILED) {
8467 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008468 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008469 return -1;
8470 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008471 }
8472 *inpos = newpos;
8473 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008474 }
8475 return 0;
8476}
8477
Alexander Belopolsky40018472011-02-26 01:02:56 +00008478PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008479_PyUnicode_EncodeCharmap(PyObject *unicode,
8480 PyObject *mapping,
8481 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008482{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008483 /* output object */
8484 PyObject *res = NULL;
8485 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008486 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008487 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008488 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008489 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008490 PyObject *errorHandler = NULL;
8491 PyObject *exc = NULL;
8492 /* the following variable is used for caching string comparisons
8493 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8494 * 3=ignore, 4=xmlcharrefreplace */
8495 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008496
Benjamin Petersonbac79492012-01-14 13:34:47 -05008497 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008498 return NULL;
8499 size = PyUnicode_GET_LENGTH(unicode);
8500
Guido van Rossumd57fd912000-03-10 22:53:23 +00008501 /* Default to Latin-1 */
8502 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008503 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008504
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008505 /* allocate enough for a simple encoding without
8506 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008507 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008508 if (res == NULL)
8509 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008510 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008511 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008512
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008513 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008514 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008515 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008516 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008517 if (x==enc_EXCEPTION) /* error */
8518 goto onError;
8519 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008520 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008521 &exc,
8522 &known_errorHandler, &errorHandler, errors,
8523 &res, &respos)) {
8524 goto onError;
8525 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008526 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008527 else
8528 /* done with this character => adjust input position */
8529 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008530 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008531
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008532 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008533 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008534 if (_PyBytes_Resize(&res, respos) < 0)
8535 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008536
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008537 Py_XDECREF(exc);
8538 Py_XDECREF(errorHandler);
8539 return res;
8540
Benjamin Peterson29060642009-01-31 22:14:21 +00008541 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008542 Py_XDECREF(res);
8543 Py_XDECREF(exc);
8544 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008545 return NULL;
8546}
8547
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008548/* Deprecated */
8549PyObject *
8550PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8551 Py_ssize_t size,
8552 PyObject *mapping,
8553 const char *errors)
8554{
8555 PyObject *result;
8556 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8557 if (unicode == NULL)
8558 return NULL;
8559 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8560 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008561 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008562}
8563
Alexander Belopolsky40018472011-02-26 01:02:56 +00008564PyObject *
8565PyUnicode_AsCharmapString(PyObject *unicode,
8566 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008567{
8568 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008569 PyErr_BadArgument();
8570 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008571 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008572 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008573}
8574
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008575/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008576static void
8577make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008578 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008579 Py_ssize_t startpos, Py_ssize_t endpos,
8580 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008581{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008582 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008583 *exceptionObject = _PyUnicodeTranslateError_Create(
8584 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008585 }
8586 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008587 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8588 goto onError;
8589 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8590 goto onError;
8591 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8592 goto onError;
8593 return;
8594 onError:
8595 Py_DECREF(*exceptionObject);
8596 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008597 }
8598}
8599
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008600/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008601static void
8602raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008603 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008604 Py_ssize_t startpos, Py_ssize_t endpos,
8605 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008606{
8607 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008608 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008609 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008610 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008611}
8612
8613/* error handling callback helper:
8614 build arguments, call the callback and check the arguments,
8615 put the result into newpos and return the replacement string, which
8616 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008617static PyObject *
8618unicode_translate_call_errorhandler(const char *errors,
8619 PyObject **errorHandler,
8620 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008621 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008622 Py_ssize_t startpos, Py_ssize_t endpos,
8623 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008624{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008625 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008626
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008627 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008628 PyObject *restuple;
8629 PyObject *resunicode;
8630
8631 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008632 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008633 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008634 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008635 }
8636
8637 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008638 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008639 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008640 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008641
8642 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008643 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008644 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008645 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008646 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008647 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008648 Py_DECREF(restuple);
8649 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008650 }
8651 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008652 &resunicode, &i_newpos)) {
8653 Py_DECREF(restuple);
8654 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008655 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008656 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008657 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008658 else
8659 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008660 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008661 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8662 Py_DECREF(restuple);
8663 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008664 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008665 Py_INCREF(resunicode);
8666 Py_DECREF(restuple);
8667 return resunicode;
8668}
8669
8670/* Lookup the character ch in the mapping and put the result in result,
8671 which must be decrefed by the caller.
8672 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008673static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008674charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008675{
Christian Heimes217cfd12007-12-02 14:31:20 +00008676 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008677 PyObject *x;
8678
8679 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008680 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008681 x = PyObject_GetItem(mapping, w);
8682 Py_DECREF(w);
8683 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008684 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8685 /* No mapping found means: use 1:1 mapping. */
8686 PyErr_Clear();
8687 *result = NULL;
8688 return 0;
8689 } else
8690 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008691 }
8692 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008693 *result = x;
8694 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008695 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008696 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008697 long value = PyLong_AS_LONG(x);
8698 long max = PyUnicode_GetMax();
8699 if (value < 0 || value > max) {
8700 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008701 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008702 Py_DECREF(x);
8703 return -1;
8704 }
8705 *result = x;
8706 return 0;
8707 }
8708 else if (PyUnicode_Check(x)) {
8709 *result = x;
8710 return 0;
8711 }
8712 else {
8713 /* wrong return value */
8714 PyErr_SetString(PyExc_TypeError,
8715 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008716 Py_DECREF(x);
8717 return -1;
8718 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008719}
8720/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008721 if not reallocate and adjust various state variables.
8722 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008723static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008724charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008725 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008726{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008727 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008728 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008729 /* exponentially overallocate to minimize reallocations */
8730 if (requiredsize < 2 * oldsize)
8731 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008732 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8733 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008734 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008735 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008736 }
8737 return 0;
8738}
8739/* lookup the character, put the result in the output string and adjust
8740 various state variables. Return a new reference to the object that
8741 was put in the output buffer in *result, or Py_None, if the mapping was
8742 undefined (in which case no character was written).
8743 The called must decref result.
8744 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008745static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008746charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8747 PyObject *mapping, Py_UCS4 **output,
8748 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008749 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008750{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008751 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8752 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008753 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008754 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008755 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008756 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008757 }
8758 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008759 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008760 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008761 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008762 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008763 }
8764 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008765 Py_ssize_t repsize;
8766 if (PyUnicode_READY(*res) == -1)
8767 return -1;
8768 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008769 if (repsize==1) {
8770 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008771 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008772 }
8773 else if (repsize!=0) {
8774 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008775 Py_ssize_t requiredsize = *opos +
8776 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008777 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008778 Py_ssize_t i;
8779 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008780 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008781 for(i = 0; i < repsize; i++)
8782 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008783 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008784 }
8785 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008786 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008787 return 0;
8788}
8789
Alexander Belopolsky40018472011-02-26 01:02:56 +00008790PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008791_PyUnicode_TranslateCharmap(PyObject *input,
8792 PyObject *mapping,
8793 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008794{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008795 /* input object */
8796 char *idata;
8797 Py_ssize_t size, i;
8798 int kind;
8799 /* output buffer */
8800 Py_UCS4 *output = NULL;
8801 Py_ssize_t osize;
8802 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008803 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008804 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008805 char *reason = "character maps to <undefined>";
8806 PyObject *errorHandler = NULL;
8807 PyObject *exc = NULL;
8808 /* the following variable is used for caching string comparisons
8809 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8810 * 3=ignore, 4=xmlcharrefreplace */
8811 int known_errorHandler = -1;
8812
Guido van Rossumd57fd912000-03-10 22:53:23 +00008813 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008814 PyErr_BadArgument();
8815 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008816 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008817
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008818 if (PyUnicode_READY(input) == -1)
8819 return NULL;
8820 idata = (char*)PyUnicode_DATA(input);
8821 kind = PyUnicode_KIND(input);
8822 size = PyUnicode_GET_LENGTH(input);
8823 i = 0;
8824
8825 if (size == 0) {
8826 Py_INCREF(input);
8827 return input;
8828 }
8829
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008830 /* allocate enough for a simple 1:1 translation without
8831 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008832 osize = size;
8833 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8834 opos = 0;
8835 if (output == NULL) {
8836 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008837 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008838 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008839
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008840 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008841 /* try to encode it */
8842 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008843 if (charmaptranslate_output(input, i, mapping,
8844 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008845 Py_XDECREF(x);
8846 goto onError;
8847 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008848 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008849 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008850 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008851 else { /* untranslatable character */
8852 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8853 Py_ssize_t repsize;
8854 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008855 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008856 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008857 Py_ssize_t collstart = i;
8858 Py_ssize_t collend = i+1;
8859 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008860
Benjamin Peterson29060642009-01-31 22:14:21 +00008861 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008862 while (collend < size) {
8863 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008864 goto onError;
8865 Py_XDECREF(x);
8866 if (x!=Py_None)
8867 break;
8868 ++collend;
8869 }
8870 /* cache callback name lookup
8871 * (if not done yet, i.e. it's the first error) */
8872 if (known_errorHandler==-1) {
8873 if ((errors==NULL) || (!strcmp(errors, "strict")))
8874 known_errorHandler = 1;
8875 else if (!strcmp(errors, "replace"))
8876 known_errorHandler = 2;
8877 else if (!strcmp(errors, "ignore"))
8878 known_errorHandler = 3;
8879 else if (!strcmp(errors, "xmlcharrefreplace"))
8880 known_errorHandler = 4;
8881 else
8882 known_errorHandler = 0;
8883 }
8884 switch (known_errorHandler) {
8885 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008886 raise_translate_exception(&exc, input, collstart,
8887 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008888 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008889 case 2: /* replace */
8890 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008891 for (coll = collstart; coll<collend; coll++)
8892 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008893 /* fall through */
8894 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008895 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008896 break;
8897 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008898 /* generate replacement (temporarily (mis)uses i) */
8899 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008900 char buffer[2+29+1+1];
8901 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008902 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8903 if (charmaptranslate_makespace(&output, &osize,
8904 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008905 goto onError;
8906 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008907 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008908 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008909 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008910 break;
8911 default:
8912 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008913 reason, input, &exc,
8914 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008915 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008916 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008917 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008918 Py_DECREF(repunicode);
8919 goto onError;
8920 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008921 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008922 repsize = PyUnicode_GET_LENGTH(repunicode);
8923 if (charmaptranslate_makespace(&output, &osize,
8924 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008925 Py_DECREF(repunicode);
8926 goto onError;
8927 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008928 for (uni2 = 0; repsize-->0; ++uni2)
8929 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8930 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008931 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008932 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008933 }
8934 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008935 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8936 if (!res)
8937 goto onError;
8938 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008939 Py_XDECREF(exc);
8940 Py_XDECREF(errorHandler);
8941 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008942
Benjamin Peterson29060642009-01-31 22:14:21 +00008943 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008944 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008945 Py_XDECREF(exc);
8946 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008947 return NULL;
8948}
8949
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008950/* Deprecated. Use PyUnicode_Translate instead. */
8951PyObject *
8952PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8953 Py_ssize_t size,
8954 PyObject *mapping,
8955 const char *errors)
8956{
8957 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8958 if (!unicode)
8959 return NULL;
8960 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8961}
8962
Alexander Belopolsky40018472011-02-26 01:02:56 +00008963PyObject *
8964PyUnicode_Translate(PyObject *str,
8965 PyObject *mapping,
8966 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008967{
8968 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008969
Guido van Rossumd57fd912000-03-10 22:53:23 +00008970 str = PyUnicode_FromObject(str);
8971 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008972 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008973 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008974 Py_DECREF(str);
8975 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008976
Benjamin Peterson29060642009-01-31 22:14:21 +00008977 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008978 Py_XDECREF(str);
8979 return NULL;
8980}
Tim Petersced69f82003-09-16 20:30:58 +00008981
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008982static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008983fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008984{
8985 /* No need to call PyUnicode_READY(self) because this function is only
8986 called as a callback from fixup() which does it already. */
8987 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8988 const int kind = PyUnicode_KIND(self);
8989 void *data = PyUnicode_DATA(self);
8990 Py_UCS4 maxchar = 0, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008991 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008992 Py_ssize_t i;
8993
8994 for (i = 0; i < len; ++i) {
8995 ch = PyUnicode_READ(kind, data, i);
8996 fixed = 0;
8997 if (ch > 127) {
8998 if (Py_UNICODE_ISSPACE(ch))
8999 fixed = ' ';
9000 else {
9001 const int decimal = Py_UNICODE_TODECIMAL(ch);
9002 if (decimal >= 0)
9003 fixed = '0' + decimal;
9004 }
9005 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009006 modified = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009007 if (fixed > maxchar)
9008 maxchar = fixed;
9009 PyUnicode_WRITE(kind, data, i, fixed);
9010 }
9011 else if (ch > maxchar)
9012 maxchar = ch;
9013 }
9014 else if (ch > maxchar)
9015 maxchar = ch;
9016 }
9017
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009018 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009019}
9020
9021PyObject *
9022_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9023{
9024 if (!PyUnicode_Check(unicode)) {
9025 PyErr_BadInternalCall();
9026 return NULL;
9027 }
9028 if (PyUnicode_READY(unicode) == -1)
9029 return NULL;
9030 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
9031 /* If the string is already ASCII, just return the same string */
9032 Py_INCREF(unicode);
9033 return unicode;
9034 }
Victor Stinner9310abb2011-10-05 00:59:23 +02009035 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009036}
9037
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009038PyObject *
9039PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9040 Py_ssize_t length)
9041{
Victor Stinnerf0124502011-11-21 23:12:56 +01009042 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009043 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009044 Py_UCS4 maxchar;
9045 enum PyUnicode_Kind kind;
9046 void *data;
9047
Victor Stinner99d7ad02012-02-22 13:37:39 +01009048 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009049 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01009050 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009051 if (ch > 127) {
9052 int decimal = Py_UNICODE_TODECIMAL(ch);
9053 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009054 ch = '0' + decimal;
Victor Stinner99d7ad02012-02-22 13:37:39 +01009055 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009056 }
9057 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009058
9059 /* Copy to a new string */
9060 decimal = PyUnicode_New(length, maxchar);
9061 if (decimal == NULL)
9062 return decimal;
9063 kind = PyUnicode_KIND(decimal);
9064 data = PyUnicode_DATA(decimal);
9065 /* Iterate over code points */
9066 for (i = 0; i < length; i++) {
9067 Py_UNICODE ch = s[i];
9068 if (ch > 127) {
9069 int decimal = Py_UNICODE_TODECIMAL(ch);
9070 if (decimal >= 0)
9071 ch = '0' + decimal;
9072 }
9073 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009074 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009075 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009076}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009077/* --- Decimal Encoder ---------------------------------------------------- */
9078
Alexander Belopolsky40018472011-02-26 01:02:56 +00009079int
9080PyUnicode_EncodeDecimal(Py_UNICODE *s,
9081 Py_ssize_t length,
9082 char *output,
9083 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009084{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009085 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009086 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009087 enum PyUnicode_Kind kind;
9088 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009089
9090 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009091 PyErr_BadArgument();
9092 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009093 }
9094
Victor Stinner42bf7752011-11-21 22:52:58 +01009095 unicode = PyUnicode_FromUnicode(s, length);
9096 if (unicode == NULL)
9097 return -1;
9098
Benjamin Petersonbac79492012-01-14 13:34:47 -05009099 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01009100 Py_DECREF(unicode);
9101 return -1;
9102 }
Victor Stinner42bf7752011-11-21 22:52:58 +01009103 kind = PyUnicode_KIND(unicode);
9104 data = PyUnicode_DATA(unicode);
9105
Victor Stinnerb84d7232011-11-22 01:50:07 +01009106 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009107 PyObject *exc;
9108 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009109 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009110 Py_ssize_t startpos;
9111
9112 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009113
Benjamin Peterson29060642009-01-31 22:14:21 +00009114 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009115 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009116 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009117 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009118 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009119 decimal = Py_UNICODE_TODECIMAL(ch);
9120 if (decimal >= 0) {
9121 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009122 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009123 continue;
9124 }
9125 if (0 < ch && ch < 256) {
9126 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009127 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009128 continue;
9129 }
Victor Stinner6345be92011-11-25 20:09:01 +01009130
Victor Stinner42bf7752011-11-21 22:52:58 +01009131 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009132 exc = NULL;
9133 raise_encode_exception(&exc, "decimal", unicode,
9134 startpos, startpos+1,
9135 "invalid decimal Unicode string");
9136 Py_XDECREF(exc);
9137 Py_DECREF(unicode);
9138 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009139 }
9140 /* 0-terminate the output string */
9141 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009142 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009143 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009144}
9145
Guido van Rossumd57fd912000-03-10 22:53:23 +00009146/* --- Helpers ------------------------------------------------------------ */
9147
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009148static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02009149any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009150 Py_ssize_t start,
9151 Py_ssize_t end)
9152{
9153 int kind1, kind2, kind;
9154 void *buf1, *buf2;
9155 Py_ssize_t len1, len2, result;
9156
9157 kind1 = PyUnicode_KIND(s1);
9158 kind2 = PyUnicode_KIND(s2);
9159 kind = kind1 > kind2 ? kind1 : kind2;
9160 buf1 = PyUnicode_DATA(s1);
9161 buf2 = PyUnicode_DATA(s2);
9162 if (kind1 != kind)
9163 buf1 = _PyUnicode_AsKind(s1, kind);
9164 if (!buf1)
9165 return -2;
9166 if (kind2 != kind)
9167 buf2 = _PyUnicode_AsKind(s2, kind);
9168 if (!buf2) {
9169 if (kind1 != kind) PyMem_Free(buf1);
9170 return -2;
9171 }
9172 len1 = PyUnicode_GET_LENGTH(s1);
9173 len2 = PyUnicode_GET_LENGTH(s2);
9174
Victor Stinner794d5672011-10-10 03:21:36 +02009175 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06009176 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02009177 case PyUnicode_1BYTE_KIND:
9178 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9179 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9180 else
9181 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9182 break;
9183 case PyUnicode_2BYTE_KIND:
9184 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9185 break;
9186 case PyUnicode_4BYTE_KIND:
9187 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9188 break;
9189 default:
9190 assert(0); result = -2;
9191 }
9192 }
9193 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06009194 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02009195 case PyUnicode_1BYTE_KIND:
9196 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9197 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9198 else
9199 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9200 break;
9201 case PyUnicode_2BYTE_KIND:
9202 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9203 break;
9204 case PyUnicode_4BYTE_KIND:
9205 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9206 break;
9207 default:
9208 assert(0); result = -2;
9209 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009210 }
9211
9212 if (kind1 != kind)
9213 PyMem_Free(buf1);
9214 if (kind2 != kind)
9215 PyMem_Free(buf2);
9216
9217 return result;
9218}
9219
9220Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009221_PyUnicode_InsertThousandsGrouping(
9222 PyObject *unicode, Py_ssize_t index,
9223 Py_ssize_t n_buffer,
9224 void *digits, Py_ssize_t n_digits,
9225 Py_ssize_t min_width,
9226 const char *grouping, PyObject *thousands_sep,
9227 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009228{
Victor Stinner41a863c2012-02-24 00:37:51 +01009229 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009230 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009231 Py_ssize_t thousands_sep_len;
9232 Py_ssize_t len;
9233
9234 if (unicode != NULL) {
9235 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009236 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009237 }
9238 else {
9239 kind = PyUnicode_1BYTE_KIND;
9240 data = NULL;
9241 }
9242 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9243 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9244 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9245 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009246 if (thousands_sep_kind < kind) {
9247 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9248 if (!thousands_sep_data)
9249 return -1;
9250 }
9251 else {
9252 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9253 if (!data)
9254 return -1;
9255 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009256 }
9257
Benjamin Petersonead6b532011-12-20 17:23:42 -06009258 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009259 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009260 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009261 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009262 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009263 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009264 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009265 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009266 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009267 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009268 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009269 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009270 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009271 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009272 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009273 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009274 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009275 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009276 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009277 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009278 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009279 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009280 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009281 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009282 break;
9283 default:
9284 assert(0);
9285 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009286 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009287 if (unicode != NULL && thousands_sep_kind != kind) {
9288 if (thousands_sep_kind < kind)
9289 PyMem_Free(thousands_sep_data);
9290 else
9291 PyMem_Free(data);
9292 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009293 if (unicode == NULL) {
9294 *maxchar = 127;
9295 if (len != n_digits) {
9296 *maxchar = Py_MAX(*maxchar,
9297 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
9298 }
9299 }
9300 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009301}
9302
9303
/* Helper macro to fix up start/end slice values in place.

   `end` is clamped to `len`; a negative `end` or `start` is taken relative
   to `len` and clamped to 0 if it is still negative.  `start` is not
   clamped against `len`, so callers must cope with start > end.

   `start` and `end` must be modifiable lvalues.  Every argument may be
   evaluated more than once, so pass expressions without side effects.
   The body is wrapped in do { } while (0) so that the macro behaves as a
   single statement (e.g. as the body of an if/else) and must be followed
   by a semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009318
Alexander Belopolsky40018472011-02-26 01:02:56 +00009319Py_ssize_t
9320PyUnicode_Count(PyObject *str,
9321 PyObject *substr,
9322 Py_ssize_t start,
9323 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009324{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009325 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009326 PyObject* str_obj;
9327 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009328 int kind1, kind2, kind;
9329 void *buf1 = NULL, *buf2 = NULL;
9330 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009331
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009332 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009333 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00009334 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009335 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009336 if (!sub_obj) {
9337 Py_DECREF(str_obj);
9338 return -1;
9339 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06009340 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06009341 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00009342 Py_DECREF(str_obj);
9343 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009344 }
Tim Petersced69f82003-09-16 20:30:58 +00009345
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009346 kind1 = PyUnicode_KIND(str_obj);
9347 kind2 = PyUnicode_KIND(sub_obj);
9348 kind = kind1 > kind2 ? kind1 : kind2;
9349 buf1 = PyUnicode_DATA(str_obj);
9350 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009351 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009352 if (!buf1)
9353 goto onError;
9354 buf2 = PyUnicode_DATA(sub_obj);
9355 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009356 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009357 if (!buf2)
9358 goto onError;
9359 len1 = PyUnicode_GET_LENGTH(str_obj);
9360 len2 = PyUnicode_GET_LENGTH(sub_obj);
9361
9362 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06009363 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009364 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009365 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9366 result = asciilib_count(
9367 ((Py_UCS1*)buf1) + start, end - start,
9368 buf2, len2, PY_SSIZE_T_MAX
9369 );
9370 else
9371 result = ucs1lib_count(
9372 ((Py_UCS1*)buf1) + start, end - start,
9373 buf2, len2, PY_SSIZE_T_MAX
9374 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009375 break;
9376 case PyUnicode_2BYTE_KIND:
9377 result = ucs2lib_count(
9378 ((Py_UCS2*)buf1) + start, end - start,
9379 buf2, len2, PY_SSIZE_T_MAX
9380 );
9381 break;
9382 case PyUnicode_4BYTE_KIND:
9383 result = ucs4lib_count(
9384 ((Py_UCS4*)buf1) + start, end - start,
9385 buf2, len2, PY_SSIZE_T_MAX
9386 );
9387 break;
9388 default:
9389 assert(0); result = 0;
9390 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009391
9392 Py_DECREF(sub_obj);
9393 Py_DECREF(str_obj);
9394
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009395 if (kind1 != kind)
9396 PyMem_Free(buf1);
9397 if (kind2 != kind)
9398 PyMem_Free(buf2);
9399
Guido van Rossumd57fd912000-03-10 22:53:23 +00009400 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009401 onError:
9402 Py_DECREF(sub_obj);
9403 Py_DECREF(str_obj);
9404 if (kind1 != kind && buf1)
9405 PyMem_Free(buf1);
9406 if (kind2 != kind && buf2)
9407 PyMem_Free(buf2);
9408 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009409}
9410
Alexander Belopolsky40018472011-02-26 01:02:56 +00009411Py_ssize_t
9412PyUnicode_Find(PyObject *str,
9413 PyObject *sub,
9414 Py_ssize_t start,
9415 Py_ssize_t end,
9416 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009417{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009418 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009419
Guido van Rossumd57fd912000-03-10 22:53:23 +00009420 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009421 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009422 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009423 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009424 if (!sub) {
9425 Py_DECREF(str);
9426 return -2;
9427 }
9428 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9429 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009430 Py_DECREF(str);
9431 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009432 }
Tim Petersced69f82003-09-16 20:30:58 +00009433
Victor Stinner794d5672011-10-10 03:21:36 +02009434 result = any_find_slice(direction,
9435 str, sub, start, end
9436 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009437
Guido van Rossumd57fd912000-03-10 22:53:23 +00009438 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009439 Py_DECREF(sub);
9440
Guido van Rossumd57fd912000-03-10 22:53:23 +00009441 return result;
9442}
9443
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009444Py_ssize_t
9445PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9446 Py_ssize_t start, Py_ssize_t end,
9447 int direction)
9448{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009449 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009450 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009451 if (PyUnicode_READY(str) == -1)
9452 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009453 if (start < 0 || end < 0) {
9454 PyErr_SetString(PyExc_IndexError, "string index out of range");
9455 return -2;
9456 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009457 if (end > PyUnicode_GET_LENGTH(str))
9458 end = PyUnicode_GET_LENGTH(str);
9459 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009460 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9461 kind, end-start, ch, direction);
9462 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009463 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009464 else
9465 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009466}
9467
Alexander Belopolsky40018472011-02-26 01:02:56 +00009468static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009469tailmatch(PyObject *self,
9470 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009471 Py_ssize_t start,
9472 Py_ssize_t end,
9473 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009474{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009475 int kind_self;
9476 int kind_sub;
9477 void *data_self;
9478 void *data_sub;
9479 Py_ssize_t offset;
9480 Py_ssize_t i;
9481 Py_ssize_t end_sub;
9482
9483 if (PyUnicode_READY(self) == -1 ||
9484 PyUnicode_READY(substring) == -1)
9485 return 0;
9486
9487 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009488 return 1;
9489
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009490 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9491 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009492 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009493 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009494
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009495 kind_self = PyUnicode_KIND(self);
9496 data_self = PyUnicode_DATA(self);
9497 kind_sub = PyUnicode_KIND(substring);
9498 data_sub = PyUnicode_DATA(substring);
9499 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9500
9501 if (direction > 0)
9502 offset = end;
9503 else
9504 offset = start;
9505
9506 if (PyUnicode_READ(kind_self, data_self, offset) ==
9507 PyUnicode_READ(kind_sub, data_sub, 0) &&
9508 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9509 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9510 /* If both are of the same kind, memcmp is sufficient */
9511 if (kind_self == kind_sub) {
9512 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009513 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009514 data_sub,
9515 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009516 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009517 }
9518 /* otherwise we have to compare each character by first accesing it */
9519 else {
9520 /* We do not need to compare 0 and len(substring)-1 because
9521 the if statement above ensured already that they are equal
9522 when we end up here. */
9523 // TODO: honor direction and do a forward or backwards search
9524 for (i = 1; i < end_sub; ++i) {
9525 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9526 PyUnicode_READ(kind_sub, data_sub, i))
9527 return 0;
9528 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009529 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009530 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009531 }
9532
9533 return 0;
9534}
9535
Alexander Belopolsky40018472011-02-26 01:02:56 +00009536Py_ssize_t
9537PyUnicode_Tailmatch(PyObject *str,
9538 PyObject *substr,
9539 Py_ssize_t start,
9540 Py_ssize_t end,
9541 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009542{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009543 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009544
Guido van Rossumd57fd912000-03-10 22:53:23 +00009545 str = PyUnicode_FromObject(str);
9546 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009547 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009548 substr = PyUnicode_FromObject(substr);
9549 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009550 Py_DECREF(str);
9551 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009552 }
Tim Petersced69f82003-09-16 20:30:58 +00009553
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009554 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009555 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009556 Py_DECREF(str);
9557 Py_DECREF(substr);
9558 return result;
9559}
9560
Guido van Rossumd57fd912000-03-10 22:53:23 +00009561/* Apply fixfct filter to the Unicode object self and return a
9562 reference to the modified object */
9563
Alexander Belopolsky40018472011-02-26 01:02:56 +00009564static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009565fixup(PyObject *self,
9566 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009567{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009568 PyObject *u;
9569 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009570 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009571
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009572 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009573 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009574 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009575 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009576
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009577 /* fix functions return the new maximum character in a string,
9578 if the kind of the resulting unicode object does not change,
9579 everything is fine. Otherwise we need to change the string kind
9580 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009581 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009582
9583 if (maxchar_new == 0) {
9584 /* no changes */;
9585 if (PyUnicode_CheckExact(self)) {
9586 Py_DECREF(u);
9587 Py_INCREF(self);
9588 return self;
9589 }
9590 else
9591 return u;
9592 }
9593
9594 if (maxchar_new <= 127)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009595 maxchar_new = 127;
9596 else if (maxchar_new <= 255)
9597 maxchar_new = 255;
9598 else if (maxchar_new <= 65535)
9599 maxchar_new = 65535;
9600 else
Victor Stinner8faf8212011-12-08 22:14:11 +01009601 maxchar_new = MAX_UNICODE;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009602
Victor Stinnereaab6042011-12-11 22:22:39 +01009603 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009604 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009605
9606 /* In case the maximum character changed, we need to
9607 convert the string to the new category. */
9608 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9609 if (v == NULL) {
9610 Py_DECREF(u);
9611 return NULL;
9612 }
9613 if (maxchar_new > maxchar_old) {
9614 /* If the maxchar increased so that the kind changed, not all
9615 characters are representable anymore and we need to fix the
9616 string again. This only happens in very few cases. */
9617 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
9618 maxchar_old = fixfct(v);
9619 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009620 }
9621 else {
Victor Stinnereaab6042011-12-11 22:22:39 +01009622 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009623 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009624 Py_DECREF(u);
9625 assert(_PyUnicode_CheckConsistency(v, 1));
9626 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009627}
9628
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009629static PyObject *
9630ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009631{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009632 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9633 char *resdata, *data = PyUnicode_DATA(self);
9634 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009635
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009636 res = PyUnicode_New(len, 127);
9637 if (res == NULL)
9638 return NULL;
9639 resdata = PyUnicode_DATA(res);
9640 if (lower)
9641 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009642 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009643 _Py_bytes_upper(resdata, data, len);
9644 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009645}
9646
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009647static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009648handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009649{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009650 Py_ssize_t j;
9651 int final_sigma;
9652 Py_UCS4 c;
9653 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009654
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009655 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9656
9657 where ! is a negation and \p{xxx} is a character with property xxx.
9658 */
9659 for (j = i - 1; j >= 0; j--) {
9660 c = PyUnicode_READ(kind, data, j);
9661 if (!_PyUnicode_IsCaseIgnorable(c))
9662 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009663 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009664 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9665 if (final_sigma) {
9666 for (j = i + 1; j < length; j++) {
9667 c = PyUnicode_READ(kind, data, j);
9668 if (!_PyUnicode_IsCaseIgnorable(c))
9669 break;
9670 }
9671 final_sigma = j == length || !_PyUnicode_IsCased(c);
9672 }
9673 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009674}
9675
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009676static int
9677lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9678 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009679{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009680 /* Obscure special case. */
9681 if (c == 0x3A3) {
9682 mapped[0] = handle_capital_sigma(kind, data, length, i);
9683 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009684 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009685 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009686}
9687
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009688static Py_ssize_t
9689do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009690{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009691 Py_ssize_t i, k = 0;
9692 int n_res, j;
9693 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009694
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009695 c = PyUnicode_READ(kind, data, 0);
9696 n_res = _PyUnicode_ToUpperFull(c, mapped);
9697 for (j = 0; j < n_res; j++) {
9698 if (mapped[j] > *maxchar)
9699 *maxchar = mapped[j];
9700 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009701 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009702 for (i = 1; i < length; i++) {
9703 c = PyUnicode_READ(kind, data, i);
9704 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9705 for (j = 0; j < n_res; j++) {
9706 if (mapped[j] > *maxchar)
9707 *maxchar = mapped[j];
9708 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009709 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009710 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009711 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009712}
9713
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009714static Py_ssize_t
9715do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9716 Py_ssize_t i, k = 0;
9717
9718 for (i = 0; i < length; i++) {
9719 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9720 int n_res, j;
9721 if (Py_UNICODE_ISUPPER(c)) {
9722 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9723 }
9724 else if (Py_UNICODE_ISLOWER(c)) {
9725 n_res = _PyUnicode_ToUpperFull(c, mapped);
9726 }
9727 else {
9728 n_res = 1;
9729 mapped[0] = c;
9730 }
9731 for (j = 0; j < n_res; j++) {
9732 if (mapped[j] > *maxchar)
9733 *maxchar = mapped[j];
9734 res[k++] = mapped[j];
9735 }
9736 }
9737 return k;
9738}
9739
9740static Py_ssize_t
9741do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9742 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009743{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009744 Py_ssize_t i, k = 0;
9745
9746 for (i = 0; i < length; i++) {
9747 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9748 int n_res, j;
9749 if (lower)
9750 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9751 else
9752 n_res = _PyUnicode_ToUpperFull(c, mapped);
9753 for (j = 0; j < n_res; j++) {
9754 if (mapped[j] > *maxchar)
9755 *maxchar = mapped[j];
9756 res[k++] = mapped[j];
9757 }
9758 }
9759 return k;
9760}
9761
9762static Py_ssize_t
9763do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9764{
9765 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9766}
9767
9768static Py_ssize_t
9769do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9770{
9771 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9772}
9773
Benjamin Petersone51757f2012-01-12 21:10:29 -05009774static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009775do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9776{
9777 Py_ssize_t i, k = 0;
9778
9779 for (i = 0; i < length; i++) {
9780 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9781 Py_UCS4 mapped[3];
9782 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9783 for (j = 0; j < n_res; j++) {
9784 if (mapped[j] > *maxchar)
9785 *maxchar = mapped[j];
9786 res[k++] = mapped[j];
9787 }
9788 }
9789 return k;
9790}
9791
9792static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009793do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9794{
9795 Py_ssize_t i, k = 0;
9796 int previous_is_cased;
9797
9798 previous_is_cased = 0;
9799 for (i = 0; i < length; i++) {
9800 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9801 Py_UCS4 mapped[3];
9802 int n_res, j;
9803
9804 if (previous_is_cased)
9805 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9806 else
9807 n_res = _PyUnicode_ToTitleFull(c, mapped);
9808
9809 for (j = 0; j < n_res; j++) {
9810 if (mapped[j] > *maxchar)
9811 *maxchar = mapped[j];
9812 res[k++] = mapped[j];
9813 }
9814
9815 previous_is_cased = _PyUnicode_IsCased(c);
9816 }
9817 return k;
9818}
9819
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009820static PyObject *
9821case_operation(PyObject *self,
9822 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9823{
9824 PyObject *res = NULL;
9825 Py_ssize_t length, newlength = 0;
9826 int kind, outkind;
9827 void *data, *outdata;
9828 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9829
Benjamin Petersoneea48462012-01-16 14:28:50 -05009830 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009831
9832 kind = PyUnicode_KIND(self);
9833 data = PyUnicode_DATA(self);
9834 length = PyUnicode_GET_LENGTH(self);
9835 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9836 if (tmp == NULL)
9837 return PyErr_NoMemory();
9838 newlength = perform(kind, data, length, tmp, &maxchar);
9839 res = PyUnicode_New(newlength, maxchar);
9840 if (res == NULL)
9841 goto leave;
9842 tmpend = tmp + newlength;
9843 outdata = PyUnicode_DATA(res);
9844 outkind = PyUnicode_KIND(res);
9845 switch (outkind) {
9846 case PyUnicode_1BYTE_KIND:
9847 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9848 break;
9849 case PyUnicode_2BYTE_KIND:
9850 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9851 break;
9852 case PyUnicode_4BYTE_KIND:
9853 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9854 break;
9855 default:
9856 assert(0);
9857 break;
9858 }
9859 leave:
9860 PyMem_FREE(tmp);
9861 return res;
9862}
9863
Tim Peters8ce9f162004-08-27 01:49:32 +00009864PyObject *
9865PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009866{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009867 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009868 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009869 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009870 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009871 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9872 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009873 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009874 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009875 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009876 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009877 int use_memcpy;
9878 unsigned char *res_data = NULL, *sep_data = NULL;
9879 PyObject *last_obj;
9880 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009881
Tim Peters05eba1f2004-08-27 21:32:02 +00009882 fseq = PySequence_Fast(seq, "");
9883 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009884 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009885 }
9886
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009887 /* NOTE: the following code can't call back into Python code,
9888 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009889 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009890
Tim Peters05eba1f2004-08-27 21:32:02 +00009891 seqlen = PySequence_Fast_GET_SIZE(fseq);
9892 /* If empty sequence, return u"". */
9893 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009894 Py_DECREF(fseq);
9895 Py_INCREF(unicode_empty);
9896 res = unicode_empty;
9897 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009898 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009899
Tim Peters05eba1f2004-08-27 21:32:02 +00009900 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009901 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009902 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009903 if (seqlen == 1) {
9904 if (PyUnicode_CheckExact(items[0])) {
9905 res = items[0];
9906 Py_INCREF(res);
9907 Py_DECREF(fseq);
9908 return res;
9909 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009910 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009911 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009912 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009913 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009914 /* Set up sep and seplen */
9915 if (separator == NULL) {
9916 /* fall back to a blank space separator */
9917 sep = PyUnicode_FromOrdinal(' ');
9918 if (!sep)
9919 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009920 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009921 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009922 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009923 else {
9924 if (!PyUnicode_Check(separator)) {
9925 PyErr_Format(PyExc_TypeError,
9926 "separator: expected str instance,"
9927 " %.80s found",
9928 Py_TYPE(separator)->tp_name);
9929 goto onError;
9930 }
9931 if (PyUnicode_READY(separator))
9932 goto onError;
9933 sep = separator;
9934 seplen = PyUnicode_GET_LENGTH(separator);
9935 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9936 /* inc refcount to keep this code path symmetric with the
9937 above case of a blank separator */
9938 Py_INCREF(sep);
9939 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009940 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009941 }
9942
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009943 /* There are at least two things to join, or else we have a subclass
9944 * of str in the sequence.
9945 * Do a pre-pass to figure out the total amount of space we'll
9946 * need (sz), and see whether all argument are strings.
9947 */
9948 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009949#ifdef Py_DEBUG
9950 use_memcpy = 0;
9951#else
9952 use_memcpy = 1;
9953#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009954 for (i = 0; i < seqlen; i++) {
9955 const Py_ssize_t old_sz = sz;
9956 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009957 if (!PyUnicode_Check(item)) {
9958 PyErr_Format(PyExc_TypeError,
9959 "sequence item %zd: expected str instance,"
9960 " %.80s found",
9961 i, Py_TYPE(item)->tp_name);
9962 goto onError;
9963 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009964 if (PyUnicode_READY(item) == -1)
9965 goto onError;
9966 sz += PyUnicode_GET_LENGTH(item);
9967 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009968 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009969 if (i != 0)
9970 sz += seplen;
9971 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9972 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009973 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009974 goto onError;
9975 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009976 if (use_memcpy && last_obj != NULL) {
9977 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9978 use_memcpy = 0;
9979 }
9980 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009981 }
Tim Petersced69f82003-09-16 20:30:58 +00009982
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009983 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009984 if (res == NULL)
9985 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009986
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009987 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009988#ifdef Py_DEBUG
9989 use_memcpy = 0;
9990#else
9991 if (use_memcpy) {
9992 res_data = PyUnicode_1BYTE_DATA(res);
9993 kind = PyUnicode_KIND(res);
9994 if (seplen != 0)
9995 sep_data = PyUnicode_1BYTE_DATA(sep);
9996 }
9997#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009998 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009999 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010000 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010001 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +020010002 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +020010003 if (use_memcpy) {
10004 Py_MEMCPY(res_data,
10005 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010006 kind * seplen);
10007 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010008 }
10009 else {
10010 copy_characters(res, res_offset, sep, 0, seplen);
10011 res_offset += seplen;
10012 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010013 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010014 itemlen = PyUnicode_GET_LENGTH(item);
10015 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +020010016 if (use_memcpy) {
10017 Py_MEMCPY(res_data,
10018 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010019 kind * itemlen);
10020 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010021 }
10022 else {
10023 copy_characters(res, res_offset, item, 0, itemlen);
10024 res_offset += itemlen;
10025 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010026 }
Tim Peters05eba1f2004-08-27 21:32:02 +000010027 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010028 if (use_memcpy)
10029 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010030 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +020010031 else
10032 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +000010033
Tim Peters05eba1f2004-08-27 21:32:02 +000010034 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010035 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010036 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010037 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010038
Benjamin Peterson29060642009-01-31 22:14:21 +000010039 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +000010040 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010041 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010042 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010043 return NULL;
10044}
10045
/* FILL(kind, data, value, start, length)

   Store `value` into `length` consecutive code units of the buffer `data`,
   beginning at code unit index `start`.  `kind` selects the width of a code
   unit (PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or PyUnicode_4BYTE_KIND)
   and must not be PyUnicode_WCHAR_KIND.

   Every argument is parenthesized in the expansion so that compound
   expressions (e.g. `a ? b : c` or `x + y`) can be passed safely.  `kind`,
   `value` and `length` may be evaluated more than once, so do not pass
   expressions with side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
10069
Victor Stinner3fe55312012-01-04 00:33:50 +010010070Py_ssize_t
10071PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10072 Py_UCS4 fill_char)
10073{
10074 Py_ssize_t maxlen;
10075 enum PyUnicode_Kind kind;
10076 void *data;
10077
10078 if (!PyUnicode_Check(unicode)) {
10079 PyErr_BadInternalCall();
10080 return -1;
10081 }
10082 if (PyUnicode_READY(unicode) == -1)
10083 return -1;
10084 if (unicode_check_modifiable(unicode))
10085 return -1;
10086
10087 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10088 PyErr_SetString(PyExc_ValueError,
10089 "fill character is bigger than "
10090 "the string maximum character");
10091 return -1;
10092 }
10093
10094 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10095 length = Py_MIN(maxlen, length);
10096 if (length <= 0)
10097 return 0;
10098
10099 kind = PyUnicode_KIND(unicode);
10100 data = PyUnicode_DATA(unicode);
10101 FILL(kind, data, fill_char, start, length);
10102 return length;
10103}
10104
Victor Stinner9310abb2011-10-05 00:59:23 +020010105static PyObject *
10106pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010107 Py_ssize_t left,
10108 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010109 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010110{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010111 PyObject *u;
10112 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010113 int kind;
10114 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010115
10116 if (left < 0)
10117 left = 0;
10118 if (right < 0)
10119 right = 0;
10120
Victor Stinnerc4b49542011-12-11 22:44:26 +010010121 if (left == 0 && right == 0)
10122 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010123
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010124 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10125 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010126 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10127 return NULL;
10128 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010129 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10130 if (fill > maxchar)
10131 maxchar = fill;
10132 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010133 if (!u)
10134 return NULL;
10135
10136 kind = PyUnicode_KIND(u);
10137 data = PyUnicode_DATA(u);
10138 if (left)
10139 FILL(kind, data, fill, 0, left);
10140 if (right)
10141 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010142 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010143 assert(_PyUnicode_CheckConsistency(u, 1));
10144 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010145}
10146
Alexander Belopolsky40018472011-02-26 01:02:56 +000010147PyObject *
10148PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010149{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010150 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010151
10152 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010153 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010154 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060010155 if (PyUnicode_READY(string) == -1) {
10156 Py_DECREF(string);
10157 return NULL;
10158 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010159
Benjamin Petersonead6b532011-12-20 17:23:42 -060010160 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010161 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010162 if (PyUnicode_IS_ASCII(string))
10163 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010164 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010165 PyUnicode_GET_LENGTH(string), keepends);
10166 else
10167 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010168 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010169 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010170 break;
10171 case PyUnicode_2BYTE_KIND:
10172 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010173 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010174 PyUnicode_GET_LENGTH(string), keepends);
10175 break;
10176 case PyUnicode_4BYTE_KIND:
10177 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010178 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010179 PyUnicode_GET_LENGTH(string), keepends);
10180 break;
10181 default:
10182 assert(0);
10183 list = 0;
10184 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010185 Py_DECREF(string);
10186 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010187}
10188
Alexander Belopolsky40018472011-02-26 01:02:56 +000010189static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010190split(PyObject *self,
10191 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010192 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010193{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010194 int kind1, kind2, kind;
10195 void *buf1, *buf2;
10196 Py_ssize_t len1, len2;
10197 PyObject* out;
10198
Guido van Rossumd57fd912000-03-10 22:53:23 +000010199 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010200 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010201
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010202 if (PyUnicode_READY(self) == -1)
10203 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010204
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010205 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010206 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010207 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010208 if (PyUnicode_IS_ASCII(self))
10209 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010210 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010211 PyUnicode_GET_LENGTH(self), maxcount
10212 );
10213 else
10214 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010215 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010216 PyUnicode_GET_LENGTH(self), maxcount
10217 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010218 case PyUnicode_2BYTE_KIND:
10219 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010220 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010221 PyUnicode_GET_LENGTH(self), maxcount
10222 );
10223 case PyUnicode_4BYTE_KIND:
10224 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010225 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010226 PyUnicode_GET_LENGTH(self), maxcount
10227 );
10228 default:
10229 assert(0);
10230 return NULL;
10231 }
10232
10233 if (PyUnicode_READY(substring) == -1)
10234 return NULL;
10235
10236 kind1 = PyUnicode_KIND(self);
10237 kind2 = PyUnicode_KIND(substring);
10238 kind = kind1 > kind2 ? kind1 : kind2;
10239 buf1 = PyUnicode_DATA(self);
10240 buf2 = PyUnicode_DATA(substring);
10241 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010242 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010243 if (!buf1)
10244 return NULL;
10245 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010246 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010247 if (!buf2) {
10248 if (kind1 != kind) PyMem_Free(buf1);
10249 return NULL;
10250 }
10251 len1 = PyUnicode_GET_LENGTH(self);
10252 len2 = PyUnicode_GET_LENGTH(substring);
10253
Benjamin Petersonead6b532011-12-20 17:23:42 -060010254 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010255 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010256 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10257 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010258 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010259 else
10260 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010261 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010262 break;
10263 case PyUnicode_2BYTE_KIND:
10264 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010265 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010266 break;
10267 case PyUnicode_4BYTE_KIND:
10268 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010269 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010270 break;
10271 default:
10272 out = NULL;
10273 }
10274 if (kind1 != kind)
10275 PyMem_Free(buf1);
10276 if (kind2 != kind)
10277 PyMem_Free(buf2);
10278 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010279}
10280
Alexander Belopolsky40018472011-02-26 01:02:56 +000010281static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010282rsplit(PyObject *self,
10283 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010284 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010285{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010286 int kind1, kind2, kind;
10287 void *buf1, *buf2;
10288 Py_ssize_t len1, len2;
10289 PyObject* out;
10290
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010291 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010292 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010293
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010294 if (PyUnicode_READY(self) == -1)
10295 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010296
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010297 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010298 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010299 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010300 if (PyUnicode_IS_ASCII(self))
10301 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010302 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010303 PyUnicode_GET_LENGTH(self), maxcount
10304 );
10305 else
10306 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010307 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010308 PyUnicode_GET_LENGTH(self), maxcount
10309 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010310 case PyUnicode_2BYTE_KIND:
10311 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010312 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010313 PyUnicode_GET_LENGTH(self), maxcount
10314 );
10315 case PyUnicode_4BYTE_KIND:
10316 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010317 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010318 PyUnicode_GET_LENGTH(self), maxcount
10319 );
10320 default:
10321 assert(0);
10322 return NULL;
10323 }
10324
10325 if (PyUnicode_READY(substring) == -1)
10326 return NULL;
10327
10328 kind1 = PyUnicode_KIND(self);
10329 kind2 = PyUnicode_KIND(substring);
10330 kind = kind1 > kind2 ? kind1 : kind2;
10331 buf1 = PyUnicode_DATA(self);
10332 buf2 = PyUnicode_DATA(substring);
10333 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010334 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010335 if (!buf1)
10336 return NULL;
10337 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010338 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010339 if (!buf2) {
10340 if (kind1 != kind) PyMem_Free(buf1);
10341 return NULL;
10342 }
10343 len1 = PyUnicode_GET_LENGTH(self);
10344 len2 = PyUnicode_GET_LENGTH(substring);
10345
Benjamin Petersonead6b532011-12-20 17:23:42 -060010346 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010347 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010348 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10349 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010350 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010351 else
10352 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010353 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010354 break;
10355 case PyUnicode_2BYTE_KIND:
10356 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010357 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010358 break;
10359 case PyUnicode_4BYTE_KIND:
10360 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010361 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010362 break;
10363 default:
10364 out = NULL;
10365 }
10366 if (kind1 != kind)
10367 PyMem_Free(buf1);
10368 if (kind2 != kind)
10369 PyMem_Free(buf2);
10370 return out;
10371}
10372
10373static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010374anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10375 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010376{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010377 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010378 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010379 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10380 return asciilib_find(buf1, len1, buf2, len2, offset);
10381 else
10382 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010383 case PyUnicode_2BYTE_KIND:
10384 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10385 case PyUnicode_4BYTE_KIND:
10386 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10387 }
10388 assert(0);
10389 return -1;
10390}
10391
10392static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010393anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10394 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010395{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010396 switch (kind) {
10397 case PyUnicode_1BYTE_KIND:
10398 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10399 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10400 else
10401 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10402 case PyUnicode_2BYTE_KIND:
10403 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10404 case PyUnicode_4BYTE_KIND:
10405 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10406 }
10407 assert(0);
10408 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010409}
10410
Alexander Belopolsky40018472011-02-26 01:02:56 +000010411static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010412replace(PyObject *self, PyObject *str1,
10413 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010414{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010415 PyObject *u;
10416 char *sbuf = PyUnicode_DATA(self);
10417 char *buf1 = PyUnicode_DATA(str1);
10418 char *buf2 = PyUnicode_DATA(str2);
10419 int srelease = 0, release1 = 0, release2 = 0;
10420 int skind = PyUnicode_KIND(self);
10421 int kind1 = PyUnicode_KIND(str1);
10422 int kind2 = PyUnicode_KIND(str2);
10423 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10424 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10425 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010426 int mayshrink;
10427 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010428
10429 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010430 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010431 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010432 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010433
Victor Stinner59de0ee2011-10-07 10:01:28 +020010434 if (str1 == str2)
10435 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010436 if (skind < kind1)
10437 /* substring too wide to be present */
10438 goto nothing;
10439
Victor Stinner49a0a212011-10-12 23:46:10 +020010440 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10441 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10442 /* Replacing str1 with str2 may cause a maxchar reduction in the
10443 result string. */
10444 mayshrink = (maxchar_str2 < maxchar);
10445 maxchar = Py_MAX(maxchar, maxchar_str2);
10446
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010447 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010448 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010449 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010450 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010451 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010452 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010453 Py_UCS4 u1, u2;
10454 int rkind;
Victor Stinnerf6441102011-12-18 02:43:08 +010010455 Py_ssize_t index, pos;
10456 char *src;
10457
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010458 u1 = PyUnicode_READ_CHAR(str1, 0);
Victor Stinnerf6441102011-12-18 02:43:08 +010010459 pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
10460 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010461 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010462 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010463 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010464 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010465 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010466 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010467 rkind = PyUnicode_KIND(u);
Victor Stinnerf6441102011-12-18 02:43:08 +010010468
10469 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
10470 index = 0;
10471 src = sbuf;
10472 while (--maxcount)
10473 {
10474 pos++;
10475 src += pos * PyUnicode_KIND(self);
10476 slen -= pos;
10477 index += pos;
10478 pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
10479 if (pos < 0)
10480 break;
10481 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
10482 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010483 }
10484 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010485 int rkind = skind;
10486 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010487 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010488
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010489 if (kind1 < rkind) {
10490 /* widen substring */
10491 buf1 = _PyUnicode_AsKind(str1, rkind);
10492 if (!buf1) goto error;
10493 release1 = 1;
10494 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010495 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010496 if (i < 0)
10497 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010498 if (rkind > kind2) {
10499 /* widen replacement */
10500 buf2 = _PyUnicode_AsKind(str2, rkind);
10501 if (!buf2) goto error;
10502 release2 = 1;
10503 }
10504 else if (rkind < kind2) {
10505 /* widen self and buf1 */
10506 rkind = kind2;
10507 if (release1) PyMem_Free(buf1);
10508 sbuf = _PyUnicode_AsKind(self, rkind);
10509 if (!sbuf) goto error;
10510 srelease = 1;
10511 buf1 = _PyUnicode_AsKind(str1, rkind);
10512 if (!buf1) goto error;
10513 release1 = 1;
10514 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010515 u = PyUnicode_New(slen, maxchar);
10516 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010517 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010518 assert(PyUnicode_KIND(u) == rkind);
10519 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010520
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010521 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010522 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010523 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010524 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010525 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010526 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010527
10528 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010529 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010530 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010531 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010532 if (i == -1)
10533 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010534 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010535 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010536 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010537 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010538 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010539 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010540 }
10541 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010542 Py_ssize_t n, i, j, ires;
10543 Py_ssize_t product, new_size;
10544 int rkind = skind;
10545 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010546
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010547 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010548 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010549 buf1 = _PyUnicode_AsKind(str1, rkind);
10550 if (!buf1) goto error;
10551 release1 = 1;
10552 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010553 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010554 if (n == 0)
10555 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010556 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010557 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010558 buf2 = _PyUnicode_AsKind(str2, rkind);
10559 if (!buf2) goto error;
10560 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010561 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010562 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010563 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010564 rkind = kind2;
10565 sbuf = _PyUnicode_AsKind(self, rkind);
10566 if (!sbuf) goto error;
10567 srelease = 1;
10568 if (release1) PyMem_Free(buf1);
10569 buf1 = _PyUnicode_AsKind(str1, rkind);
10570 if (!buf1) goto error;
10571 release1 = 1;
10572 }
10573 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10574 PyUnicode_GET_LENGTH(str1))); */
10575 product = n * (len2-len1);
10576 if ((product / (len2-len1)) != n) {
10577 PyErr_SetString(PyExc_OverflowError,
10578 "replace string is too long");
10579 goto error;
10580 }
10581 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010582 if (new_size == 0) {
10583 Py_INCREF(unicode_empty);
10584 u = unicode_empty;
10585 goto done;
10586 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010587 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10588 PyErr_SetString(PyExc_OverflowError,
10589 "replace string is too long");
10590 goto error;
10591 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010592 u = PyUnicode_New(new_size, maxchar);
10593 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010594 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010595 assert(PyUnicode_KIND(u) == rkind);
10596 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010597 ires = i = 0;
10598 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010599 while (n-- > 0) {
10600 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010601 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010602 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010603 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010604 if (j == -1)
10605 break;
10606 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010607 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010608 memcpy(res + rkind * ires,
10609 sbuf + rkind * i,
10610 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010611 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010612 }
10613 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010614 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010615 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010616 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010617 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010618 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010619 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010620 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010621 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010622 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010623 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010624 memcpy(res + rkind * ires,
10625 sbuf + rkind * i,
10626 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010627 }
10628 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010629 /* interleave */
10630 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010631 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010632 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010633 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010634 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010635 if (--n <= 0)
10636 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010637 memcpy(res + rkind * ires,
10638 sbuf + rkind * i,
10639 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010640 ires++;
10641 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010642 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010643 memcpy(res + rkind * ires,
10644 sbuf + rkind * i,
10645 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010646 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010647 }
10648
10649 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010650 unicode_adjust_maxchar(&u);
10651 if (u == NULL)
10652 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010653 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010654
10655 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010656 if (srelease)
10657 PyMem_FREE(sbuf);
10658 if (release1)
10659 PyMem_FREE(buf1);
10660 if (release2)
10661 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010662 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010663 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010664
Benjamin Peterson29060642009-01-31 22:14:21 +000010665 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010666 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010667 if (srelease)
10668 PyMem_FREE(sbuf);
10669 if (release1)
10670 PyMem_FREE(buf1);
10671 if (release2)
10672 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010673 return unicode_result_unchanged(self);
10674
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010675 error:
10676 if (srelease && sbuf)
10677 PyMem_FREE(sbuf);
10678 if (release1 && buf1)
10679 PyMem_FREE(buf1);
10680 if (release2 && buf2)
10681 PyMem_FREE(buf2);
10682 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010683}
10684
10685/* --- Unicode Object Methods --------------------------------------------- */
10686
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010687PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010688 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010689\n\
10690Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010691characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010692
10693static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010694unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010695{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010696 if (PyUnicode_READY(self) == -1)
10697 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010698 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010699}
10700
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010701PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010702 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010703\n\
10704Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010705have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010706
10707static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010708unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010709{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010710 if (PyUnicode_READY(self) == -1)
10711 return NULL;
10712 if (PyUnicode_GET_LENGTH(self) == 0)
10713 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010714 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010715}
10716
Benjamin Petersond5890c82012-01-14 13:23:30 -050010717PyDoc_STRVAR(casefold__doc__,
10718 "S.casefold() -> str\n\
10719\n\
10720Return a version of S suitable for caseless comparisons.");
10721
10722static PyObject *
10723unicode_casefold(PyObject *self)
10724{
10725 if (PyUnicode_READY(self) == -1)
10726 return NULL;
10727 if (PyUnicode_IS_ASCII(self))
10728 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010729 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010730}
10731
10732
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010733/* Argument converter. Coerces to a single unicode character */
10734
10735static int
10736convert_uc(PyObject *obj, void *addr)
10737{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010738 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010739 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010740
Benjamin Peterson14339b62009-01-31 16:36:08 +000010741 uniobj = PyUnicode_FromObject(obj);
10742 if (uniobj == NULL) {
10743 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010744 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010745 return 0;
10746 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010747 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010748 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010749 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010750 Py_DECREF(uniobj);
10751 return 0;
10752 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010753 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010754 Py_DECREF(uniobj);
10755 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010756}
10757
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010758PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010759 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010760\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010761Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010762done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010763
10764static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010765unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010766{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010767 Py_ssize_t marg, left;
10768 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010769 Py_UCS4 fillchar = ' ';
10770
Victor Stinnere9a29352011-10-01 02:14:59 +020010771 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010772 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010773
Benjamin Petersonbac79492012-01-14 13:34:47 -050010774 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010775 return NULL;
10776
Victor Stinnerc4b49542011-12-11 22:44:26 +010010777 if (PyUnicode_GET_LENGTH(self) >= width)
10778 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010779
Victor Stinnerc4b49542011-12-11 22:44:26 +010010780 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010781 left = marg / 2 + (marg & width & 1);
10782
Victor Stinner9310abb2011-10-05 00:59:23 +020010783 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010784}
10785
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010786/* This function assumes that str1 and str2 are readied by the caller. */
10787
Marc-André Lemburge5034372000-08-08 08:04:29 +000010788static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010789unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010790{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010791 int kind1, kind2;
10792 void *data1, *data2;
10793 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010794
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010795 kind1 = PyUnicode_KIND(str1);
10796 kind2 = PyUnicode_KIND(str2);
10797 data1 = PyUnicode_DATA(str1);
10798 data2 = PyUnicode_DATA(str2);
10799 len1 = PyUnicode_GET_LENGTH(str1);
10800 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010801
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010802 for (i = 0; i < len1 && i < len2; ++i) {
10803 Py_UCS4 c1, c2;
10804 c1 = PyUnicode_READ(kind1, data1, i);
10805 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010806
10807 if (c1 != c2)
10808 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010809 }
10810
10811 return (len1 < len2) ? -1 : (len1 != len2);
10812}
10813
Alexander Belopolsky40018472011-02-26 01:02:56 +000010814int
10815PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010816{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010817 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10818 if (PyUnicode_READY(left) == -1 ||
10819 PyUnicode_READY(right) == -1)
10820 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010821 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010822 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010823 PyErr_Format(PyExc_TypeError,
10824 "Can't compare %.100s and %.100s",
10825 left->ob_type->tp_name,
10826 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010827 return -1;
10828}
10829
Martin v. Löwis5b222132007-06-10 09:51:05 +000010830int
10831PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10832{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010833 Py_ssize_t i;
10834 int kind;
10835 void *data;
10836 Py_UCS4 chr;
10837
Victor Stinner910337b2011-10-03 03:20:16 +020010838 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010839 if (PyUnicode_READY(uni) == -1)
10840 return -1;
10841 kind = PyUnicode_KIND(uni);
10842 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010843 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010844 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10845 if (chr != str[i])
10846 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010847 /* This check keeps Python strings that end in '\0' from comparing equal
10848 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010849 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010850 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010851 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010852 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010853 return 0;
10854}
10855
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010856
Benjamin Peterson29060642009-01-31 22:14:21 +000010857#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010858 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010859
Alexander Belopolsky40018472011-02-26 01:02:56 +000010860PyObject *
10861PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010862{
10863 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010864
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010865 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10866 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010867 if (PyUnicode_READY(left) == -1 ||
10868 PyUnicode_READY(right) == -1)
10869 return NULL;
10870 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10871 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010872 if (op == Py_EQ) {
10873 Py_INCREF(Py_False);
10874 return Py_False;
10875 }
10876 if (op == Py_NE) {
10877 Py_INCREF(Py_True);
10878 return Py_True;
10879 }
10880 }
10881 if (left == right)
10882 result = 0;
10883 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010884 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010885
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010886 /* Convert the return value to a Boolean */
10887 switch (op) {
10888 case Py_EQ:
10889 v = TEST_COND(result == 0);
10890 break;
10891 case Py_NE:
10892 v = TEST_COND(result != 0);
10893 break;
10894 case Py_LE:
10895 v = TEST_COND(result <= 0);
10896 break;
10897 case Py_GE:
10898 v = TEST_COND(result >= 0);
10899 break;
10900 case Py_LT:
10901 v = TEST_COND(result == -1);
10902 break;
10903 case Py_GT:
10904 v = TEST_COND(result == 1);
10905 break;
10906 default:
10907 PyErr_BadArgument();
10908 return NULL;
10909 }
10910 Py_INCREF(v);
10911 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010912 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010913
Brian Curtindfc80e32011-08-10 20:28:54 -050010914 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010915}
10916
Alexander Belopolsky40018472011-02-26 01:02:56 +000010917int
10918PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010919{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010920 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010921 int kind1, kind2, kind;
10922 void *buf1, *buf2;
10923 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010924 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010925
10926 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010927 sub = PyUnicode_FromObject(element);
10928 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010929 PyErr_Format(PyExc_TypeError,
10930 "'in <string>' requires string as left operand, not %s",
10931 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010932 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010933 }
10934
Thomas Wouters477c8d52006-05-27 19:21:47 +000010935 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010936 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010937 Py_DECREF(sub);
10938 return -1;
10939 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060010940 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
10941 Py_DECREF(sub);
10942 Py_DECREF(str);
10943 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010944
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010945 kind1 = PyUnicode_KIND(str);
10946 kind2 = PyUnicode_KIND(sub);
10947 kind = kind1 > kind2 ? kind1 : kind2;
10948 buf1 = PyUnicode_DATA(str);
10949 buf2 = PyUnicode_DATA(sub);
10950 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010951 buf1 = _PyUnicode_AsKind(str, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010952 if (!buf1) {
10953 Py_DECREF(sub);
10954 return -1;
10955 }
10956 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010957 buf2 = _PyUnicode_AsKind(sub, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010958 if (!buf2) {
10959 Py_DECREF(sub);
10960 if (kind1 != kind) PyMem_Free(buf1);
10961 return -1;
10962 }
10963 len1 = PyUnicode_GET_LENGTH(str);
10964 len2 = PyUnicode_GET_LENGTH(sub);
10965
Benjamin Petersonead6b532011-12-20 17:23:42 -060010966 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010967 case PyUnicode_1BYTE_KIND:
10968 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10969 break;
10970 case PyUnicode_2BYTE_KIND:
10971 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10972 break;
10973 case PyUnicode_4BYTE_KIND:
10974 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10975 break;
10976 default:
10977 result = -1;
10978 assert(0);
10979 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010980
10981 Py_DECREF(str);
10982 Py_DECREF(sub);
10983
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010984 if (kind1 != kind)
10985 PyMem_Free(buf1);
10986 if (kind2 != kind)
10987 PyMem_Free(buf2);
10988
Guido van Rossum403d68b2000-03-13 15:55:09 +000010989 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010990}
10991
Guido van Rossumd57fd912000-03-10 22:53:23 +000010992/* Concat to string or Unicode object giving a new Unicode object. */
10993
Alexander Belopolsky40018472011-02-26 01:02:56 +000010994PyObject *
10995PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010996{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010997 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010998 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010999 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011000
11001 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011002 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011003 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011004 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011005 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011006 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011007 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011008
11009 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020011010 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011011 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011012 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011013 }
Victor Stinnera464fc12011-10-02 20:39:30 +020011014 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011015 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011016 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011017 }
11018
Victor Stinner488fa492011-12-12 00:01:39 +010011019 u_len = PyUnicode_GET_LENGTH(u);
11020 v_len = PyUnicode_GET_LENGTH(v);
11021 if (u_len > PY_SSIZE_T_MAX - v_len) {
11022 PyErr_SetString(PyExc_OverflowError,
11023 "strings are too large to concat");
11024 goto onError;
11025 }
11026 new_len = u_len + v_len;
11027
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011028 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020011029 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
11030 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011031
Guido van Rossumd57fd912000-03-10 22:53:23 +000011032 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010011033 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011034 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011035 goto onError;
Victor Stinner488fa492011-12-12 00:01:39 +010011036 copy_characters(w, 0, u, 0, u_len);
11037 copy_characters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011038 Py_DECREF(u);
11039 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011040 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011041 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011042
Benjamin Peterson29060642009-01-31 22:14:21 +000011043 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011044 Py_XDECREF(u);
11045 Py_XDECREF(v);
11046 return NULL;
11047}
11048
Walter Dörwald1ab83302007-05-18 17:15:44 +000011049void
Victor Stinner23e56682011-10-03 03:54:37 +020011050PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011051{
Victor Stinner23e56682011-10-03 03:54:37 +020011052 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011053 Py_UCS4 maxchar, maxchar2;
11054 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011055
11056 if (p_left == NULL) {
11057 if (!PyErr_Occurred())
11058 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011059 return;
11060 }
Victor Stinner23e56682011-10-03 03:54:37 +020011061 left = *p_left;
11062 if (right == NULL || !PyUnicode_Check(left)) {
11063 if (!PyErr_Occurred())
11064 PyErr_BadInternalCall();
11065 goto error;
11066 }
11067
Benjamin Petersonbac79492012-01-14 13:34:47 -050011068 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011069 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011070 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011071 goto error;
11072
Victor Stinner488fa492011-12-12 00:01:39 +010011073 /* Shortcuts */
11074 if (left == unicode_empty) {
11075 Py_DECREF(left);
11076 Py_INCREF(right);
11077 *p_left = right;
11078 return;
11079 }
11080 if (right == unicode_empty)
11081 return;
11082
11083 left_len = PyUnicode_GET_LENGTH(left);
11084 right_len = PyUnicode_GET_LENGTH(right);
11085 if (left_len > PY_SSIZE_T_MAX - right_len) {
11086 PyErr_SetString(PyExc_OverflowError,
11087 "strings are too large to concat");
11088 goto error;
11089 }
11090 new_len = left_len + right_len;
11091
11092 if (unicode_modifiable(left)
11093 && PyUnicode_CheckExact(right)
11094 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011095 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11096 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011097 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011098 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011099 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11100 {
11101 /* append inplace */
11102 if (unicode_resize(p_left, new_len) != 0) {
11103 /* XXX if _PyUnicode_Resize() fails, 'left' has been
11104 * deallocated so it cannot be put back into
11105 * 'variable'. The MemoryError is raised when there
11106 * is no value in 'variable', which might (very
11107 * remotely) be a cause of incompatibilities.
11108 */
11109 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020011110 }
Victor Stinner488fa492011-12-12 00:01:39 +010011111 /* copy 'right' into the newly allocated area of 'left' */
11112 copy_characters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011113 }
Victor Stinner488fa492011-12-12 00:01:39 +010011114 else {
11115 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11116 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11117 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011118
Victor Stinner488fa492011-12-12 00:01:39 +010011119 /* Concat the two Unicode strings */
11120 res = PyUnicode_New(new_len, maxchar);
11121 if (res == NULL)
11122 goto error;
11123 copy_characters(res, 0, left, 0, left_len);
11124 copy_characters(res, left_len, right, 0, right_len);
11125 Py_DECREF(left);
11126 *p_left = res;
11127 }
11128 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011129 return;
11130
11131error:
Victor Stinner488fa492011-12-12 00:01:39 +010011132 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011133}
11134
11135void
11136PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11137{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011138 PyUnicode_Append(pleft, right);
11139 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011140}
11141
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011142PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011143 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011144\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011145Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011146string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011147interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011148
11149static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011150unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011151{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011152 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011153 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011154 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011155 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011156 int kind1, kind2, kind;
11157 void *buf1, *buf2;
11158 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011159
Jesus Ceaac451502011-04-20 17:09:23 +020011160 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
11161 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011162 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011163
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011164 kind1 = PyUnicode_KIND(self);
11165 kind2 = PyUnicode_KIND(substring);
11166 kind = kind1 > kind2 ? kind1 : kind2;
11167 buf1 = PyUnicode_DATA(self);
11168 buf2 = PyUnicode_DATA(substring);
11169 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010011170 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011171 if (!buf1) {
11172 Py_DECREF(substring);
11173 return NULL;
11174 }
11175 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010011176 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011177 if (!buf2) {
11178 Py_DECREF(substring);
11179 if (kind1 != kind) PyMem_Free(buf1);
11180 return NULL;
11181 }
11182 len1 = PyUnicode_GET_LENGTH(self);
11183 len2 = PyUnicode_GET_LENGTH(substring);
11184
11185 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060011186 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011187 case PyUnicode_1BYTE_KIND:
11188 iresult = ucs1lib_count(
11189 ((Py_UCS1*)buf1) + start, end - start,
11190 buf2, len2, PY_SSIZE_T_MAX
11191 );
11192 break;
11193 case PyUnicode_2BYTE_KIND:
11194 iresult = ucs2lib_count(
11195 ((Py_UCS2*)buf1) + start, end - start,
11196 buf2, len2, PY_SSIZE_T_MAX
11197 );
11198 break;
11199 case PyUnicode_4BYTE_KIND:
11200 iresult = ucs4lib_count(
11201 ((Py_UCS4*)buf1) + start, end - start,
11202 buf2, len2, PY_SSIZE_T_MAX
11203 );
11204 break;
11205 default:
11206 assert(0); iresult = 0;
11207 }
11208
11209 result = PyLong_FromSsize_t(iresult);
11210
11211 if (kind1 != kind)
11212 PyMem_Free(buf1);
11213 if (kind2 != kind)
11214 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011215
11216 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011217
Guido van Rossumd57fd912000-03-10 22:53:23 +000011218 return result;
11219}
11220
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011221PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011222 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011223\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011224Encode S using the codec registered for encoding. Default encoding\n\
11225is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011226handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011227a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11228'xmlcharrefreplace' as well as any other name registered with\n\
11229codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011230
11231static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011232unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011233{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011234 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011235 char *encoding = NULL;
11236 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011237
Benjamin Peterson308d6372009-09-18 21:42:35 +000011238 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11239 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011240 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011241 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011242}
11243
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011244PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011245 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011246\n\
11247Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011248If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011249
11250static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011251unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011252{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011253 Py_ssize_t i, j, line_pos, src_len, incr;
11254 Py_UCS4 ch;
11255 PyObject *u;
11256 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011257 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011258 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011259 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011260
11261 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011262 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011263
Antoine Pitrou22425222011-10-04 19:10:51 +020011264 if (PyUnicode_READY(self) == -1)
11265 return NULL;
11266
Thomas Wouters7e474022000-07-16 12:04:32 +000011267 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011268 src_len = PyUnicode_GET_LENGTH(self);
11269 i = j = line_pos = 0;
11270 kind = PyUnicode_KIND(self);
11271 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011272 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011273 for (; i < src_len; i++) {
11274 ch = PyUnicode_READ(kind, src_data, i);
11275 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011276 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011277 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011278 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011279 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011280 goto overflow;
11281 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011282 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011283 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011284 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011285 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011286 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011287 goto overflow;
11288 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011289 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011290 if (ch == '\n' || ch == '\r')
11291 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011292 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011293 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011294 if (!found)
11295 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011296
Guido van Rossumd57fd912000-03-10 22:53:23 +000011297 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011298 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011299 if (!u)
11300 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011301 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011302
Antoine Pitroue71d5742011-10-04 15:55:09 +020011303 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011304
Antoine Pitroue71d5742011-10-04 15:55:09 +020011305 for (; i < src_len; i++) {
11306 ch = PyUnicode_READ(kind, src_data, i);
11307 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011308 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011309 incr = tabsize - (line_pos % tabsize);
11310 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011311 FILL(kind, dest_data, ' ', j, incr);
11312 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011313 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011314 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011315 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011316 line_pos++;
11317 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011318 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011319 if (ch == '\n' || ch == '\r')
11320 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011321 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011322 }
11323 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011324 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011325
Antoine Pitroue71d5742011-10-04 15:55:09 +020011326 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011327 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11328 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011329}
11330
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011331PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011332 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011333\n\
11334Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011335such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011336arguments start and end are interpreted as in slice notation.\n\
11337\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011338Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011339
11340static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011341unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011342{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011343 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011344 Py_ssize_t start;
11345 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011346 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011347
Jesus Ceaac451502011-04-20 17:09:23 +020011348 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11349 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011350 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011351
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011352 if (PyUnicode_READY(self) == -1)
11353 return NULL;
11354 if (PyUnicode_READY(substring) == -1)
11355 return NULL;
11356
Victor Stinner7931d9a2011-11-04 00:22:48 +010011357 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011358
11359 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011360
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011361 if (result == -2)
11362 return NULL;
11363
Christian Heimes217cfd12007-12-02 14:31:20 +000011364 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011365}
11366
11367static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011368unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011369{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011370 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
11371 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011372 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011373 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011374}
11375
Guido van Rossumc2504932007-09-18 19:42:40 +000011376/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011377 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011378static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011379unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011380{
Guido van Rossumc2504932007-09-18 19:42:40 +000011381 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010011382 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011383
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011384#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011385 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011386#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011387 if (_PyUnicode_HASH(self) != -1)
11388 return _PyUnicode_HASH(self);
11389 if (PyUnicode_READY(self) == -1)
11390 return -1;
11391 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011392 /*
11393 We make the hash of the empty string be 0, rather than using
11394 (prefix ^ suffix), since this slightly obfuscates the hash secret
11395 */
11396 if (len == 0) {
11397 _PyUnicode_HASH(self) = 0;
11398 return 0;
11399 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011400
11401 /* The hash function as a macro, gets expanded three times below. */
Georg Brandl2fb477c2012-02-21 00:33:36 +010011402#define HASH(P) \
11403 x ^= (Py_uhash_t) *P << 7; \
11404 while (--len >= 0) \
11405 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011406
Georg Brandl2fb477c2012-02-21 00:33:36 +010011407 x = (Py_uhash_t) _Py_HashSecret.prefix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011408 switch (PyUnicode_KIND(self)) {
11409 case PyUnicode_1BYTE_KIND: {
11410 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11411 HASH(c);
11412 break;
11413 }
11414 case PyUnicode_2BYTE_KIND: {
11415 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11416 HASH(s);
11417 break;
11418 }
11419 default: {
11420 Py_UCS4 *l;
11421 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11422 "Impossible switch case in unicode_hash");
11423 l = PyUnicode_4BYTE_DATA(self);
11424 HASH(l);
11425 break;
11426 }
11427 }
Georg Brandl2fb477c2012-02-21 00:33:36 +010011428 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
11429 x ^= (Py_uhash_t) _Py_HashSecret.suffix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011430
Guido van Rossumc2504932007-09-18 19:42:40 +000011431 if (x == -1)
11432 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011433 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011434 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011435}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011436#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011437
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011438PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011439 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011440\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011441Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011442
11443static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011444unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011445{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011446 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011447 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011448 Py_ssize_t start;
11449 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011450
Jesus Ceaac451502011-04-20 17:09:23 +020011451 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11452 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011453 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011454
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011455 if (PyUnicode_READY(self) == -1)
11456 return NULL;
11457 if (PyUnicode_READY(substring) == -1)
11458 return NULL;
11459
Victor Stinner7931d9a2011-11-04 00:22:48 +010011460 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011461
11462 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011463
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011464 if (result == -2)
11465 return NULL;
11466
Guido van Rossumd57fd912000-03-10 22:53:23 +000011467 if (result < 0) {
11468 PyErr_SetString(PyExc_ValueError, "substring not found");
11469 return NULL;
11470 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011471
Christian Heimes217cfd12007-12-02 14:31:20 +000011472 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011473}
11474
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011475PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011476 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011477\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011478Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011479at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011480
11481static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011482unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011483{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011484 Py_ssize_t i, length;
11485 int kind;
11486 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011487 int cased;
11488
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011489 if (PyUnicode_READY(self) == -1)
11490 return NULL;
11491 length = PyUnicode_GET_LENGTH(self);
11492 kind = PyUnicode_KIND(self);
11493 data = PyUnicode_DATA(self);
11494
Guido van Rossumd57fd912000-03-10 22:53:23 +000011495 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011496 if (length == 1)
11497 return PyBool_FromLong(
11498 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011499
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011500 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011501 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011502 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011503
Guido van Rossumd57fd912000-03-10 22:53:23 +000011504 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011505 for (i = 0; i < length; i++) {
11506 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011507
Benjamin Peterson29060642009-01-31 22:14:21 +000011508 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11509 return PyBool_FromLong(0);
11510 else if (!cased && Py_UNICODE_ISLOWER(ch))
11511 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011512 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011513 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011514}
11515
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011516PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011517 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011518\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011519Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011520at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011521
11522static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011523unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011524{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011525 Py_ssize_t i, length;
11526 int kind;
11527 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011528 int cased;
11529
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011530 if (PyUnicode_READY(self) == -1)
11531 return NULL;
11532 length = PyUnicode_GET_LENGTH(self);
11533 kind = PyUnicode_KIND(self);
11534 data = PyUnicode_DATA(self);
11535
Guido van Rossumd57fd912000-03-10 22:53:23 +000011536 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011537 if (length == 1)
11538 return PyBool_FromLong(
11539 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011540
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011541 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011542 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011543 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011544
Guido van Rossumd57fd912000-03-10 22:53:23 +000011545 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011546 for (i = 0; i < length; i++) {
11547 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011548
Benjamin Peterson29060642009-01-31 22:14:21 +000011549 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11550 return PyBool_FromLong(0);
11551 else if (!cased && Py_UNICODE_ISUPPER(ch))
11552 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011553 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011554 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011555}
11556
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011557PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011558 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011559\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011560Return True if S is a titlecased string and there is at least one\n\
11561character in S, i.e. upper- and titlecase characters may only\n\
11562follow uncased characters and lowercase characters only cased ones.\n\
11563Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011564
11565static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011566unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011567{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011568 Py_ssize_t i, length;
11569 int kind;
11570 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011571 int cased, previous_is_cased;
11572
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011573 if (PyUnicode_READY(self) == -1)
11574 return NULL;
11575 length = PyUnicode_GET_LENGTH(self);
11576 kind = PyUnicode_KIND(self);
11577 data = PyUnicode_DATA(self);
11578
Guido van Rossumd57fd912000-03-10 22:53:23 +000011579 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011580 if (length == 1) {
11581 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11582 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11583 (Py_UNICODE_ISUPPER(ch) != 0));
11584 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011585
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011586 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011587 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011588 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011589
Guido van Rossumd57fd912000-03-10 22:53:23 +000011590 cased = 0;
11591 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011592 for (i = 0; i < length; i++) {
11593 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011594
Benjamin Peterson29060642009-01-31 22:14:21 +000011595 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11596 if (previous_is_cased)
11597 return PyBool_FromLong(0);
11598 previous_is_cased = 1;
11599 cased = 1;
11600 }
11601 else if (Py_UNICODE_ISLOWER(ch)) {
11602 if (!previous_is_cased)
11603 return PyBool_FromLong(0);
11604 previous_is_cased = 1;
11605 cased = 1;
11606 }
11607 else
11608 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011609 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011610 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011611}
11612
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011613PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011614 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011615\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011616Return True if all characters in S are whitespace\n\
11617and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011618
11619static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011620unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011621{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011622 Py_ssize_t i, length;
11623 int kind;
11624 void *data;
11625
11626 if (PyUnicode_READY(self) == -1)
11627 return NULL;
11628 length = PyUnicode_GET_LENGTH(self);
11629 kind = PyUnicode_KIND(self);
11630 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011631
Guido van Rossumd57fd912000-03-10 22:53:23 +000011632 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011633 if (length == 1)
11634 return PyBool_FromLong(
11635 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011636
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011637 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011638 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011639 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011640
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011641 for (i = 0; i < length; i++) {
11642 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011643 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011644 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011645 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011646 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011647}
11648
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011649PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011650 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011651\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011652Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011653and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011654
11655static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011656unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011657{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011658 Py_ssize_t i, length;
11659 int kind;
11660 void *data;
11661
11662 if (PyUnicode_READY(self) == -1)
11663 return NULL;
11664 length = PyUnicode_GET_LENGTH(self);
11665 kind = PyUnicode_KIND(self);
11666 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011667
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011668 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011669 if (length == 1)
11670 return PyBool_FromLong(
11671 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011672
11673 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011674 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011675 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011676
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011677 for (i = 0; i < length; i++) {
11678 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011679 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011680 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011681 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011682}
11683
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011684PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011685 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011686\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011687Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011688and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011689
11690static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011691unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011692{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011693 int kind;
11694 void *data;
11695 Py_ssize_t len, i;
11696
11697 if (PyUnicode_READY(self) == -1)
11698 return NULL;
11699
11700 kind = PyUnicode_KIND(self);
11701 data = PyUnicode_DATA(self);
11702 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011703
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011704 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011705 if (len == 1) {
11706 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11707 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11708 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011709
11710 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011711 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011712 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011713
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011714 for (i = 0; i < len; i++) {
11715 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011716 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011717 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011718 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011719 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011720}
11721
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011722PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011723 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011724\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011725Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011726False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011727
11728static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011729unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011730{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011731 Py_ssize_t i, length;
11732 int kind;
11733 void *data;
11734
11735 if (PyUnicode_READY(self) == -1)
11736 return NULL;
11737 length = PyUnicode_GET_LENGTH(self);
11738 kind = PyUnicode_KIND(self);
11739 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011740
Guido van Rossumd57fd912000-03-10 22:53:23 +000011741 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011742 if (length == 1)
11743 return PyBool_FromLong(
11744 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011745
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011746 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011747 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011748 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011749
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011750 for (i = 0; i < length; i++) {
11751 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011752 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011753 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011754 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011755}
11756
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011757PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011758 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011759\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011760Return True if all characters in S are digits\n\
11761and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011762
11763static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011764unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011765{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011766 Py_ssize_t i, length;
11767 int kind;
11768 void *data;
11769
11770 if (PyUnicode_READY(self) == -1)
11771 return NULL;
11772 length = PyUnicode_GET_LENGTH(self);
11773 kind = PyUnicode_KIND(self);
11774 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011775
Guido van Rossumd57fd912000-03-10 22:53:23 +000011776 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011777 if (length == 1) {
11778 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11779 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11780 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011781
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011782 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011783 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011784 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011785
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011786 for (i = 0; i < length; i++) {
11787 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011788 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011789 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011790 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011791}
11792
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011793PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011794 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011795\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011796Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011797False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011798
11799static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011800unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011801{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011802 Py_ssize_t i, length;
11803 int kind;
11804 void *data;
11805
11806 if (PyUnicode_READY(self) == -1)
11807 return NULL;
11808 length = PyUnicode_GET_LENGTH(self);
11809 kind = PyUnicode_KIND(self);
11810 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011811
Guido van Rossumd57fd912000-03-10 22:53:23 +000011812 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011813 if (length == 1)
11814 return PyBool_FromLong(
11815 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011816
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011817 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011818 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011819 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011820
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011821 for (i = 0; i < length; i++) {
11822 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011823 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011824 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011825 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011826}
11827
Martin v. Löwis47383402007-08-15 07:32:56 +000011828int
11829PyUnicode_IsIdentifier(PyObject *self)
11830{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011831 int kind;
11832 void *data;
11833 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011834 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011835
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011836 if (PyUnicode_READY(self) == -1) {
11837 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011838 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011839 }
11840
11841 /* Special case for empty strings */
11842 if (PyUnicode_GET_LENGTH(self) == 0)
11843 return 0;
11844 kind = PyUnicode_KIND(self);
11845 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011846
11847 /* PEP 3131 says that the first character must be in
11848 XID_Start and subsequent characters in XID_Continue,
11849 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011850 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011851 letters, digits, underscore). However, given the current
11852 definition of XID_Start and XID_Continue, it is sufficient
11853 to check just for these, except that _ must be allowed
11854 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011855 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011856 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011857 return 0;
11858
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011859 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011860 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011861 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011862 return 1;
11863}
11864
11865PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011866 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011867\n\
11868Return True if S is a valid identifier according\n\
11869to the language definition.");
11870
11871static PyObject*
11872unicode_isidentifier(PyObject *self)
11873{
11874 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11875}
11876
Georg Brandl559e5d72008-06-11 18:37:52 +000011877PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011878 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011879\n\
11880Return True if all characters in S are considered\n\
11881printable in repr() or S is empty, False otherwise.");
11882
11883static PyObject*
11884unicode_isprintable(PyObject *self)
11885{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011886 Py_ssize_t i, length;
11887 int kind;
11888 void *data;
11889
11890 if (PyUnicode_READY(self) == -1)
11891 return NULL;
11892 length = PyUnicode_GET_LENGTH(self);
11893 kind = PyUnicode_KIND(self);
11894 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011895
11896 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011897 if (length == 1)
11898 return PyBool_FromLong(
11899 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011900
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011901 for (i = 0; i < length; i++) {
11902 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011903 Py_RETURN_FALSE;
11904 }
11905 }
11906 Py_RETURN_TRUE;
11907}
11908
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011909PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011910 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011911\n\
11912Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011913iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011914
11915static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011916unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011917{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011918 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011919}
11920
Martin v. Löwis18e16552006-02-15 17:27:45 +000011921static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011922unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011923{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011924 if (PyUnicode_READY(self) == -1)
11925 return -1;
11926 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011927}
11928
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011929PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011930 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011931\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011932Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011933done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011934
11935static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011936unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011937{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011938 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011939 Py_UCS4 fillchar = ' ';
11940
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011941 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011942 return NULL;
11943
Benjamin Petersonbac79492012-01-14 13:34:47 -050011944 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011945 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011946
Victor Stinnerc4b49542011-12-11 22:44:26 +010011947 if (PyUnicode_GET_LENGTH(self) >= width)
11948 return unicode_result_unchanged(self);
11949
11950 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011951}
11952
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011953PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011954 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011955\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011956Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011957
11958static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011959unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011960{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011961 if (PyUnicode_READY(self) == -1)
11962 return NULL;
11963 if (PyUnicode_IS_ASCII(self))
11964 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011965 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011966}
11967
/* Strip modes: which end(s) of the string get trimmed. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the strip modes above */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: its format string without the leading
   three characters ("|O:"). */
#define STRIPNAME(i) (stripformat[i]+3)
11976
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011977/* externally visible for str.strip(unicode) */
11978PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011979_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011980{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011981 void *data;
11982 int kind;
11983 Py_ssize_t i, j, len;
11984 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011985
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011986 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11987 return NULL;
11988
11989 kind = PyUnicode_KIND(self);
11990 data = PyUnicode_DATA(self);
11991 len = PyUnicode_GET_LENGTH(self);
11992 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11993 PyUnicode_DATA(sepobj),
11994 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011995
Benjamin Peterson14339b62009-01-31 16:36:08 +000011996 i = 0;
11997 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011998 while (i < len &&
11999 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012000 i++;
12001 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012002 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012003
Benjamin Peterson14339b62009-01-31 16:36:08 +000012004 j = len;
12005 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012006 do {
12007 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012008 } while (j >= i &&
12009 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000012010 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012011 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012012
Victor Stinner7931d9a2011-11-04 00:22:48 +010012013 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012014}
12015
12016PyObject*
12017PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12018{
12019 unsigned char *data;
12020 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012021 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012022
Victor Stinnerde636f32011-10-01 03:55:54 +020012023 if (PyUnicode_READY(self) == -1)
12024 return NULL;
12025
12026 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
12027
Victor Stinner12bab6d2011-10-01 01:53:49 +020012028 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Victor Stinnerc4b49542011-12-11 22:44:26 +010012029 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012030
Victor Stinner12bab6d2011-10-01 01:53:49 +020012031 length = end - start;
12032 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012033 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012034
Victor Stinnerde636f32011-10-01 03:55:54 +020012035 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012036 PyErr_SetString(PyExc_IndexError, "string index out of range");
12037 return NULL;
12038 }
12039
Victor Stinnerb9275c12011-10-05 14:01:42 +020012040 if (PyUnicode_IS_ASCII(self)) {
12041 kind = PyUnicode_KIND(self);
12042 data = PyUnicode_1BYTE_DATA(self);
12043 return unicode_fromascii(data + start, length);
12044 }
12045 else {
12046 kind = PyUnicode_KIND(self);
12047 data = PyUnicode_1BYTE_DATA(self);
12048 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012049 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012050 length);
12051 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012052}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012053
12054static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012055do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012056{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012057 int kind;
12058 void *data;
12059 Py_ssize_t len, i, j;
12060
12061 if (PyUnicode_READY(self) == -1)
12062 return NULL;
12063
12064 kind = PyUnicode_KIND(self);
12065 data = PyUnicode_DATA(self);
12066 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012067
Benjamin Peterson14339b62009-01-31 16:36:08 +000012068 i = 0;
12069 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012070 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012071 i++;
12072 }
12073 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012074
Benjamin Peterson14339b62009-01-31 16:36:08 +000012075 j = len;
12076 if (striptype != LEFTSTRIP) {
12077 do {
12078 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012079 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012080 j++;
12081 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012082
Victor Stinner7931d9a2011-11-04 00:22:48 +010012083 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012084}
12085
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012086
12087static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012088do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012089{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012090 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012091
Benjamin Peterson14339b62009-01-31 16:36:08 +000012092 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
12093 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012094
Benjamin Peterson14339b62009-01-31 16:36:08 +000012095 if (sep != NULL && sep != Py_None) {
12096 if (PyUnicode_Check(sep))
12097 return _PyUnicode_XStrip(self, striptype, sep);
12098 else {
12099 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012100 "%s arg must be None or str",
12101 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012102 return NULL;
12103 }
12104 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012105
Benjamin Peterson14339b62009-01-31 16:36:08 +000012106 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012107}
12108
12109
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012110PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012111 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012112\n\
12113Return a copy of the string S with leading and trailing\n\
12114whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012115If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012116
12117static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012118unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012119{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012120 if (PyTuple_GET_SIZE(args) == 0)
12121 return do_strip(self, BOTHSTRIP); /* Common case */
12122 else
12123 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012124}
12125
12126
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012127PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012128 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012129\n\
12130Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012131If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012132
12133static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012134unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012135{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012136 if (PyTuple_GET_SIZE(args) == 0)
12137 return do_strip(self, LEFTSTRIP); /* Common case */
12138 else
12139 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012140}
12141
12142
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012143PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012144 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012145\n\
12146Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012147If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012148
12149static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012150unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012151{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012152 if (PyTuple_GET_SIZE(args) == 0)
12153 return do_strip(self, RIGHTSTRIP); /* Common case */
12154 else
12155 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012156}
12157
12158
Guido van Rossumd57fd912000-03-10 22:53:23 +000012159static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012160unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012161{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012162 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012163 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012164
Georg Brandl222de0f2009-04-12 12:01:50 +000012165 if (len < 1) {
12166 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020012167 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000012168 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012169
Victor Stinnerc4b49542011-12-11 22:44:26 +010012170 /* no repeat, return original string */
12171 if (len == 1)
12172 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012173
Benjamin Petersonbac79492012-01-14 13:34:47 -050012174 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012175 return NULL;
12176
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012177 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012178 PyErr_SetString(PyExc_OverflowError,
12179 "repeated string is too long");
12180 return NULL;
12181 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012182 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012183
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012184 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012185 if (!u)
12186 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012187 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012188
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012189 if (PyUnicode_GET_LENGTH(str) == 1) {
12190 const int kind = PyUnicode_KIND(str);
12191 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012192 if (kind == PyUnicode_1BYTE_KIND) {
12193 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012194 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012195 }
12196 else if (kind == PyUnicode_2BYTE_KIND) {
12197 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012198 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012199 ucs2[n] = fill_char;
12200 } else {
12201 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12202 assert(kind == PyUnicode_4BYTE_KIND);
12203 for (n = 0; n < len; ++n)
12204 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012205 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012206 }
12207 else {
12208 /* number of characters copied this far */
12209 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012210 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012211 char *to = (char *) PyUnicode_DATA(u);
12212 Py_MEMCPY(to, PyUnicode_DATA(str),
12213 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012214 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012215 n = (done <= nchars-done) ? done : nchars-done;
12216 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012217 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012218 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012219 }
12220
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012221 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012222 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012223}
12224
Alexander Belopolsky40018472011-02-26 01:02:56 +000012225PyObject *
12226PyUnicode_Replace(PyObject *obj,
12227 PyObject *subobj,
12228 PyObject *replobj,
12229 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012230{
12231 PyObject *self;
12232 PyObject *str1;
12233 PyObject *str2;
12234 PyObject *result;
12235
12236 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012237 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012238 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012239 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012240 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012241 Py_DECREF(self);
12242 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012243 }
12244 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012245 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012246 Py_DECREF(self);
12247 Py_DECREF(str1);
12248 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012249 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012250 if (PyUnicode_READY(self) == -1 ||
12251 PyUnicode_READY(str1) == -1 ||
12252 PyUnicode_READY(str2) == -1)
12253 result = NULL;
12254 else
12255 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012256 Py_DECREF(self);
12257 Py_DECREF(str1);
12258 Py_DECREF(str2);
12259 return result;
12260}
12261
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012262PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012263 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012264\n\
12265Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012266old replaced by new. If the optional argument count is\n\
12267given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012268
12269static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012270unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012271{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012272 PyObject *str1;
12273 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012274 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012275 PyObject *result;
12276
Martin v. Löwis18e16552006-02-15 17:27:45 +000012277 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012278 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012279 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012280 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012281 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012282 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012283 return NULL;
12284 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012285 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012286 Py_DECREF(str1);
12287 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012288 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012289 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12290 result = NULL;
12291 else
12292 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012293
12294 Py_DECREF(str1);
12295 Py_DECREF(str2);
12296 return result;
12297}
12298
Alexander Belopolsky40018472011-02-26 01:02:56 +000012299static PyObject *
12300unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012301{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012302 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012303 Py_ssize_t isize;
12304 Py_ssize_t osize, squote, dquote, i, o;
12305 Py_UCS4 max, quote;
12306 int ikind, okind;
12307 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012308
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012309 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012310 return NULL;
12311
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012312 isize = PyUnicode_GET_LENGTH(unicode);
12313 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012314
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012315 /* Compute length of output, quote characters, and
12316 maximum character */
12317 osize = 2; /* quotes */
12318 max = 127;
12319 squote = dquote = 0;
12320 ikind = PyUnicode_KIND(unicode);
12321 for (i = 0; i < isize; i++) {
12322 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12323 switch (ch) {
12324 case '\'': squote++; osize++; break;
12325 case '"': dquote++; osize++; break;
12326 case '\\': case '\t': case '\r': case '\n':
12327 osize += 2; break;
12328 default:
12329 /* Fast-path ASCII */
12330 if (ch < ' ' || ch == 0x7f)
12331 osize += 4; /* \xHH */
12332 else if (ch < 0x7f)
12333 osize++;
12334 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12335 osize++;
12336 max = ch > max ? ch : max;
12337 }
12338 else if (ch < 0x100)
12339 osize += 4; /* \xHH */
12340 else if (ch < 0x10000)
12341 osize += 6; /* \uHHHH */
12342 else
12343 osize += 10; /* \uHHHHHHHH */
12344 }
12345 }
12346
12347 quote = '\'';
12348 if (squote) {
12349 if (dquote)
12350 /* Both squote and dquote present. Use squote,
12351 and escape them */
12352 osize += squote;
12353 else
12354 quote = '"';
12355 }
12356
12357 repr = PyUnicode_New(osize, max);
12358 if (repr == NULL)
12359 return NULL;
12360 okind = PyUnicode_KIND(repr);
12361 odata = PyUnicode_DATA(repr);
12362
12363 PyUnicode_WRITE(okind, odata, 0, quote);
12364 PyUnicode_WRITE(okind, odata, osize-1, quote);
12365
12366 for (i = 0, o = 1; i < isize; i++) {
12367 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012368
12369 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012370 if ((ch == quote) || (ch == '\\')) {
12371 PyUnicode_WRITE(okind, odata, o++, '\\');
12372 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012373 continue;
12374 }
12375
Benjamin Peterson29060642009-01-31 22:14:21 +000012376 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012377 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012378 PyUnicode_WRITE(okind, odata, o++, '\\');
12379 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012380 }
12381 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012382 PyUnicode_WRITE(okind, odata, o++, '\\');
12383 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012384 }
12385 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012386 PyUnicode_WRITE(okind, odata, o++, '\\');
12387 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012388 }
12389
12390 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012391 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012392 PyUnicode_WRITE(okind, odata, o++, '\\');
12393 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012394 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12395 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012396 }
12397
Georg Brandl559e5d72008-06-11 18:37:52 +000012398 /* Copy ASCII characters as-is */
12399 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012400 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012401 }
12402
Benjamin Peterson29060642009-01-31 22:14:21 +000012403 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012404 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012405 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012406 (categories Z* and C* except ASCII space)
12407 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012408 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012409 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012410 if (ch <= 0xff) {
12411 PyUnicode_WRITE(okind, odata, o++, '\\');
12412 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012413 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12414 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012415 }
12416 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012417 else if (ch >= 0x10000) {
12418 PyUnicode_WRITE(okind, odata, o++, '\\');
12419 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012420 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12421 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12422 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12423 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12424 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12425 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12426 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12427 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012428 }
12429 /* Map 16-bit characters to '\uxxxx' */
12430 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012431 PyUnicode_WRITE(okind, odata, o++, '\\');
12432 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012433 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12434 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12435 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12436 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012437 }
12438 }
12439 /* Copy characters as-is */
12440 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012441 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012442 }
12443 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012444 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012445 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012446 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012447 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012448}
12449
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012450PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012451 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012452\n\
12453Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012454such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012455arguments start and end are interpreted as in slice notation.\n\
12456\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012457Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012458
12459static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012460unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012461{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012462 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012463 Py_ssize_t start;
12464 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012465 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012466
Jesus Ceaac451502011-04-20 17:09:23 +020012467 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12468 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012469 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012470
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012471 if (PyUnicode_READY(self) == -1)
12472 return NULL;
12473 if (PyUnicode_READY(substring) == -1)
12474 return NULL;
12475
Victor Stinner7931d9a2011-11-04 00:22:48 +010012476 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012477
12478 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012479
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012480 if (result == -2)
12481 return NULL;
12482
Christian Heimes217cfd12007-12-02 14:31:20 +000012483 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012484}
12485
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012486PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012487 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012488\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012489Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012490
12491static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012492unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012493{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012494 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012495 Py_ssize_t start;
12496 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012497 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012498
Jesus Ceaac451502011-04-20 17:09:23 +020012499 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12500 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012501 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012502
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012503 if (PyUnicode_READY(self) == -1)
12504 return NULL;
12505 if (PyUnicode_READY(substring) == -1)
12506 return NULL;
12507
Victor Stinner7931d9a2011-11-04 00:22:48 +010012508 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012509
12510 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012511
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012512 if (result == -2)
12513 return NULL;
12514
Guido van Rossumd57fd912000-03-10 22:53:23 +000012515 if (result < 0) {
12516 PyErr_SetString(PyExc_ValueError, "substring not found");
12517 return NULL;
12518 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012519
Christian Heimes217cfd12007-12-02 14:31:20 +000012520 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012521}
12522
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012523PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012524 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012525\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012526Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012527done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012528
12529static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012530unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012531{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012532 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012533 Py_UCS4 fillchar = ' ';
12534
Victor Stinnere9a29352011-10-01 02:14:59 +020012535 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012536 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012537
Benjamin Petersonbac79492012-01-14 13:34:47 -050012538 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012539 return NULL;
12540
Victor Stinnerc4b49542011-12-11 22:44:26 +010012541 if (PyUnicode_GET_LENGTH(self) >= width)
12542 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012543
Victor Stinnerc4b49542011-12-11 22:44:26 +010012544 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012545}
12546
Alexander Belopolsky40018472011-02-26 01:02:56 +000012547PyObject *
12548PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012549{
12550 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012551
Guido van Rossumd57fd912000-03-10 22:53:23 +000012552 s = PyUnicode_FromObject(s);
12553 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012554 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012555 if (sep != NULL) {
12556 sep = PyUnicode_FromObject(sep);
12557 if (sep == NULL) {
12558 Py_DECREF(s);
12559 return NULL;
12560 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012561 }
12562
Victor Stinner9310abb2011-10-05 00:59:23 +020012563 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012564
12565 Py_DECREF(s);
12566 Py_XDECREF(sep);
12567 return result;
12568}
12569
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012570PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012571 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012572\n\
12573Return a list of the words in S, using sep as the\n\
12574delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012575splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012576whitespace string is a separator and empty strings are\n\
12577removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012578
12579static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012580unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012581{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012582 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012583 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012584 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012585
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012586 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12587 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012588 return NULL;
12589
12590 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012591 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012592 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012593 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012594 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012595 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012596}
12597
Thomas Wouters477c8d52006-05-27 19:21:47 +000012598PyObject *
12599PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12600{
12601 PyObject* str_obj;
12602 PyObject* sep_obj;
12603 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012604 int kind1, kind2, kind;
12605 void *buf1 = NULL, *buf2 = NULL;
12606 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012607
12608 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012609 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012610 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012611 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012612 if (!sep_obj) {
12613 Py_DECREF(str_obj);
12614 return NULL;
12615 }
12616 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12617 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012618 Py_DECREF(str_obj);
12619 return NULL;
12620 }
12621
Victor Stinner14f8f022011-10-05 20:58:25 +020012622 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012623 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012624 kind = Py_MAX(kind1, kind2);
12625 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012626 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012627 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012628 if (!buf1)
12629 goto onError;
12630 buf2 = PyUnicode_DATA(sep_obj);
12631 if (kind2 != kind)
12632 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12633 if (!buf2)
12634 goto onError;
12635 len1 = PyUnicode_GET_LENGTH(str_obj);
12636 len2 = PyUnicode_GET_LENGTH(sep_obj);
12637
Benjamin Petersonead6b532011-12-20 17:23:42 -060012638 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012639 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012640 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12641 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12642 else
12643 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012644 break;
12645 case PyUnicode_2BYTE_KIND:
12646 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12647 break;
12648 case PyUnicode_4BYTE_KIND:
12649 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12650 break;
12651 default:
12652 assert(0);
12653 out = 0;
12654 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012655
12656 Py_DECREF(sep_obj);
12657 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012658 if (kind1 != kind)
12659 PyMem_Free(buf1);
12660 if (kind2 != kind)
12661 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012662
12663 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012664 onError:
12665 Py_DECREF(sep_obj);
12666 Py_DECREF(str_obj);
12667 if (kind1 != kind && buf1)
12668 PyMem_Free(buf1);
12669 if (kind2 != kind && buf2)
12670 PyMem_Free(buf2);
12671 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012672}
12673
12674
12675PyObject *
12676PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12677{
12678 PyObject* str_obj;
12679 PyObject* sep_obj;
12680 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012681 int kind1, kind2, kind;
12682 void *buf1 = NULL, *buf2 = NULL;
12683 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012684
12685 str_obj = PyUnicode_FromObject(str_in);
12686 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012687 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012688 sep_obj = PyUnicode_FromObject(sep_in);
12689 if (!sep_obj) {
12690 Py_DECREF(str_obj);
12691 return NULL;
12692 }
12693
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012694 kind1 = PyUnicode_KIND(str_in);
12695 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012696 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012697 buf1 = PyUnicode_DATA(str_in);
12698 if (kind1 != kind)
12699 buf1 = _PyUnicode_AsKind(str_in, kind);
12700 if (!buf1)
12701 goto onError;
12702 buf2 = PyUnicode_DATA(sep_obj);
12703 if (kind2 != kind)
12704 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12705 if (!buf2)
12706 goto onError;
12707 len1 = PyUnicode_GET_LENGTH(str_obj);
12708 len2 = PyUnicode_GET_LENGTH(sep_obj);
12709
Benjamin Petersonead6b532011-12-20 17:23:42 -060012710 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012711 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012712 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12713 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12714 else
12715 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012716 break;
12717 case PyUnicode_2BYTE_KIND:
12718 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12719 break;
12720 case PyUnicode_4BYTE_KIND:
12721 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12722 break;
12723 default:
12724 assert(0);
12725 out = 0;
12726 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012727
12728 Py_DECREF(sep_obj);
12729 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012730 if (kind1 != kind)
12731 PyMem_Free(buf1);
12732 if (kind2 != kind)
12733 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012734
12735 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012736 onError:
12737 Py_DECREF(sep_obj);
12738 Py_DECREF(str_obj);
12739 if (kind1 != kind && buf1)
12740 PyMem_Free(buf1);
12741 if (kind2 != kind && buf2)
12742 PyMem_Free(buf2);
12743 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012744}
12745
12746PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012747 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012748\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012749Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012750the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012751found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012752
12753static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012754unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012755{
Victor Stinner9310abb2011-10-05 00:59:23 +020012756 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012757}
12758
12759PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012760 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012761\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012762Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012763the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012764separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012765
12766static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012767unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012768{
Victor Stinner9310abb2011-10-05 00:59:23 +020012769 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012770}
12771
Alexander Belopolsky40018472011-02-26 01:02:56 +000012772PyObject *
12773PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012774{
12775 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012776
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012777 s = PyUnicode_FromObject(s);
12778 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012779 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012780 if (sep != NULL) {
12781 sep = PyUnicode_FromObject(sep);
12782 if (sep == NULL) {
12783 Py_DECREF(s);
12784 return NULL;
12785 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012786 }
12787
Victor Stinner9310abb2011-10-05 00:59:23 +020012788 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012789
12790 Py_DECREF(s);
12791 Py_XDECREF(sep);
12792 return result;
12793}
12794
12795PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012796 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012797\n\
12798Return a list of the words in S, using sep as the\n\
12799delimiter string, starting at the end of the string and\n\
12800working to the front. If maxsplit is given, at most maxsplit\n\
12801splits are done. If sep is not specified, any whitespace string\n\
12802is a separator.");
12803
12804static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012805unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012806{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012807 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012808 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012809 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012810
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012811 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12812 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012813 return NULL;
12814
12815 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012816 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012817 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012818 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012819 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012820 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012821}
12822
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012823PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012824 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012825\n\
12826Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012827Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012828is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012829
12830static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012831unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012832{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012833 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012834 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012835
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012836 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12837 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012838 return NULL;
12839
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012840 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012841}
12842
12843static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012844PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012845{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012846 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012847}
12848
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012849PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012850 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012851\n\
12852Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012853and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012854
12855static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012856unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012857{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012858 if (PyUnicode_READY(self) == -1)
12859 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012860 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012861}
12862
Georg Brandlceee0772007-11-27 23:48:05 +000012863PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012864 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012865\n\
12866Return a translation table usable for str.translate().\n\
12867If there is only one argument, it must be a dictionary mapping Unicode\n\
12868ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012869Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012870If there are two arguments, they must be strings of equal length, and\n\
12871in the resulting dictionary, each character in x will be mapped to the\n\
12872character at the same position in y. If there is a third argument, it\n\
12873must be a string, whose characters will be mapped to None in the result.");
12874
12875static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012876unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012877{
12878 PyObject *x, *y = NULL, *z = NULL;
12879 PyObject *new = NULL, *key, *value;
12880 Py_ssize_t i = 0;
12881 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012882
Georg Brandlceee0772007-11-27 23:48:05 +000012883 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12884 return NULL;
12885 new = PyDict_New();
12886 if (!new)
12887 return NULL;
12888 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012889 int x_kind, y_kind, z_kind;
12890 void *x_data, *y_data, *z_data;
12891
Georg Brandlceee0772007-11-27 23:48:05 +000012892 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012893 if (!PyUnicode_Check(x)) {
12894 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12895 "be a string if there is a second argument");
12896 goto err;
12897 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012898 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012899 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12900 "arguments must have equal length");
12901 goto err;
12902 }
12903 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012904 x_kind = PyUnicode_KIND(x);
12905 y_kind = PyUnicode_KIND(y);
12906 x_data = PyUnicode_DATA(x);
12907 y_data = PyUnicode_DATA(y);
12908 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12909 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012910 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012911 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012912 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012913 if (!value) {
12914 Py_DECREF(key);
12915 goto err;
12916 }
Georg Brandlceee0772007-11-27 23:48:05 +000012917 res = PyDict_SetItem(new, key, value);
12918 Py_DECREF(key);
12919 Py_DECREF(value);
12920 if (res < 0)
12921 goto err;
12922 }
12923 /* create entries for deleting chars in z */
12924 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012925 z_kind = PyUnicode_KIND(z);
12926 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012927 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012928 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012929 if (!key)
12930 goto err;
12931 res = PyDict_SetItem(new, key, Py_None);
12932 Py_DECREF(key);
12933 if (res < 0)
12934 goto err;
12935 }
12936 }
12937 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012938 int kind;
12939 void *data;
12940
Georg Brandlceee0772007-11-27 23:48:05 +000012941 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012942 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012943 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12944 "to maketrans it must be a dict");
12945 goto err;
12946 }
12947 /* copy entries into the new dict, converting string keys to int keys */
12948 while (PyDict_Next(x, &i, &key, &value)) {
12949 if (PyUnicode_Check(key)) {
12950 /* convert string keys to integer keys */
12951 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012952 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012953 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12954 "table must be of length 1");
12955 goto err;
12956 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012957 kind = PyUnicode_KIND(key);
12958 data = PyUnicode_DATA(key);
12959 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012960 if (!newkey)
12961 goto err;
12962 res = PyDict_SetItem(new, newkey, value);
12963 Py_DECREF(newkey);
12964 if (res < 0)
12965 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012966 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012967 /* just keep integer keys */
12968 if (PyDict_SetItem(new, key, value) < 0)
12969 goto err;
12970 } else {
12971 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12972 "be strings or integers");
12973 goto err;
12974 }
12975 }
12976 }
12977 return new;
12978 err:
12979 Py_DECREF(new);
12980 return NULL;
12981}
12982
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012983PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012984 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012985\n\
12986Return a copy of the string S, where all characters have been mapped\n\
12987through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012988Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012989Unmapped characters are left untouched. Characters mapped to None\n\
12990are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012991
12992static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012993unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012994{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012995 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012996}
12997
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012998PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012999 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013000\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013001Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013002
13003static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013004unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013005{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013006 if (PyUnicode_READY(self) == -1)
13007 return NULL;
13008 if (PyUnicode_IS_ASCII(self))
13009 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013010 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013011}
13012
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013013PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013014 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013015\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000013016Pad a numeric string S with zeros on the left, to fill a field\n\
13017of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013018
13019static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020013020unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013021{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013022 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013023 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013024 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013025 int kind;
13026 void *data;
13027 Py_UCS4 chr;
13028
Martin v. Löwis18e16552006-02-15 17:27:45 +000013029 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013030 return NULL;
13031
Benjamin Petersonbac79492012-01-14 13:34:47 -050013032 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013033 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013034
Victor Stinnerc4b49542011-12-11 22:44:26 +010013035 if (PyUnicode_GET_LENGTH(self) >= width)
13036 return unicode_result_unchanged(self);
13037
13038 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013039
13040 u = pad(self, fill, 0, '0');
13041
Walter Dörwald068325e2002-04-15 13:36:47 +000013042 if (u == NULL)
13043 return NULL;
13044
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013045 kind = PyUnicode_KIND(u);
13046 data = PyUnicode_DATA(u);
13047 chr = PyUnicode_READ(kind, data, fill);
13048
13049 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013050 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013051 PyUnicode_WRITE(kind, data, 0, chr);
13052 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013053 }
13054
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013055 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013056 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013057}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013058
#if 0
/* Compiled out by the surrounding #if 0.  Forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII() and returns its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13066
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013067PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013068 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013069\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013070Return True if S starts with the specified prefix, False otherwise.\n\
13071With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013072With optional end, stop comparing S at that position.\n\
13073prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013074
13075static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013076unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013077 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013078{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013079 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013080 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013081 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013082 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013083 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013084
Jesus Ceaac451502011-04-20 17:09:23 +020013085 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013086 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013087 if (PyTuple_Check(subobj)) {
13088 Py_ssize_t i;
13089 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013090 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013091 if (substring == NULL)
13092 return NULL;
13093 result = tailmatch(self, substring, start, end, -1);
13094 Py_DECREF(substring);
13095 if (result) {
13096 Py_RETURN_TRUE;
13097 }
13098 }
13099 /* nothing matched */
13100 Py_RETURN_FALSE;
13101 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013102 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013103 if (substring == NULL) {
13104 if (PyErr_ExceptionMatches(PyExc_TypeError))
13105 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
13106 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013107 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013108 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013109 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013110 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013111 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013112}
13113
13114
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013115PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013116 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013117\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013118Return True if S ends with the specified suffix, False otherwise.\n\
13119With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013120With optional end, stop comparing S at that position.\n\
13121suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013122
13123static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013124unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013125 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013126{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013127 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013128 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013129 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013130 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013131 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013132
Jesus Ceaac451502011-04-20 17:09:23 +020013133 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013134 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013135 if (PyTuple_Check(subobj)) {
13136 Py_ssize_t i;
13137 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013138 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000013139 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013140 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013141 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013142 result = tailmatch(self, substring, start, end, +1);
13143 Py_DECREF(substring);
13144 if (result) {
13145 Py_RETURN_TRUE;
13146 }
13147 }
13148 Py_RETURN_FALSE;
13149 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013150 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013151 if (substring == NULL) {
13152 if (PyErr_ExceptionMatches(PyExc_TypeError))
13153 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
13154 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013155 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013156 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013157 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013158 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013159 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013160}
13161
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013162#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013163
13164PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013165 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013166\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013167Return a formatted version of S, using substitutions from args and kwargs.\n\
13168The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013169
Eric Smith27bbca62010-11-04 17:06:58 +000013170PyDoc_STRVAR(format_map__doc__,
13171 "S.format_map(mapping) -> str\n\
13172\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013173Return a formatted version of S, using substitutions from mapping.\n\
13174The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013175
Eric Smith4a7d76d2008-05-30 18:10:19 +000013176static PyObject *
13177unicode__format__(PyObject* self, PyObject* args)
13178{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013179 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013180
13181 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13182 return NULL;
13183
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013184 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013185 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013186 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013187}
13188
Eric Smith8c663262007-08-25 02:26:07 +000013189PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013190 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013191\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013192Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013193
13194static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013195unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013196{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013197 Py_ssize_t size;
13198
13199 /* If it's a compact object, account for base structure +
13200 character data. */
13201 if (PyUnicode_IS_COMPACT_ASCII(v))
13202 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13203 else if (PyUnicode_IS_COMPACT(v))
13204 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013205 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013206 else {
13207 /* If it is a two-block object, account for base object, and
13208 for character block if present. */
13209 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013210 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013211 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013212 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013213 }
13214 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013215 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013216 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013217 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013218 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013219 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013220
13221 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013222}
13223
/* Docstring attached to the "__sizeof__" entry of unicode_methods. */
PyDoc_STRVAR(sizeof__doc__,
             "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013226
13227static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013228unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013229{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013230 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013231 if (!copy)
13232 return NULL;
13233 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013234}
13235
/* Method table for the str type.

   Each entry gives, in order: the Python-visible method name, the C
   function implementing it (cast to PyCFunction), the METH_* flags
   describing its calling convention, and its docstring.  Entries with
   fewer fields (e.g. "__getnewargs__") leave the docstring unset.
   The table ends with a {NULL, NULL} sentinel entry. */
static PyMethodDef unicode_methods[] = {
    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    /* maketrans is a static method: METH_STATIC means it is called
       without a string instance. */
    {"maketrans", (PyCFunction) unicode_maketrans,
     METH_VARARGS | METH_STATIC, maketrans__doc__},
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
13292
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013293static PyObject *
13294unicode_mod(PyObject *v, PyObject *w)
13295{
Brian Curtindfc80e32011-08-10 20:28:54 -050013296 if (!PyUnicode_Check(v))
13297 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013298 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013299}
13300
/* Number-protocol slots for str.  Only nb_remainder (the `%` operator) is
   filled in, with unicode_mod; the slots listed before it are explicitly
   0 and the ones after it are left to C's zero-initialization of the
   remaining struct members. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    unicode_mod,    /*nb_remainder*/
};
13307
/* Sequence-protocol slots for str: length, concatenation, repetition,
   integer indexing and containment are provided; the slice and
   item-assignment slots are 0. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,               /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem, /* sq_item */
    0,                              /* sq_slice */
    0,                              /* sq_ass_item */
    0,                              /* sq_ass_slice */
    PyUnicode_Contains,             /* sq_contains */
};
13318
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013319static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013320unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013321{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013322 if (PyUnicode_READY(self) == -1)
13323 return NULL;
13324
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013325 if (PyIndex_Check(item)) {
13326 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013327 if (i == -1 && PyErr_Occurred())
13328 return NULL;
13329 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013330 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013331 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013332 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013333 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013334 PyObject *result;
13335 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013336 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013337 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013338
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013339 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013340 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013341 return NULL;
13342 }
13343
13344 if (slicelength <= 0) {
Victor Stinner382955f2011-12-11 21:44:00 +010013345 Py_INCREF(unicode_empty);
13346 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013347 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013348 slicelength == PyUnicode_GET_LENGTH(self)) {
13349 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013350 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013351 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013352 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013353 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013354 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013355 src_kind = PyUnicode_KIND(self);
13356 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013357 if (!PyUnicode_IS_ASCII(self)) {
13358 kind_limit = kind_maxchar_limit(src_kind);
13359 max_char = 0;
13360 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13361 ch = PyUnicode_READ(src_kind, src_data, cur);
13362 if (ch > max_char) {
13363 max_char = ch;
13364 if (max_char >= kind_limit)
13365 break;
13366 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013367 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013368 }
Victor Stinner55c99112011-10-13 01:17:06 +020013369 else
13370 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013371 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013372 if (result == NULL)
13373 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013374 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013375 dest_data = PyUnicode_DATA(result);
13376
13377 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013378 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13379 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013380 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013381 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013382 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013383 } else {
13384 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13385 return NULL;
13386 }
13387}
13388
/* Mapping-protocol slots for str: length and subscripting (index or slice,
   via unicode_subscript) are provided; subscript assignment is 0. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
};
13394
Guido van Rossumd57fd912000-03-10 22:53:23 +000013395
Guido van Rossumd57fd912000-03-10 22:53:23 +000013396/* Helpers for PyUnicode_Format() */
13397
13398static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000013399getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013400{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013401 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013402 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013403 (*p_argidx)++;
13404 if (arglen < 0)
13405 return args;
13406 else
13407 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013408 }
13409 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013410 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013411 return NULL;
13412}
13413
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013414/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013415
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013416static PyObject *
13417formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013418{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013419 char *p;
13420 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013421 double x;
Tim Petersced69f82003-09-16 20:30:58 +000013422
Guido van Rossumd57fd912000-03-10 22:53:23 +000013423 x = PyFloat_AsDouble(v);
13424 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013425 return NULL;
13426
Guido van Rossumd57fd912000-03-10 22:53:23 +000013427 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013428 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013429
Eric Smith0923d1d2009-04-16 20:16:10 +000013430 p = PyOS_double_to_string(x, type, prec,
13431 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013432 if (p == NULL)
13433 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013434 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000013435 PyMem_Free(p);
13436 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013437}
13438
Victor Stinnerd0880d52012-04-27 23:40:13 +020013439/* formatlong() emulates the format codes d, u, o, x and X, and
13440 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13441 * Python's regular ints.
13442 * Return value: a new PyUnicodeObject*, or NULL if error.
13443 * The output string is of the form
13444 * "-"? ("0x" | "0X")? digit+
13445 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13446 * set in flags. The case of hex digits will be correct,
13447 * There will be at least prec digits, zero-filled on the left if
13448 * necessary to get that many.
13449 * val object to be converted
13450 * flags bitmask of format flags; only F_ALT is looked at
13451 * prec minimum number of digits; 0-fill on left if needed
13452 * type a character in [duoxX]; u acts the same as d
13453 *
13454 * CAUTION: o, x and X conversions on regular ints can never
13455 * produce a '-' sign, but can for Python's unbounded ints.
13456 */
Tim Peters38fd5b62000-09-21 05:43:11 +000013457static PyObject*
13458formatlong(PyObject *val, int flags, int prec, int type)
13459{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013460 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013461 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013462 Py_ssize_t i;
13463 int sign; /* 1 if '-', else 0 */
13464 int len; /* number of characters */
13465 Py_ssize_t llen;
13466 int numdigits; /* len == numnondigits + numdigits */
13467 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000013468
Victor Stinnerd0880d52012-04-27 23:40:13 +020013469 /* Avoid exceeding SSIZE_T_MAX */
13470 if (prec > INT_MAX-3) {
13471 PyErr_SetString(PyExc_OverflowError,
13472 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013473 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013474 }
13475
13476 assert(PyLong_Check(val));
13477
13478 switch (type) {
13479 case 'd':
13480 case 'u':
13481 /* Special-case boolean: we want 0/1 */
Victor Stinnerb11d91d2012-04-28 00:25:34 +020013482 if (PyBool_Check(val))
13483 result = PyNumber_ToBase(val, 10);
13484 else
13485 result = Py_TYPE(val)->tp_str(val);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013486 break;
13487 case 'o':
13488 numnondigits = 2;
13489 result = PyNumber_ToBase(val, 8);
13490 break;
13491 case 'x':
13492 case 'X':
13493 numnondigits = 2;
13494 result = PyNumber_ToBase(val, 16);
13495 break;
13496 default:
13497 assert(!"'type' not in [duoxX]");
13498 }
13499 if (!result)
13500 return NULL;
13501
13502 assert(unicode_modifiable(result));
13503 assert(PyUnicode_IS_READY(result));
13504 assert(PyUnicode_IS_ASCII(result));
13505
13506 /* To modify the string in-place, there can only be one reference. */
13507 if (Py_REFCNT(result) != 1) {
13508 PyErr_BadInternalCall();
13509 return NULL;
13510 }
13511 buf = PyUnicode_DATA(result);
13512 llen = PyUnicode_GET_LENGTH(result);
13513 if (llen > INT_MAX) {
13514 PyErr_SetString(PyExc_ValueError,
13515 "string too large in _PyBytes_FormatLong");
13516 return NULL;
13517 }
13518 len = (int)llen;
13519 sign = buf[0] == '-';
13520 numnondigits += sign;
13521 numdigits = len - numnondigits;
13522 assert(numdigits > 0);
13523
13524 /* Get rid of base marker unless F_ALT */
13525 if (((flags & F_ALT) == 0 &&
13526 (type == 'o' || type == 'x' || type == 'X'))) {
13527 assert(buf[sign] == '0');
13528 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13529 buf[sign+1] == 'o');
13530 numnondigits -= 2;
13531 buf += 2;
13532 len -= 2;
13533 if (sign)
13534 buf[0] = '-';
13535 assert(len == numnondigits + numdigits);
13536 assert(numdigits > 0);
13537 }
13538
13539 /* Fill with leading zeroes to meet minimum width. */
13540 if (prec > numdigits) {
13541 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13542 numnondigits + prec);
13543 char *b1;
13544 if (!r1) {
13545 Py_DECREF(result);
13546 return NULL;
13547 }
13548 b1 = PyBytes_AS_STRING(r1);
13549 for (i = 0; i < numnondigits; ++i)
13550 *b1++ = *buf++;
13551 for (i = 0; i < prec - numdigits; i++)
13552 *b1++ = '0';
13553 for (i = 0; i < numdigits; i++)
13554 *b1++ = *buf++;
13555 *b1 = '\0';
13556 Py_DECREF(result);
13557 result = r1;
13558 buf = PyBytes_AS_STRING(result);
13559 len = numnondigits + prec;
13560 }
13561
13562 /* Fix up case for hex conversions. */
13563 if (type == 'X') {
13564 /* Need to convert all lower case letters to upper case.
13565 and need to convert 0x to 0X (and -0x to -0X). */
13566 for (i = 0; i < len; i++)
13567 if (buf[i] >= 'a' && buf[i] <= 'x')
13568 buf[i] -= 'a'-'A';
13569 }
13570 if (!PyUnicode_Check(result) || len != PyUnicode_GET_LENGTH(result)) {
13571 PyObject *unicode;
13572 unicode = unicode_fromascii((unsigned char *)buf, len);
13573 Py_DECREF(result);
13574 result = unicode;
13575 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013576 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013577}
13578
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013579static Py_UCS4
13580formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013581{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013582 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013583 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013584 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013585 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013586 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013587 goto onError;
13588 }
13589 else {
13590 /* Integer input truncated to a character */
13591 long x;
13592 x = PyLong_AsLong(v);
13593 if (x == -1 && PyErr_Occurred())
13594 goto onError;
13595
Victor Stinner8faf8212011-12-08 22:14:11 +010013596 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013597 PyErr_SetString(PyExc_OverflowError,
13598 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013599 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013600 }
13601
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013602 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013603 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013604
Benjamin Peterson29060642009-01-31 22:14:21 +000013605 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013606 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013607 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013608 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013609}
13610
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013611static int
13612repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
13613{
13614 int r;
13615 assert(count > 0);
13616 assert(PyUnicode_Check(obj));
13617 if (count > 5) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013618 PyObject *repeated = unicode_repeat(obj, count);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013619 if (repeated == NULL)
13620 return -1;
13621 r = _PyAccu_Accumulate(acc, repeated);
13622 Py_DECREF(repeated);
13623 return r;
13624 }
13625 else {
13626 do {
13627 if (_PyAccu_Accumulate(acc, obj))
13628 return -1;
13629 } while (--count);
13630 return 0;
13631 }
13632}
13633
Alexander Belopolsky40018472011-02-26 01:02:56 +000013634PyObject *
13635PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013636{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013637 void *fmt;
13638 int fmtkind;
13639 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013640 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013641 int r;
13642 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013643 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013644 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013645 PyObject *temp = NULL;
13646 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013647 PyObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013648 _PyAccu acc;
13649 static PyObject *plus, *minus, *blank, *zero, *percent;
13650
13651 if (!plus && !(plus = get_latin1_char('+')))
13652 return NULL;
13653 if (!minus && !(minus = get_latin1_char('-')))
13654 return NULL;
13655 if (!blank && !(blank = get_latin1_char(' ')))
13656 return NULL;
13657 if (!zero && !(zero = get_latin1_char('0')))
13658 return NULL;
13659 if (!percent && !(percent = get_latin1_char('%')))
13660 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000013661
Guido van Rossumd57fd912000-03-10 22:53:23 +000013662 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013663 PyErr_BadInternalCall();
13664 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013665 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013666 uformat = PyUnicode_FromObject(format);
Benjamin Peterson22a29702012-01-02 09:00:30 -060013667 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013668 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060013669 if (PyUnicode_READY(uformat) == -1)
13670 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013671 if (_PyAccu_Init(&acc))
13672 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013673 fmt = PyUnicode_DATA(uformat);
13674 fmtkind = PyUnicode_KIND(uformat);
13675 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13676 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013677
Guido van Rossumd57fd912000-03-10 22:53:23 +000013678 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013679 arglen = PyTuple_Size(args);
13680 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013681 }
13682 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013683 arglen = -1;
13684 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013685 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013686 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013687 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013688 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013689
13690 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013691 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013692 PyObject *nonfmt;
13693 Py_ssize_t nonfmtpos;
13694 nonfmtpos = fmtpos++;
13695 while (fmtcnt >= 0 &&
13696 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13697 fmtpos++;
13698 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013699 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013700 nonfmt = PyUnicode_Substring(uformat, nonfmtpos, fmtpos);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013701 if (nonfmt == NULL)
13702 goto onError;
13703 r = _PyAccu_Accumulate(&acc, nonfmt);
13704 Py_DECREF(nonfmt);
13705 if (r)
13706 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013707 }
13708 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013709 /* Got a format specifier */
13710 int flags = 0;
13711 Py_ssize_t width = -1;
13712 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013713 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013714 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000013715 int isnumok;
13716 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013717 void *pbuf = NULL;
13718 Py_ssize_t pindex, len;
13719 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013720
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013721 fmtpos++;
13722 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
13723 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013724 Py_ssize_t keylen;
13725 PyObject *key;
13726 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013727
Benjamin Peterson29060642009-01-31 22:14:21 +000013728 if (dict == NULL) {
13729 PyErr_SetString(PyExc_TypeError,
13730 "format requires a mapping");
13731 goto onError;
13732 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013733 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013734 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013735 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013736 /* Skip over balanced parentheses */
13737 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013738 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013739 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013740 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013741 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013742 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013743 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013744 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013745 if (fmtcnt < 0 || pcount > 0) {
13746 PyErr_SetString(PyExc_ValueError,
13747 "incomplete format key");
13748 goto onError;
13749 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013750 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013751 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013752 if (key == NULL)
13753 goto onError;
13754 if (args_owned) {
13755 Py_DECREF(args);
13756 args_owned = 0;
13757 }
13758 args = PyObject_GetItem(dict, key);
13759 Py_DECREF(key);
13760 if (args == NULL) {
13761 goto onError;
13762 }
13763 args_owned = 1;
13764 arglen = -1;
13765 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013766 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013767 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013768 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013769 case '-': flags |= F_LJUST; continue;
13770 case '+': flags |= F_SIGN; continue;
13771 case ' ': flags |= F_BLANK; continue;
13772 case '#': flags |= F_ALT; continue;
13773 case '0': flags |= F_ZERO; continue;
13774 }
13775 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013776 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013777 if (c == '*') {
13778 v = getnextarg(args, arglen, &argidx);
13779 if (v == NULL)
13780 goto onError;
13781 if (!PyLong_Check(v)) {
13782 PyErr_SetString(PyExc_TypeError,
13783 "* wants int");
13784 goto onError;
13785 }
13786 width = PyLong_AsLong(v);
13787 if (width == -1 && PyErr_Occurred())
13788 goto onError;
13789 if (width < 0) {
13790 flags |= F_LJUST;
13791 width = -width;
13792 }
13793 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013794 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013795 }
13796 else if (c >= '0' && c <= '9') {
13797 width = c - '0';
13798 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013799 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013800 if (c < '0' || c > '9')
13801 break;
13802 if ((width*10) / 10 != width) {
13803 PyErr_SetString(PyExc_ValueError,
13804 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013805 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013806 }
13807 width = width*10 + (c - '0');
13808 }
13809 }
13810 if (c == '.') {
13811 prec = 0;
13812 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013813 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013814 if (c == '*') {
13815 v = getnextarg(args, arglen, &argidx);
13816 if (v == NULL)
13817 goto onError;
13818 if (!PyLong_Check(v)) {
13819 PyErr_SetString(PyExc_TypeError,
13820 "* wants int");
13821 goto onError;
13822 }
13823 prec = PyLong_AsLong(v);
13824 if (prec == -1 && PyErr_Occurred())
13825 goto onError;
13826 if (prec < 0)
13827 prec = 0;
13828 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013829 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013830 }
13831 else if (c >= '0' && c <= '9') {
13832 prec = c - '0';
13833 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013834 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013835 if (c < '0' || c > '9')
13836 break;
13837 if ((prec*10) / 10 != prec) {
13838 PyErr_SetString(PyExc_ValueError,
13839 "prec too big");
13840 goto onError;
13841 }
13842 prec = prec*10 + (c - '0');
13843 }
13844 }
13845 } /* prec */
13846 if (fmtcnt >= 0) {
13847 if (c == 'h' || c == 'l' || c == 'L') {
13848 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013849 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013850 }
13851 }
13852 if (fmtcnt < 0) {
13853 PyErr_SetString(PyExc_ValueError,
13854 "incomplete format");
13855 goto onError;
13856 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020013857
13858 if (c == '%') {
13859 _PyAccu_Accumulate(&acc, percent);
13860 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013861 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020013862
13863
13864 v = getnextarg(args, arglen, &argidx);
13865 if (v == NULL)
13866 goto onError;
13867
Benjamin Peterson29060642009-01-31 22:14:21 +000013868 sign = 0;
13869 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013870 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013871 switch (c) {
13872
Benjamin Peterson29060642009-01-31 22:14:21 +000013873 case 's':
13874 case 'r':
13875 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013876 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013877 temp = v;
13878 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013879 }
13880 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013881 if (c == 's')
13882 temp = PyObject_Str(v);
13883 else if (c == 'r')
13884 temp = PyObject_Repr(v);
13885 else
13886 temp = PyObject_ASCII(v);
Benjamin Peterson29060642009-01-31 22:14:21 +000013887 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013888 break;
13889
13890 case 'i':
13891 case 'd':
13892 case 'u':
13893 case 'o':
13894 case 'x':
13895 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013896 isnumok = 0;
13897 if (PyNumber_Check(v)) {
13898 PyObject *iobj=NULL;
13899
13900 if (PyLong_Check(v)) {
13901 iobj = v;
13902 Py_INCREF(iobj);
13903 }
13904 else {
13905 iobj = PyNumber_Long(v);
13906 }
13907 if (iobj!=NULL) {
13908 if (PyLong_Check(iobj)) {
13909 isnumok = 1;
Victor Stinneraff3cc62012-04-30 05:19:21 +020013910 sign = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013911 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013912 Py_DECREF(iobj);
Benjamin Peterson29060642009-01-31 22:14:21 +000013913 }
13914 else {
13915 Py_DECREF(iobj);
13916 }
13917 }
13918 }
13919 if (!isnumok) {
13920 PyErr_Format(PyExc_TypeError,
13921 "%%%c format: a number is required, "
13922 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13923 goto onError;
13924 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013925 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013926 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013927 fillobj = zero;
13928 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013929 break;
13930
13931 case 'e':
13932 case 'E':
13933 case 'f':
13934 case 'F':
13935 case 'g':
13936 case 'G':
Benjamin Peterson29060642009-01-31 22:14:21 +000013937 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013938 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013939 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013940 fillobj = zero;
13941 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020013942 temp = formatfloat(v, flags, prec, c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013943 break;
13944
13945 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013946 {
13947 Py_UCS4 ch = formatchar(v);
13948 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013949 goto onError;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013950 temp = _PyUnicode_FromUCS4(&ch, 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013951 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013952 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013953
13954 default:
13955 PyErr_Format(PyExc_ValueError,
13956 "unsupported format character '%c' (0x%x) "
13957 "at index %zd",
13958 (31<=c && c<=126) ? (char)c : '?',
13959 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013960 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013961 goto onError;
13962 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020013963 if (temp == NULL)
13964 goto onError;
13965 assert (PyUnicode_Check(temp));
13966 if (PyUnicode_READY(temp) == -1) {
13967 Py_CLEAR(temp);
13968 goto onError;
13969 }
13970 kind = PyUnicode_KIND(temp);
13971 pbuf = PyUnicode_DATA(temp);
13972 len = PyUnicode_GET_LENGTH(temp);
13973
13974 if (c == 's' || c == 'r' || c == 'a') {
13975 if (prec >= 0 && len > prec)
13976 len = prec;
13977 }
13978
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013979 /* pbuf is initialized here. */
13980 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013981 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013982 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13983 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013984 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013985 pindex++;
13986 }
13987 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13988 signobj = plus;
13989 len--;
13990 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013991 }
13992 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013993 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013994 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013995 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013996 else
13997 sign = 0;
13998 }
13999 if (width < len)
14000 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000014001 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014002 if (fill != ' ') {
14003 assert(signobj != NULL);
14004 if (_PyAccu_Accumulate(&acc, signobj))
14005 goto onError;
14006 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014007 if (width > len)
14008 width--;
14009 }
14010 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014011 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014012 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000014013 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014014 second = get_latin1_char(
14015 PyUnicode_READ(kind, pbuf, pindex + 1));
14016 pindex += 2;
14017 if (second == NULL ||
14018 _PyAccu_Accumulate(&acc, zero) ||
14019 _PyAccu_Accumulate(&acc, second))
14020 goto onError;
14021 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000014022 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014023 width -= 2;
14024 if (width < 0)
14025 width = 0;
14026 len -= 2;
14027 }
14028 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014029 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020014030 if (repeat_accumulate(&acc, fillobj, width - len))
14031 goto onError;
14032 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000014033 }
14034 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014035 if (sign) {
14036 assert(signobj != NULL);
14037 if (_PyAccu_Accumulate(&acc, signobj))
14038 goto onError;
14039 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014040 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014041 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14042 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014043 second = get_latin1_char(
14044 PyUnicode_READ(kind, pbuf, pindex + 1));
14045 pindex += 2;
14046 if (second == NULL ||
14047 _PyAccu_Accumulate(&acc, zero) ||
14048 _PyAccu_Accumulate(&acc, second))
14049 goto onError;
14050 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014051 }
14052 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014053 /* Copy all characters, preserving len */
Victor Stinnerb80e46e2012-04-30 05:21:52 +020014054 if (pindex == 0 && len == PyUnicode_GET_LENGTH(temp)) {
14055 r = _PyAccu_Accumulate(&acc, temp);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014056 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014057 else {
Victor Stinnerb80e46e2012-04-30 05:21:52 +020014058 v = PyUnicode_Substring(temp, pindex, pindex + len);
14059 if (v == NULL)
14060 goto onError;
14061 r = _PyAccu_Accumulate(&acc, v);
14062 Py_DECREF(v);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014063 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014064 if (r)
14065 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020014066 if (width > len && repeat_accumulate(&acc, blank, width - len))
14067 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000014068 if (dict && (argidx < arglen) && c != '%') {
14069 PyErr_SetString(PyExc_TypeError,
14070 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000014071 goto onError;
14072 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014073 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000014074 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014075 } /* until end */
14076 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014077 PyErr_SetString(PyExc_TypeError,
14078 "not all arguments converted during string formatting");
14079 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014080 }
14081
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014082 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014083 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014084 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014085 }
14086 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014087 Py_XDECREF(temp);
14088 Py_XDECREF(second);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014089 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014090
Benjamin Peterson29060642009-01-31 22:14:21 +000014091 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014092 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014093 Py_XDECREF(temp);
14094 Py_XDECREF(second);
14095 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014096 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014097 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014098 }
14099 return NULL;
14100}
14101
Jeremy Hylton938ace62002-07-17 16:30:39 +000014102static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014103unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14104
Tim Peters6d6c1a32001-08-02 04:15:00 +000014105static PyObject *
14106unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14107{
Benjamin Peterson29060642009-01-31 22:14:21 +000014108 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014109 static char *kwlist[] = {"object", "encoding", "errors", 0};
14110 char *encoding = NULL;
14111 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014112
Benjamin Peterson14339b62009-01-31 16:36:08 +000014113 if (type != &PyUnicode_Type)
14114 return unicode_subtype_new(type, args, kwds);
14115 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014116 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014117 return NULL;
Victor Stinner382955f2011-12-11 21:44:00 +010014118 if (x == NULL) {
14119 Py_INCREF(unicode_empty);
14120 return unicode_empty;
14121 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014122 if (encoding == NULL && errors == NULL)
14123 return PyObject_Str(x);
14124 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014125 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014126}
14127
/* Create an instance of a str subclass.
 *
 * type must be a subtype of PyUnicode_Type (asserted).  args and kwds are
 * forwarded unchanged to unicode_new() to build a temporary exact str; its
 * length, kind, ascii flag, hash and character data are then copied into a
 * new object obtained from type->tp_alloc().
 *
 * Returns a new reference to the subclass instance, or NULL on failure
 * (the out-of-memory paths in this function call PyErr_NoMemory()).
 */
Guido van Rossume023fe02001-08-30 03:12:59 +000014128static PyObject *
14129unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14130{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014131 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014132 Py_ssize_t length, char_size;
14133 int share_wstr, share_utf8;
14134 unsigned int kind;
14135 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014136
Benjamin Peterson14339b62009-01-31 16:36:08 +000014137 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014138
/* Build the value as an exact str first; it is the source of everything
   copied below and is released before returning. */
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014139 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014140 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014141 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014142 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014143 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014144 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014145 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014146 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014147
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014148 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014149 if (self == NULL) {
14150 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014151 return NULL;
14152 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014153 kind = PyUnicode_KIND(unicode);
14154 length = PyUnicode_GET_LENGTH(unicode);
14155
14156 _PyUnicode_LENGTH(self) = length;
/* In debug builds the hash is left at -1 until after the consistency
   assert further down and only then copied from the source string.
   NOTE(review): presumably the consistency check rejects a hash that is
   already set at that point -- confirm against _PyUnicode_CheckConsistency. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014157#ifdef Py_DEBUG
14158 _PyUnicode_HASH(self) = -1;
14159#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014160 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014161#endif
/* The new object is marked ready, not interned and non-compact; all
   buffer pointers start out NULL so the onError path can release a
   half-initialised object. */
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014162 _PyUnicode_STATE(self).interned = 0;
14163 _PyUnicode_STATE(self).kind = kind;
14164 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014165 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014166 _PyUnicode_STATE(self).ready = 1;
14167 _PyUnicode_WSTR(self) = NULL;
14168 _PyUnicode_UTF8_LENGTH(self) = 0;
14169 _PyUnicode_UTF8(self) = NULL;
14170 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014171 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014172
/* Pick the per-character size and decide whether the character buffer
   can also serve as the UTF-8 representation (1-byte kind with every
   character below 128) or as the wchar_t representation (character size
   equal to sizeof(wchar_t)). */
14173 share_utf8 = 0;
14174 share_wstr = 0;
14175 if (kind == PyUnicode_1BYTE_KIND) {
14176 char_size = 1;
14177 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14178 share_utf8 = 1;
14179 }
14180 else if (kind == PyUnicode_2BYTE_KIND) {
14181 char_size = 2;
14182 if (sizeof(wchar_t) == 2)
14183 share_wstr = 1;
14184 }
14185 else {
14186 assert(kind == PyUnicode_4BYTE_KIND);
14187 char_size = 4;
14188 if (sizeof(wchar_t) == 4)
14189 share_wstr = 1;
14190 }
14191
14192 /* Ensure we won't overflow the length. */
14193 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14194 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014195 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014196 }
/* Room for length characters plus one extra character slot. */
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014197 data = PyObject_MALLOC((length + 1) * char_size);
14198 if (data == NULL) {
14199 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014200 goto onError;
14201 }
14202
Victor Stinnerc3c74152011-10-02 20:39:55 +020014203 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014204 if (share_utf8) {
14205 _PyUnicode_UTF8_LENGTH(self) = length;
14206 _PyUnicode_UTF8(self) = data;
14207 }
14208 if (share_wstr) {
14209 _PyUnicode_WSTR_LENGTH(self) = length;
14210 _PyUnicode_WSTR(self) = (wchar_t *)data;
14211 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014212
/* Copies length + 1 characters.  The byte count uses kind rather than
   char_size; NOTE(review): this assumes the numeric value of each kind
   constant equals its character width in bytes -- confirm. */
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014213 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014214 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014215 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014216#ifdef Py_DEBUG
14217 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14218#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014219 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014220 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014221
/* Failure after self was allocated: drop both the temporary str and the
   partially initialised instance. */
14222onError:
14223 Py_DECREF(unicode);
14224 Py_DECREF(self);
14225 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014226}
14227
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014228PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000014229 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014230\n\
Collin Winterd474ce82007-08-07 19:42:11 +000014231Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000014232encoding defaults to the current default string encoding.\n\
14233errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014234
/* Forward declaration: unicode_iter is installed in the tp_iter slot below. */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014235static PyObject *unicode_iter(PyObject *seq);
14236
/* Type object for str.  The initializer is positional, so the entries must
   stay in the order given by the slot-name comments; a 0 entry leaves the
   slot unset. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014237PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014238 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014239 "str", /* tp_name */
14240 sizeof(PyUnicodeObject), /* tp_basicsize */
14241 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014242 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014243 (destructor)unicode_dealloc, /* tp_dealloc */
14244 0, /* tp_print */
14245 0, /* tp_getattr */
14246 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014247 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014248 unicode_repr, /* tp_repr */
14249 &unicode_as_number, /* tp_as_number */
14250 &unicode_as_sequence, /* tp_as_sequence */
14251 &unicode_as_mapping, /* tp_as_mapping */
14252 (hashfunc) unicode_hash, /* tp_hash*/
14253 0, /* tp_call*/
14254 (reprfunc) unicode_str, /* tp_str */
14255 PyObject_GenericGetAttr, /* tp_getattro */
14256 0, /* tp_setattro */
14257 0, /* tp_as_buffer */
14258 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014259 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014260 unicode_doc, /* tp_doc */
14261 0, /* tp_traverse */
14262 0, /* tp_clear */
14263 PyUnicode_RichCompare, /* tp_richcompare */
14264 0, /* tp_weaklistoffset */
14265 unicode_iter, /* tp_iter */
14266 0, /* tp_iternext */
14267 unicode_methods, /* tp_methods */
14268 0, /* tp_members */
14269 0, /* tp_getset */
14270 &PyBaseObject_Type, /* tp_base */
14271 0, /* tp_dict */
14272 0, /* tp_descr_get */
14273 0, /* tp_descr_set */
14274 0, /* tp_dictoffset */
14275 0, /* tp_init */
14276 0, /* tp_alloc */
14277 unicode_new, /* tp_new */
14278 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014279};
14280
14281/* Initialize the Unicode implementation */
14282
Victor Stinner3a50e702011-10-18 21:21:00 +020014283int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014284{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014285 int i;
14286
Thomas Wouters477c8d52006-05-27 19:21:47 +000014287 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014288 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014289 0x000A, /* LINE FEED */
14290 0x000D, /* CARRIAGE RETURN */
14291 0x001C, /* FILE SEPARATOR */
14292 0x001D, /* GROUP SEPARATOR */
14293 0x001E, /* RECORD SEPARATOR */
14294 0x0085, /* NEXT LINE */
14295 0x2028, /* LINE SEPARATOR */
14296 0x2029, /* PARAGRAPH SEPARATOR */
14297 };
14298
Fred Drakee4315f52000-05-09 19:53:39 +000014299 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020014300 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014301 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014302 Py_FatalError("Can't create empty string");
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010014303 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014304
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014305 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000014306 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000014307 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014308 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014309
14310 /* initialize the linebreak bloom filter */
14311 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014312 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014313 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014314
14315 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020014316
14317#ifdef HAVE_MBCS
14318 winver.dwOSVersionInfoSize = sizeof(winver);
14319 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14320 PyErr_SetFromWindowsErr(0);
14321 return -1;
14322 }
14323#endif
14324 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014325}
14326
14327/* Finalize the Unicode implementation */
14328
/* Free-list clearing entry point.  This implementation releases nothing
 * and always reports 0.
 * NOTE(review): the return value is presumably the number of freed
 * objects -- confirm against callers.
 */
int
PyUnicode_ClearFreeList(void)
{
    int freed = 0;

    return freed;
}
14334
Guido van Rossumd57fd912000-03-10 22:53:23 +000014335void
Thomas Wouters78890102000-07-22 19:25:51 +000014336_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014337{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014338 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014339
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000014340 Py_XDECREF(unicode_empty);
14341 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014342
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014343 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014344 if (unicode_latin1[i]) {
14345 Py_DECREF(unicode_latin1[i]);
14346 unicode_latin1[i] = NULL;
14347 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014348 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014349 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014350 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014351}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014352
/* Intern the string *p in place.
 *
 * If an equal string is already in the interned dict, *p is replaced by
 * that entry (the reference held in *p is released and a new reference to
 * the entry is stored).  Otherwise *p itself is inserted into the dict and
 * marked SSTATE_INTERNED_MORTAL.
 *
 * The function returns without changing anything when *p is not an exact
 * str, is already interned, or when the dict cannot be created or updated;
 * any exception raised on those paths is cleared.  In non-debug builds a
 * NULL or non-str *p is also ignored; debug builds assert instead.
 */
Walter Dörwald16807132007-05-25 13:52:07 +000014353void
14354PyUnicode_InternInPlace(PyObject **p)
14355{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014356 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014357 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014358#ifdef Py_DEBUG
14359 assert(s != NULL);
14360 assert(_PyUnicode_CHECK(s));
14361#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014362 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014363 return;
14364#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014365 /* If it's a subclass, we don't really know what putting
14366 it in the interned dict might do. */
14367 if (!PyUnicode_CheckExact(s))
14368 return;
14369 if (PyUnicode_CHECK_INTERNED(s))
14370 return;
/* The interned dict is created lazily on first use. */
14371 if (interned == NULL) {
14372 interned = PyDict_New();
14373 if (interned == NULL) {
14374 PyErr_Clear(); /* Don't leave an exception */
14375 return;
14376 }
14377 }
14378 /* It might be that the GetItem call fails even
14379 though the key is present in the dictionary,
14380 namely when this happens during a stack overflow. */
14381 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014382 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014383 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014384
/* Already interned under an equal key: hand the caller that object. */
Benjamin Peterson29060642009-01-31 22:14:21 +000014385 if (t) {
14386 Py_INCREF(t);
14387 Py_DECREF(*p);
14388 *p = t;
14389 return;
14390 }
Walter Dörwald16807132007-05-25 13:52:07 +000014391
/* recursion_critical is raised around the insertion and reset on both
   the failure and the success path. */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014392 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014393 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014394 PyErr_Clear();
14395 PyThreadState_GET()->recursion_critical = 0;
14396 return;
14397 }
14398 PyThreadState_GET()->recursion_critical = 0;
14399 /* The two references in interned are not counted by refcnt.
14400 The deallocator will take care of this */
14401 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014402 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014403}
14404
14405void
14406PyUnicode_InternImmortal(PyObject **p)
14407{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014408 PyUnicode_InternInPlace(p);
14409 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014410 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014411 Py_INCREF(*p);
14412 }
Walter Dörwald16807132007-05-25 13:52:07 +000014413}
14414
14415PyObject *
14416PyUnicode_InternFromString(const char *cp)
14417{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014418 PyObject *s = PyUnicode_FromString(cp);
14419 if (s == NULL)
14420 return NULL;
14421 PyUnicode_InternInPlace(&s);
14422 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014423}
14424
/* Undo interning for every string in the interned dict.
 *
 * Each key gets back the references that interning removed from its
 * refcount and is flagged SSTATE_NOT_INTERNED; the dict is then cleared,
 * released and reset to NULL.  Progress and the summed lengths of mortal
 * and immortal strings are written to stderr.
 *
 * Does nothing when the dict does not exist or its keys cannot be listed
 * (any exception from that is cleared).
 */
Alexander Belopolsky40018472011-02-26 01:02:56 +000014425void
14426_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014427{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014428 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014429 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014430 Py_ssize_t i, n;
14431 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014432
Benjamin Peterson14339b62009-01-31 16:36:08 +000014433 if (interned == NULL || !PyDict_Check(interned))
14434 return;
14435 keys = PyDict_Keys(interned);
14436 if (keys == NULL || !PyList_Check(keys)) {
14437 PyErr_Clear();
14438 return;
14439 }
Walter Dörwald16807132007-05-25 13:52:07 +000014440
Benjamin Peterson14339b62009-01-31 16:36:08 +000014441 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14442 detector, interned unicode strings are not forcibly deallocated;
14443 rather, we give them their stolen references back, and then clear
14444 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014445
Benjamin Peterson14339b62009-01-31 16:36:08 +000014446 n = PyList_GET_SIZE(keys);
14447 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014448 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014449 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014450 s = PyList_GET_ITEM(keys, i);
/* A string that cannot be readied is reported, but processing of it
   continues below. */
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014451 if (PyUnicode_READY(s) == -1) {
14452 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014453 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014454 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014455 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014456 case SSTATE_NOT_INTERNED:
14457 /* XXX Shouldn't happen */
14458 break;
/* Immortal strings are short by one reference: PyUnicode_InternInPlace()
   subtracted two and PyUnicode_InternImmortal() added one back. */
14459 case SSTATE_INTERNED_IMMORTAL:
14460 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014461 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014462 break;
/* Mortal strings are short by the two references subtracted in
   PyUnicode_InternInPlace(). */
14463 case SSTATE_INTERNED_MORTAL:
14464 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014465 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014466 break;
14467 default:
14468 Py_FatalError("Inconsistent interned string state.");
14469 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014470 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014471 }
14472 fprintf(stderr, "total size of all interned strings: "
14473 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14474 "mortal/immortal\n", mortal_size, immortal_size);
14475 Py_DECREF(keys);
14476 PyDict_Clear(interned);
14477 Py_DECREF(interned);
14478 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000014479}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014480
14481
14482/********************* Unicode Iterator **************************/
14483
14484typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014485 PyObject_HEAD
14486 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014487 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014488} unicodeiterobject;
14489
14490static void
14491unicodeiter_dealloc(unicodeiterobject *it)
14492{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014493 _PyObject_GC_UNTRACK(it);
14494 Py_XDECREF(it->it_seq);
14495 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014496}
14497
14498static int
14499unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14500{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014501 Py_VISIT(it->it_seq);
14502 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014503}
14504
14505static PyObject *
14506unicodeiter_next(unicodeiterobject *it)
14507{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014508 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014509
Benjamin Peterson14339b62009-01-31 16:36:08 +000014510 assert(it != NULL);
14511 seq = it->it_seq;
14512 if (seq == NULL)
14513 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014514 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014515
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014516 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14517 int kind = PyUnicode_KIND(seq);
14518 void *data = PyUnicode_DATA(seq);
14519 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14520 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014521 if (item != NULL)
14522 ++it->it_index;
14523 return item;
14524 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014525
Benjamin Peterson14339b62009-01-31 16:36:08 +000014526 Py_DECREF(seq);
14527 it->it_seq = NULL;
14528 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014529}
14530
14531static PyObject *
14532unicodeiter_len(unicodeiterobject *it)
14533{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014534 Py_ssize_t len = 0;
14535 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014536 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014537 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014538}
14539
14540PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14541
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014542static PyObject *
14543unicodeiter_reduce(unicodeiterobject *it)
14544{
14545 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020014546 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014547 it->it_seq, it->it_index);
14548 } else {
14549 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
14550 if (u == NULL)
14551 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020014552 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014553 }
14554}
14555
14556PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14557
14558static PyObject *
14559unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14560{
14561 Py_ssize_t index = PyLong_AsSsize_t(state);
14562 if (index == -1 && PyErr_Occurred())
14563 return NULL;
14564 if (index < 0)
14565 index = 0;
14566 it->it_index = index;
14567 Py_RETURN_NONE;
14568}
14569
14570PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14571
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014572static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014573 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014574 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014575 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
14576 reduce_doc},
14577 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
14578 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014579 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014580};
14581
14582PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014583 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14584 "str_iterator", /* tp_name */
14585 sizeof(unicodeiterobject), /* tp_basicsize */
14586 0, /* tp_itemsize */
14587 /* methods */
14588 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14589 0, /* tp_print */
14590 0, /* tp_getattr */
14591 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014592 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014593 0, /* tp_repr */
14594 0, /* tp_as_number */
14595 0, /* tp_as_sequence */
14596 0, /* tp_as_mapping */
14597 0, /* tp_hash */
14598 0, /* tp_call */
14599 0, /* tp_str */
14600 PyObject_GenericGetAttr, /* tp_getattro */
14601 0, /* tp_setattro */
14602 0, /* tp_as_buffer */
14603 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14604 0, /* tp_doc */
14605 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14606 0, /* tp_clear */
14607 0, /* tp_richcompare */
14608 0, /* tp_weaklistoffset */
14609 PyObject_SelfIter, /* tp_iter */
14610 (iternextfunc)unicodeiter_next, /* tp_iternext */
14611 unicodeiter_methods, /* tp_methods */
14612 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014613};
14614
14615static PyObject *
14616unicode_iter(PyObject *seq)
14617{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014618 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014619
Benjamin Peterson14339b62009-01-31 16:36:08 +000014620 if (!PyUnicode_Check(seq)) {
14621 PyErr_BadInternalCall();
14622 return NULL;
14623 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014624 if (PyUnicode_READY(seq) == -1)
14625 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014626 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14627 if (it == NULL)
14628 return NULL;
14629 it->it_index = 0;
14630 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014631 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014632 _PyObject_GC_TRACK(it);
14633 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014634}
14635
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014636
14637size_t
14638Py_UNICODE_strlen(const Py_UNICODE *u)
14639{
14640 int res = 0;
14641 while(*u++)
14642 res++;
14643 return res;
14644}
14645
14646Py_UNICODE*
14647Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14648{
14649 Py_UNICODE *u = s1;
14650 while ((*u++ = *s2++));
14651 return s1;
14652}
14653
14654Py_UNICODE*
14655Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14656{
14657 Py_UNICODE *u = s1;
14658 while ((*u++ = *s2++))
14659 if (n-- == 0)
14660 break;
14661 return s1;
14662}
14663
14664Py_UNICODE*
14665Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14666{
14667 Py_UNICODE *u1 = s1;
14668 u1 += Py_UNICODE_strlen(u1);
14669 Py_UNICODE_strcpy(u1, s2);
14670 return s1;
14671}
14672
14673int
14674Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14675{
14676 while (*s1 && *s2 && *s1 == *s2)
14677 s1++, s2++;
14678 if (*s1 && *s2)
14679 return (*s1 < *s2) ? -1 : +1;
14680 if (*s1)
14681 return 1;
14682 if (*s2)
14683 return -1;
14684 return 0;
14685}
14686
14687int
14688Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14689{
14690 register Py_UNICODE u1, u2;
14691 for (; n != 0; n--) {
14692 u1 = *s1;
14693 u2 = *s2;
14694 if (u1 != u2)
14695 return (u1 < u2) ? -1 : +1;
14696 if (u1 == '\0')
14697 return 0;
14698 s1++;
14699 s2++;
14700 }
14701 return 0;
14702}
14703
14704Py_UNICODE*
14705Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14706{
14707 const Py_UNICODE *p;
14708 for (p = s; *p; p++)
14709 if (*p == c)
14710 return (Py_UNICODE*)p;
14711 return NULL;
14712}
14713
14714Py_UNICODE*
14715Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14716{
14717 const Py_UNICODE *p;
14718 p = s + Py_UNICODE_strlen(s);
14719 while (p != s) {
14720 p--;
14721 if (*p == c)
14722 return (Py_UNICODE*)p;
14723 }
14724 return NULL;
14725}
Victor Stinner331ea922010-08-10 16:37:20 +000014726
Victor Stinner71133ff2010-09-01 23:43:53 +000014727Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014728PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014729{
Victor Stinner577db2c2011-10-11 22:12:48 +020014730 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014731 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014732
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014733 if (!PyUnicode_Check(unicode)) {
14734 PyErr_BadArgument();
14735 return NULL;
14736 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014737 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014738 if (u == NULL)
14739 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014740 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014741 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014742 PyErr_NoMemory();
14743 return NULL;
14744 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014745 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014746 size *= sizeof(Py_UNICODE);
14747 copy = PyMem_Malloc(size);
14748 if (copy == NULL) {
14749 PyErr_NoMemory();
14750 return NULL;
14751 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014752 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014753 return copy;
14754}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014755
Georg Brandl66c221e2010-10-14 07:04:07 +000014756/* A _string module, to export formatter_parser and formatter_field_name_split
14757 to the string.Formatter class implemented in Python. */
14758
14759static PyMethodDef _string_methods[] = {
14760 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14761 METH_O, PyDoc_STR("split the argument as a field name")},
14762 {"formatter_parser", (PyCFunction) formatter_parser,
14763 METH_O, PyDoc_STR("parse the argument as a format string")},
14764 {NULL, NULL}
14765};
14766
14767static struct PyModuleDef _string_module = {
14768 PyModuleDef_HEAD_INIT,
14769 "_string",
14770 PyDoc_STR("string helper module"),
14771 0,
14772 _string_methods,
14773 NULL,
14774 NULL,
14775 NULL,
14776 NULL
14777};
14778
14779PyMODINIT_FUNC
14780PyInit__string(void)
14781{
14782 return PyModule_Create(&_string_module);
14783}
14784
14785
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014786#ifdef __cplusplus
14787}
14788#endif