blob: ab4559f3b02d94b7d396d5e12fc1793d2254ddbe [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Guido van Rossumd57fd912000-03-10 22:53:23 +000050/* Endianness switches; defaults to little endian */
51
52#ifdef WORDS_BIGENDIAN
53# define BYTEORDER_IS_BIG_ENDIAN
54#else
55# define BYTEORDER_IS_LITTLE_ENDIAN
56#endif
57
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000058/* --- Globals ------------------------------------------------------------
59
60 The globals are initialized by the _PyUnicode_Init() API and should
61 not be used before calling that API.
62
63*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000064
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000065
66#ifdef __cplusplus
67extern "C" {
68#endif
69
Victor Stinner8faf8212011-12-08 22:14:11 +010070/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
71#define MAX_UNICODE 0x10ffff
72
Victor Stinner910337b2011-10-03 03:20:16 +020073#ifdef Py_DEBUG
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020074# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
Victor Stinner910337b2011-10-03 03:20:16 +020075#else
76# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
77#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020078
/* Unchecked access to the utf8 field of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject *)(op))->utf8)

/* Checked UTF-8 pointer of a ready string: for a compact ASCII string this
   is the address immediately following the PyASCIIObject header, otherwise
   it is the utf8 field. */
#define PyUnicode_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     (PyUnicode_IS_COMPACT_ASCII(op) \
        ? ((char *)((PyASCIIObject *)(op) + 1)) \
        : _PyUnicode_UTF8(op)))

/* Unchecked access to the utf8_length field. */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject *)(op))->utf8_length)

/* Checked UTF-8 length of a ready string: the length field for a compact
   ASCII string, the utf8_length field otherwise. */
#define PyUnicode_UTF8_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     (PyUnicode_IS_COMPACT_ASCII(op) \
        ? ((PyASCIIObject *)(op))->length \
        : _PyUnicode_UTF8_LENGTH(op)))

/* Unchecked field accessors; all of them can be used as lvalues. */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject *)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject *)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)

/* Read-only accessors that first assert op passes _PyUnicode_CHECK(). */
#define _PyUnicode_KIND(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->length)

/* Unchecked access to the data.any pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject *)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200113
/* File-local replacement for PyUnicode_READY(): evaluates to 0 if op is
   already ready, otherwise to whatever _PyUnicode_Ready(op) returns. */
#undef PyUnicode_READY
#define PyUnicode_READY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (!PyUnicode_IS_READY(op) \
        ? _PyUnicode_Ready(op) \
        : 0))
Victor Stinner910337b2011-10-03 03:20:16 +0200120
/* True if the utf8 pointer of op is the same address as PyUnicode_DATA(op).
   Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
     (PyUnicode_DATA(op) == _PyUnicode_UTF8(op)))
/* True if the wstr pointer of op is the same address as PyUnicode_DATA(op).
   Only the macro parameter is referenced, so the macro works regardless of
   how the caller names its variable. */
#define _PyUnicode_SHARE_WSTR(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
128
/* True if the Unicode object owns a UTF-8 memory block of its own, i.e. it
   is not a compact ASCII string, its utf8 pointer is set, and that pointer
   is not the same address as the character data.  The compact-ASCII test
   comes first so the utf8 field is never read for such objects. */
#define _PyUnicode_HAS_UTF8_MEMORY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (!PyUnicode_IS_COMPACT_ASCII(op) \
      && _PyUnicode_UTF8(op) != NULL \
      && PyUnicode_DATA(op) != _PyUnicode_UTF8(op)))
136
/* True if the Unicode object owns a wstr memory block of its own: wstr is
   set and either the string is not ready yet or wstr is not the same
   address as the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (_PyUnicode_WSTR(op) != NULL && \
      (!PyUnicode_IS_READY(op) || \
       PyUnicode_DATA(op) != _PyUnicode_WSTR(op))))
144
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each argument is evaluated exactly once.  The destination argument is
   parenthesized before being cast so that an expression such as
   "buf + offset" is converted as a whole.  The main loop copies four
   characters per iteration; the trailing loop handles the remaining 0-3. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~ (Py_ssize_t) 3);            \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200168
Walter Dörwald16807132007-05-25 13:52:07 +0000169/* This dictionary holds all interned unicode strings. Note that references
170 to strings in this dictionary are *not* counted in the string's ob_refcnt.
171 When the interned string reaches a refcnt of 0 the string deallocation
172 function will delete the reference from this dictionary.
173
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000176*/
177static PyObject *interned;
178
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000179/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200180static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000181
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200182/* List of static strings. */
183static _Py_Identifier *static_strings;
184
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000185/* Single character Unicode strings in the Latin-1 range are being
186 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200187static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000188
/* Fast detection of the most frequent whitespace characters.
   One entry per ASCII code point (128 entries, 16 per row); an entry is 1
   for the code points named in the row comments and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
219
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200220/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200221static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200222static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200223static void copy_characters(
224 PyObject *to, Py_ssize_t to_start,
225 PyObject *from, Py_ssize_t from_start,
226 Py_ssize_t how_many);
Victor Stinner488fa492011-12-12 00:01:39 +0100227static int unicode_modifiable(PyObject *unicode);
228
Victor Stinnerfe226c02011-10-03 03:52:20 +0200229
Alexander Belopolsky40018472011-02-26 01:02:56 +0000230static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200231unicode_fromascii(const unsigned char *s, Py_ssize_t size);
232static PyObject *
233_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
234static PyObject *
235_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
236static PyObject *
237_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
238
239static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000240unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000241 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100242 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000243 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
244
Alexander Belopolsky40018472011-02-26 01:02:56 +0000245static void
246raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300247 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100248 PyObject *unicode,
249 Py_ssize_t startpos, Py_ssize_t endpos,
250 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000251
/* Same for linebreaks: one entry per ASCII code point (128 entries), set to
   1 for the line-breaking characters named below and 0 otherwise.  The
   table is only ever read, so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
279
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300280/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
281 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000282Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000283PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000284{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000285#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000286 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000287#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000288 /* This is actually an illegal character, so it should
289 not be passed to unichr. */
290 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000291#endif
292}
293
#ifdef Py_DEBUG
/* Debug-build check of the internal invariants of a unicode object.

   Every invariant is verified with assert(); if all of them hold the
   function returns 1, which lets callers wrap the call itself in assert().
   When check_content is non-zero and the kind is not PyUnicode_WCHAR_KIND,
   the characters are scanned as well to verify that the largest character
   matches the kind (and the ascii flag) stored in the object. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *base;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    base = (PyASCIIObject *)op;
    kind = base->state.kind;

    if (base->state.ascii == 1 && base->state.compact == 1) {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(base->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;
        /* kind whose character width equals sizeof(wchar_t) */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wchar_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wchar_kind = PyUnicode_4BYTE_KIND;
#endif

        if (base->state.compact == 1) {
            /* compact, non-ASCII: data starts right after the header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(base->state.ascii == 0);
            assert(base->state.ready == 1);
            assert(compact->utf8 != data);
        }
        else {
            PyUnicodeObject *full = (PyUnicodeObject *)op;

            data = full->data.any;
            if (kind != PyUnicode_WCHAR_KIND) {
                /* non-compact string that has been made ready */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(base->state.compact == 0);
                assert(base->state.ready == 1);
                assert(data != NULL);
                if (base->state.ascii) {
                    assert(compact->utf8 == data);
                    assert(compact->utf8_length == base->length);
                }
                else {
                    assert(compact->utf8 != data);
                }
            }
            else {
                /* wchar_t-only string that is not ready yet */
                assert(base->length == 0);
                assert(base->hash == -1);
                assert(base->state.compact == 0);
                assert(base->state.ascii == 0);
                assert(base->state.ready == 0);
                assert(base->state.interned == SSTATE_NOT_INTERNED);
                assert(base->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind == wchar_kind) {
                assert(base->wstr == data);
                assert(compact->wstr_length == base->length);
            }
            else {
                assert(base->wstr != data);
            }
        }

        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (base->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t pos;
        Py_UCS4 maxchar = 0;
        void *data = PyUnicode_DATA(base);

        for (pos = 0; pos < base->length; pos++) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
            if (maxchar < ch)
                maxchar = ch;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (base->state.ascii) {
                assert(maxchar < 128);
            }
            else {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
            break;
        default:
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
            break;
        }
    }
    return 1;
}
#endif
405
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100406static PyObject*
407unicode_result_wchar(PyObject *unicode)
408{
409#ifndef Py_DEBUG
410 Py_ssize_t len;
411
412 assert(Py_REFCNT(unicode) == 1);
413
414 len = _PyUnicode_WSTR_LENGTH(unicode);
415 if (len == 0) {
416 Py_INCREF(unicode_empty);
417 Py_DECREF(unicode);
418 return unicode_empty;
419 }
420
421 if (len == 1) {
422 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
423 if (ch < 256) {
424 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
425 Py_DECREF(unicode);
426 return latin1_char;
427 }
428 }
429
430 if (_PyUnicode_Ready(unicode) < 0) {
431 Py_XDECREF(unicode);
432 return NULL;
433 }
434#else
435 /* don't make the result ready in debug mode to ensure that the caller
436 makes the string ready before using it */
437 assert(_PyUnicode_CheckConsistency(unicode, 1));
438#endif
439 return unicode;
440}
441
442static PyObject*
443unicode_result_ready(PyObject *unicode)
444{
445 Py_ssize_t length;
446
447 length = PyUnicode_GET_LENGTH(unicode);
448 if (length == 0) {
449 if (unicode != unicode_empty) {
450 Py_INCREF(unicode_empty);
451 Py_DECREF(unicode);
452 }
453 return unicode_empty;
454 }
455
456 if (length == 1) {
457 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
458 if (ch < 256) {
459 PyObject *latin1_char = unicode_latin1[ch];
460 if (latin1_char != NULL) {
461 if (unicode != latin1_char) {
462 Py_INCREF(latin1_char);
463 Py_DECREF(unicode);
464 }
465 return latin1_char;
466 }
467 else {
468 assert(_PyUnicode_CheckConsistency(unicode, 1));
469 Py_INCREF(unicode);
470 unicode_latin1[ch] = unicode;
471 return unicode;
472 }
473 }
474 }
475
476 assert(_PyUnicode_CheckConsistency(unicode, 1));
477 return unicode;
478}
479
480static PyObject*
481unicode_result(PyObject *unicode)
482{
483 assert(_PyUnicode_CHECK(unicode));
484 if (PyUnicode_IS_READY(unicode))
485 return unicode_result_ready(unicode);
486 else
487 return unicode_result_wchar(unicode);
488}
489
Victor Stinnerc4b49542011-12-11 22:44:26 +0100490static PyObject*
491unicode_result_unchanged(PyObject *unicode)
492{
493 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500494 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100495 return NULL;
496 Py_INCREF(unicode);
497 return unicode;
498 }
499 else
500 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100501 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100502}
503
Victor Stinner3a50e702011-10-18 21:21:00 +0200504#ifdef HAVE_MBCS
505static OSVERSIONINFOEX winver;
506#endif
507
Thomas Wouters477c8d52006-05-27 19:21:47 +0000508/* --- Bloom Filters ----------------------------------------------------- */
509
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits of each Unicode character as the bit index. */
513
514/* the linebreak mask is set up by Unicode_Init below */
515
Antoine Pitrouf068f942010-01-13 14:19:12 +0000516#if LONG_BIT >= 128
517#define BLOOM_WIDTH 128
518#elif LONG_BIT >= 64
519#define BLOOM_WIDTH 64
520#elif LONG_BIT >= 32
521#define BLOOM_WIDTH 32
522#else
523#error "LONG_BIT is smaller than 32"
524#endif
525
Thomas Wouters477c8d52006-05-27 19:21:47 +0000526#define BLOOM_MASK unsigned long
527
528static BLOOM_MASK bloom_linebreak;
529
/* BLOOM_ADD sets, and BLOOM tests, the bit of mask selected by the low
   bits of ch (ch modulo BLOOM_WIDTH).  The mask argument is parenthesized
   so that an expression such as "a | b" is handled as a whole; for
   BLOOM_ADD it must be a modifiable lvalue. */
#define BLOOM_ADD(mask, ch) (((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#define BLOOM(mask, ch)     (((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Line break test: characters below 128 are looked up in ascii_linebreak[];
   for the others the bloom_linebreak mask is consulted first and
   Py_UNICODE_ISLINEBREAK() is only called when the mask bit is set. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000536
Alexander Belopolsky40018472011-02-26 01:02:56 +0000537Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200538make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000539{
540 /* calculate simple bloom-style bitmask for a given unicode string */
541
Antoine Pitrouf068f942010-01-13 14:19:12 +0000542 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000543 Py_ssize_t i;
544
545 mask = 0;
546 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200547 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000548
549 return mask;
550}
551
/* True if chr occurs in str: the mask is tested first, and
   PyUnicode_FindChar() is only called over the whole string when the
   mask bit for chr is set. */
#define BLOOM_MEMBER(mask, chr, str) \
    (BLOOM(mask, chr) != 0 \
     && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000555
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200556/* Compilation of templated routines */
557
558#include "stringlib/asciilib.h"
559#include "stringlib/fastsearch.h"
560#include "stringlib/partition.h"
561#include "stringlib/split.h"
562#include "stringlib/count.h"
563#include "stringlib/find.h"
564#include "stringlib/find_max_char.h"
565#include "stringlib/localeutil.h"
566#include "stringlib/undef.h"
567
568#include "stringlib/ucs1lib.h"
569#include "stringlib/fastsearch.h"
570#include "stringlib/partition.h"
571#include "stringlib/split.h"
572#include "stringlib/count.h"
573#include "stringlib/find.h"
574#include "stringlib/find_max_char.h"
575#include "stringlib/localeutil.h"
576#include "stringlib/undef.h"
577
578#include "stringlib/ucs2lib.h"
579#include "stringlib/fastsearch.h"
580#include "stringlib/partition.h"
581#include "stringlib/split.h"
582#include "stringlib/count.h"
583#include "stringlib/find.h"
584#include "stringlib/find_max_char.h"
585#include "stringlib/localeutil.h"
586#include "stringlib/undef.h"
587
588#include "stringlib/ucs4lib.h"
589#include "stringlib/fastsearch.h"
590#include "stringlib/partition.h"
591#include "stringlib/split.h"
592#include "stringlib/count.h"
593#include "stringlib/find.h"
594#include "stringlib/find_max_char.h"
595#include "stringlib/localeutil.h"
596#include "stringlib/undef.h"
597
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200598#include "stringlib/unicodedefs.h"
599#include "stringlib/fastsearch.h"
600#include "stringlib/count.h"
601#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100602#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200603
Guido van Rossumd57fd912000-03-10 22:53:23 +0000604/* --- Unicode Object ----------------------------------------------------- */
605
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200606static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200607fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200608
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200609Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
610 Py_ssize_t size, Py_UCS4 ch,
611 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200612{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200613 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
614
615 switch (kind) {
616 case PyUnicode_1BYTE_KIND:
617 {
618 Py_UCS1 ch1 = (Py_UCS1) ch;
619 if (ch1 == ch)
620 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
621 else
622 return -1;
623 }
624 case PyUnicode_2BYTE_KIND:
625 {
626 Py_UCS2 ch2 = (Py_UCS2) ch;
627 if (ch2 == ch)
628 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
629 else
630 return -1;
631 }
632 case PyUnicode_4BYTE_KIND:
633 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
634 default:
635 assert(0);
636 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200637 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200638}
639
Victor Stinnerfe226c02011-10-03 03:52:20 +0200640static PyObject*
641resize_compact(PyObject *unicode, Py_ssize_t length)
642{
643 Py_ssize_t char_size;
644 Py_ssize_t struct_size;
645 Py_ssize_t new_size;
646 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100647 PyObject *new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200648 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100649 assert(PyUnicode_IS_COMPACT(unicode));
650
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200651 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100652 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200653 struct_size = sizeof(PyASCIIObject);
654 else
655 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200656 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200657
Victor Stinnerfe226c02011-10-03 03:52:20 +0200658 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
659 PyErr_NoMemory();
660 return NULL;
661 }
662 new_size = (struct_size + (length + 1) * char_size);
663
Victor Stinner84def372011-12-11 20:04:56 +0100664 _Py_DEC_REFTOTAL;
665 _Py_ForgetReference(unicode);
666
667 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
668 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100669 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200670 PyErr_NoMemory();
671 return NULL;
672 }
Victor Stinner84def372011-12-11 20:04:56 +0100673 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200674 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100675
Victor Stinnerfe226c02011-10-03 03:52:20 +0200676 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200677 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200678 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100679 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200680 _PyUnicode_WSTR_LENGTH(unicode) = length;
681 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200682 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
683 length, 0);
684 return unicode;
685}
686
Alexander Belopolsky40018472011-02-26 01:02:56 +0000687static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200688resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000689{
Victor Stinner95663112011-10-04 01:03:50 +0200690 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100691 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200692 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200693 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000694
Victor Stinnerfe226c02011-10-03 03:52:20 +0200695 if (PyUnicode_IS_READY(unicode)) {
696 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200697 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200698 void *data;
699
700 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200701 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200702 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
703 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200704
705 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
706 PyErr_NoMemory();
707 return -1;
708 }
709 new_size = (length + 1) * char_size;
710
Victor Stinner7a9105a2011-12-12 00:13:42 +0100711 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
712 {
713 PyObject_DEL(_PyUnicode_UTF8(unicode));
714 _PyUnicode_UTF8(unicode) = NULL;
715 _PyUnicode_UTF8_LENGTH(unicode) = 0;
716 }
717
Victor Stinnerfe226c02011-10-03 03:52:20 +0200718 data = (PyObject *)PyObject_REALLOC(data, new_size);
719 if (data == NULL) {
720 PyErr_NoMemory();
721 return -1;
722 }
723 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200724 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200725 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200726 _PyUnicode_WSTR_LENGTH(unicode) = length;
727 }
728 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200729 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200730 _PyUnicode_UTF8_LENGTH(unicode) = length;
731 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200732 _PyUnicode_LENGTH(unicode) = length;
733 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200734 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200735 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200736 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200737 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200738 }
Victor Stinner95663112011-10-04 01:03:50 +0200739 assert(_PyUnicode_WSTR(unicode) != NULL);
740
741 /* check for integer overflow */
742 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
743 PyErr_NoMemory();
744 return -1;
745 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100746 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200747 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100748 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200749 if (!wstr) {
750 PyErr_NoMemory();
751 return -1;
752 }
753 _PyUnicode_WSTR(unicode) = wstr;
754 _PyUnicode_WSTR(unicode)[length] = 0;
755 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200756 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000757 return 0;
758}
759
Victor Stinnerfe226c02011-10-03 03:52:20 +0200760static PyObject*
761resize_copy(PyObject *unicode, Py_ssize_t length)
762{
763 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100764 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200765 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100766
Benjamin Petersonbac79492012-01-14 13:34:47 -0500767 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100768 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200769
770 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
771 if (copy == NULL)
772 return NULL;
773
774 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200775 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200776 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200777 }
778 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200779 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100780
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200781 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200782 if (w == NULL)
783 return NULL;
784 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
785 copy_length = Py_MIN(copy_length, length);
786 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
787 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200788 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200789 }
790}
791
/* We allocate one more character (not byte) than requested to make sure
   the string is U+0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
       free list never reduces its size below 1.

*/
800
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200801#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200802static int unicode_old_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200803#endif
804
Alexander Belopolsky40018472011-02-26 01:02:56 +0000805static PyUnicodeObject *
806_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000807{
808 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200809 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000810
Thomas Wouters477c8d52006-05-27 19:21:47 +0000811 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000812 if (length == 0 && unicode_empty != NULL) {
813 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200814 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000815 }
816
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000817 /* Ensure we won't overflow the size. */
818 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
819 return (PyUnicodeObject *)PyErr_NoMemory();
820 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200821 if (length < 0) {
822 PyErr_SetString(PyExc_SystemError,
823 "Negative size passed to _PyUnicode_New");
824 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000825 }
826
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200827#ifdef Py_DEBUG
828 ++unicode_old_new_calls;
829#endif
830
831 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
832 if (unicode == NULL)
833 return NULL;
834 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
835 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
836 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100837 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000838 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100839 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000840 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200841
Jeremy Hyltond8082792003-09-16 19:41:39 +0000842 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000843 * the caller fails before initializing str -- unicode_resize()
844 * reads str[0], and the Keep-Alive optimization can keep memory
845 * allocated for str alive across a call to unicode_dealloc(unicode).
846 * We don't want unicode_resize to read uninitialized memory in
847 * that case.
848 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200849 _PyUnicode_WSTR(unicode)[0] = 0;
850 _PyUnicode_WSTR(unicode)[length] = 0;
851 _PyUnicode_WSTR_LENGTH(unicode) = length;
852 _PyUnicode_HASH(unicode) = -1;
853 _PyUnicode_STATE(unicode).interned = 0;
854 _PyUnicode_STATE(unicode).kind = 0;
855 _PyUnicode_STATE(unicode).compact = 0;
856 _PyUnicode_STATE(unicode).ready = 0;
857 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200858 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200859 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200860 _PyUnicode_UTF8(unicode) = NULL;
861 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100862 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000863 return unicode;
864}
865
Victor Stinnerf42dc442011-10-02 23:33:16 +0200866static const char*
867unicode_kind_name(PyObject *unicode)
868{
Victor Stinner42dfd712011-10-03 14:41:45 +0200869 /* don't check consistency: unicode_kind_name() is called from
870 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200871 if (!PyUnicode_IS_COMPACT(unicode))
872 {
873 if (!PyUnicode_IS_READY(unicode))
874 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600875 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200876 {
877 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200878 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200879 return "legacy ascii";
880 else
881 return "legacy latin1";
882 case PyUnicode_2BYTE_KIND:
883 return "legacy UCS2";
884 case PyUnicode_4BYTE_KIND:
885 return "legacy UCS4";
886 default:
887 return "<legacy invalid kind>";
888 }
889 }
890 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600891 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200892 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200893 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200894 return "ascii";
895 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200896 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200897 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200898 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200899 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200900 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200901 default:
902 return "<invalid compact kind>";
903 }
904}
905
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200906#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200907static int unicode_new_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200908
909/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
913
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
917void *_PyUnicode_data(void *unicode){
918 printf("obj %p\n", unicode);
919 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
920 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
921 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
922 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
923 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
924 return PyUnicode_DATA(unicode);
925}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200926
927void
928_PyUnicode_Dump(PyObject *op)
929{
930 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200931 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
932 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
933 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200934
Victor Stinnera849a4b2011-10-03 12:12:11 +0200935 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200936 {
937 if (ascii->state.ascii)
938 data = (ascii + 1);
939 else
940 data = (compact + 1);
941 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200942 else
943 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +0200944 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
945
Victor Stinnera849a4b2011-10-03 12:12:11 +0200946 if (ascii->wstr == data)
947 printf("shared ");
948 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +0200949
Victor Stinnera3b334d2011-10-03 13:53:37 +0200950 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200951 printf(" (%zu), ", compact->wstr_length);
952 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
953 printf("shared ");
954 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200955 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200956 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200957}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200958#endif
959
960PyObject *
961PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
962{
963 PyObject *obj;
964 PyCompactUnicodeObject *unicode;
965 void *data;
966 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200967 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200968 Py_ssize_t char_size;
969 Py_ssize_t struct_size;
970
971 /* Optimization for empty strings */
972 if (size == 0 && unicode_empty != NULL) {
973 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200974 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200975 }
976
977#ifdef Py_DEBUG
978 ++unicode_new_new_calls;
979#endif
980
Victor Stinner9e9d6892011-10-04 01:02:02 +0200981 is_ascii = 0;
982 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200983 struct_size = sizeof(PyCompactUnicodeObject);
984 if (maxchar < 128) {
985 kind_state = PyUnicode_1BYTE_KIND;
986 char_size = 1;
987 is_ascii = 1;
988 struct_size = sizeof(PyASCIIObject);
989 }
990 else if (maxchar < 256) {
991 kind_state = PyUnicode_1BYTE_KIND;
992 char_size = 1;
993 }
994 else if (maxchar < 65536) {
995 kind_state = PyUnicode_2BYTE_KIND;
996 char_size = 2;
997 if (sizeof(wchar_t) == 2)
998 is_sharing = 1;
999 }
1000 else {
Victor Stinner15e9ed22012-02-22 13:36:20 +01001001 assert(maxchar <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001002 kind_state = PyUnicode_4BYTE_KIND;
1003 char_size = 4;
1004 if (sizeof(wchar_t) == 4)
1005 is_sharing = 1;
1006 }
1007
1008 /* Ensure we won't overflow the size. */
1009 if (size < 0) {
1010 PyErr_SetString(PyExc_SystemError,
1011 "Negative size passed to PyUnicode_New");
1012 return NULL;
1013 }
1014 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1015 return PyErr_NoMemory();
1016
1017 /* Duplicated allocation code from _PyObject_New() instead of a call to
1018 * PyObject_New() so we are able to allocate space for the object and
1019 * it's data buffer.
1020 */
1021 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1022 if (obj == NULL)
1023 return PyErr_NoMemory();
1024 obj = PyObject_INIT(obj, &PyUnicode_Type);
1025 if (obj == NULL)
1026 return NULL;
1027
1028 unicode = (PyCompactUnicodeObject *)obj;
1029 if (is_ascii)
1030 data = ((PyASCIIObject*)obj) + 1;
1031 else
1032 data = unicode + 1;
1033 _PyUnicode_LENGTH(unicode) = size;
1034 _PyUnicode_HASH(unicode) = -1;
1035 _PyUnicode_STATE(unicode).interned = 0;
1036 _PyUnicode_STATE(unicode).kind = kind_state;
1037 _PyUnicode_STATE(unicode).compact = 1;
1038 _PyUnicode_STATE(unicode).ready = 1;
1039 _PyUnicode_STATE(unicode).ascii = is_ascii;
1040 if (is_ascii) {
1041 ((char*)data)[size] = 0;
1042 _PyUnicode_WSTR(unicode) = NULL;
1043 }
1044 else if (kind_state == PyUnicode_1BYTE_KIND) {
1045 ((char*)data)[size] = 0;
1046 _PyUnicode_WSTR(unicode) = NULL;
1047 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001048 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001049 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001050 }
1051 else {
1052 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001053 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001054 if (kind_state == PyUnicode_2BYTE_KIND)
1055 ((Py_UCS2*)data)[size] = 0;
1056 else /* kind_state == PyUnicode_4BYTE_KIND */
1057 ((Py_UCS4*)data)[size] = 0;
1058 if (is_sharing) {
1059 _PyUnicode_WSTR_LENGTH(unicode) = size;
1060 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1061 }
1062 else {
1063 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1064 _PyUnicode_WSTR(unicode) = NULL;
1065 }
1066 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01001067 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001068 return obj;
1069}
1070
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) into the 4-byte data buffer
   of `unicode`.  A high surrogate immediately followed by a low surrogate is
   joined into a single code point; every other unit, including a lone
   surrogate, is copied unchanged.  The remaining conversions are implemented
   as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);

    dst = PyUnicode_4BYTE_DATA(unicode);
    src = begin;
    while (src < end) {
        Py_UCS4 ch;

        /* never write past the characters the object was sized for */
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src+1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            ch = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            ch = *src;
            src += 1;
        }
        *dst++ = ch;
    }
    /* the output must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1110
Victor Stinnercd9950f2011-10-02 00:34:53 +02001111static int
Victor Stinner488fa492011-12-12 00:01:39 +01001112unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001113{
Victor Stinner488fa492011-12-12 00:01:39 +01001114 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001115 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001116 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001117 return -1;
1118 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001119 return 0;
1120}
1121
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001122static int
1123_copy_characters(PyObject *to, Py_ssize_t to_start,
1124 PyObject *from, Py_ssize_t from_start,
1125 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001126{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001127 unsigned int from_kind, to_kind;
1128 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001129 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001130
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001131 assert(PyUnicode_Check(from));
1132 assert(PyUnicode_Check(to));
1133 assert(PyUnicode_IS_READY(from));
1134 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001135
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001136 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1137 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1138 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001139
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001140 if (how_many == 0)
1141 return 0;
1142
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001143 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001144 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001145 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001146 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001147
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001148#ifdef Py_DEBUG
1149 if (!check_maxchar
1150 && (from_kind > to_kind
1151 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001152 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001153 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1154 Py_UCS4 ch;
1155 Py_ssize_t i;
1156 for (i=0; i < how_many; i++) {
1157 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1158 assert(ch <= to_maxchar);
1159 }
1160 }
1161#endif
1162 fast = (from_kind == to_kind);
1163 if (check_maxchar
1164 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1165 {
1166 /* deny latin1 => ascii */
1167 fast = 0;
1168 }
1169
1170 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001171 Py_MEMCPY((char*)to_data + to_kind * to_start,
1172 (char*)from_data + from_kind * from_start,
1173 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001174 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001175 else if (from_kind == PyUnicode_1BYTE_KIND
1176 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001177 {
1178 _PyUnicode_CONVERT_BYTES(
1179 Py_UCS1, Py_UCS2,
1180 PyUnicode_1BYTE_DATA(from) + from_start,
1181 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1182 PyUnicode_2BYTE_DATA(to) + to_start
1183 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001184 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001185 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001186 && to_kind == PyUnicode_4BYTE_KIND)
1187 {
1188 _PyUnicode_CONVERT_BYTES(
1189 Py_UCS1, Py_UCS4,
1190 PyUnicode_1BYTE_DATA(from) + from_start,
1191 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1192 PyUnicode_4BYTE_DATA(to) + to_start
1193 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001194 }
1195 else if (from_kind == PyUnicode_2BYTE_KIND
1196 && to_kind == PyUnicode_4BYTE_KIND)
1197 {
1198 _PyUnicode_CONVERT_BYTES(
1199 Py_UCS2, Py_UCS4,
1200 PyUnicode_2BYTE_DATA(from) + from_start,
1201 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1202 PyUnicode_4BYTE_DATA(to) + to_start
1203 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001204 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001205 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001206 /* check if max_char(from substring) <= max_char(to) */
1207 if (from_kind > to_kind
1208 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001209 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001210 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001211 /* slow path to check for character overflow */
1212 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001213 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001214 Py_ssize_t i;
1215
Victor Stinner56c161a2011-10-06 02:47:11 +02001216#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001217 for (i=0; i < how_many; i++) {
1218 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001219 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001220 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1221 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001222#else
1223 if (!check_maxchar) {
1224 for (i=0; i < how_many; i++) {
1225 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1226 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1227 }
1228 }
1229 else {
1230 for (i=0; i < how_many; i++) {
1231 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1232 if (ch > to_maxchar)
1233 return 1;
1234 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1235 }
1236 }
1237#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001238 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001239 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001240 assert(0 && "inconsistent state");
1241 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001242 }
1243 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001244 return 0;
1245}
1246
1247static void
1248copy_characters(PyObject *to, Py_ssize_t to_start,
1249 PyObject *from, Py_ssize_t from_start,
1250 Py_ssize_t how_many)
1251{
1252 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1253}
1254
1255Py_ssize_t
1256PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1257 PyObject *from, Py_ssize_t from_start,
1258 Py_ssize_t how_many)
1259{
1260 int err;
1261
1262 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1263 PyErr_BadInternalCall();
1264 return -1;
1265 }
1266
Benjamin Petersonbac79492012-01-14 13:34:47 -05001267 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001268 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001269 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001270 return -1;
1271
1272 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1273 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1274 PyErr_Format(PyExc_SystemError,
1275 "Cannot write %zi characters at %zi "
1276 "in a string of %zi characters",
1277 how_many, to_start, PyUnicode_GET_LENGTH(to));
1278 return -1;
1279 }
1280
1281 if (how_many == 0)
1282 return 0;
1283
Victor Stinner488fa492011-12-12 00:01:39 +01001284 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001285 return -1;
1286
1287 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1288 if (err) {
1289 PyErr_Format(PyExc_SystemError,
1290 "Cannot copy %s characters "
1291 "into a string of %s characters",
1292 unicode_kind_name(from),
1293 unicode_kind_name(to));
1294 return -1;
1295 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001296 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001297}
1298
Victor Stinner17222162011-09-28 22:15:37 +02001299/* Find the maximum code point and count the number of surrogate pairs so a
1300 correct string length can be computed before converting a string to UCS4.
1301 This function counts single surrogates as a character and not as a pair.
1302
1303 Return 0 on success, or -1 on error. */
1304static int
1305find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1306 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001307{
1308 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001309 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001310
Victor Stinnerc53be962011-10-02 21:33:54 +02001311 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001312 *num_surrogates = 0;
1313 *maxchar = 0;
1314
1315 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001316#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001317 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1318 && (iter+1) < end
1319 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001321 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001322 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001323 iter += 2;
1324 }
1325 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001326#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001327 {
1328 ch = *iter;
1329 iter++;
1330 }
1331 if (ch > *maxchar) {
1332 *maxchar = ch;
1333 if (*maxchar > MAX_UNICODE) {
1334 PyErr_Format(PyExc_ValueError,
1335 "character U+%x is not in range [U+0000; U+10ffff]",
1336 ch);
1337 return -1;
1338 }
1339 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001340 }
1341 return 0;
1342}
1343
1344#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001345static int unicode_ready_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001346#endif
1347
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001348int
1349_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001350{
1351 wchar_t *end;
1352 Py_UCS4 maxchar = 0;
1353 Py_ssize_t num_surrogates;
1354#if SIZEOF_WCHAR_T == 2
1355 Py_ssize_t length_wo_surrogates;
1356#endif
1357
Georg Brandl7597add2011-10-05 16:36:47 +02001358 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001359 strings were created using _PyObject_New() and where no canonical
1360 representation (the str field) has been set yet aka strings
1361 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001362 assert(_PyUnicode_CHECK(unicode));
1363 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001364 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001365 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001366 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001367 /* Actually, it should neither be interned nor be anything else: */
1368 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001369
1370#ifdef Py_DEBUG
1371 ++unicode_ready_calls;
1372#endif
1373
1374 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001375 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001376 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001377 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001378
1379 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001380 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1381 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001382 PyErr_NoMemory();
1383 return -1;
1384 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001385 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001386 _PyUnicode_WSTR(unicode), end,
1387 PyUnicode_1BYTE_DATA(unicode));
1388 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1389 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1390 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1391 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001392 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001393 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001394 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001395 }
1396 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001397 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001398 _PyUnicode_UTF8(unicode) = NULL;
1399 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001400 }
1401 PyObject_FREE(_PyUnicode_WSTR(unicode));
1402 _PyUnicode_WSTR(unicode) = NULL;
1403 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1404 }
1405 /* In this case we might have to convert down from 4-byte native
1406 wchar_t to 2-byte unicode. */
1407 else if (maxchar < 65536) {
1408 assert(num_surrogates == 0 &&
1409 "FindMaxCharAndNumSurrogatePairs() messed up");
1410
Victor Stinner506f5922011-09-28 22:34:18 +02001411#if SIZEOF_WCHAR_T == 2
1412 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001413 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001414 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1415 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1416 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001417 _PyUnicode_UTF8(unicode) = NULL;
1418 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001419#else
1420 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001421 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001422 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001423 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001424 PyErr_NoMemory();
1425 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001426 }
Victor Stinner506f5922011-09-28 22:34:18 +02001427 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1428 _PyUnicode_WSTR(unicode), end,
1429 PyUnicode_2BYTE_DATA(unicode));
1430 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1431 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1432 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001433 _PyUnicode_UTF8(unicode) = NULL;
1434 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001435 PyObject_FREE(_PyUnicode_WSTR(unicode));
1436 _PyUnicode_WSTR(unicode) = NULL;
1437 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1438#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001439 }
    /* maxchar exceeds 16 bits, we need 4 bytes for unicode characters */
1441 else {
1442#if SIZEOF_WCHAR_T == 2
1443 /* in case the native representation is 2-bytes, we need to allocate a
1444 new normalized 4-byte version. */
1445 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001446 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1447 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001448 PyErr_NoMemory();
1449 return -1;
1450 }
1451 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1452 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001453 _PyUnicode_UTF8(unicode) = NULL;
1454 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001455 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1456 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001457 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001458 PyObject_FREE(_PyUnicode_WSTR(unicode));
1459 _PyUnicode_WSTR(unicode) = NULL;
1460 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1461#else
1462 assert(num_surrogates == 0);
1463
Victor Stinnerc3c74152011-10-02 20:39:55 +02001464 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001465 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001466 _PyUnicode_UTF8(unicode) = NULL;
1467 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001468 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1469#endif
1470 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1471 }
1472 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001473 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001474 return 0;
1475}
1476
Alexander Belopolsky40018472011-02-26 01:02:56 +00001477static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001478unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001479{
Walter Dörwald16807132007-05-25 13:52:07 +00001480 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001481 case SSTATE_NOT_INTERNED:
1482 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001483
Benjamin Peterson29060642009-01-31 22:14:21 +00001484 case SSTATE_INTERNED_MORTAL:
1485 /* revive dead object temporarily for DelItem */
1486 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001487 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001488 Py_FatalError(
1489 "deletion of interned string failed");
1490 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001491
Benjamin Peterson29060642009-01-31 22:14:21 +00001492 case SSTATE_INTERNED_IMMORTAL:
1493 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001494
Benjamin Peterson29060642009-01-31 22:14:21 +00001495 default:
1496 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001497 }
1498
Victor Stinner03490912011-10-03 23:45:12 +02001499 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001500 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001501 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001502 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001503 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1504 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001505
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001506 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001507}
1508
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if `unicode` is the shared empty string
   (unicode_empty) or the cached one-character string stored in
   unicode_latin1[] for its code point; return 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *header;
    Py_UCS4 first;

    if (unicode == unicode_empty)
        return 1;

    header = (PyASCIIObject *)unicode;
    /* Only one-character strings that are not in the legacy wchar
       representation can be latin1 singletons. */
    if (header->length != 1 || header->state.kind == PyUnicode_WCHAR_KIND)
        return 0;

    first = PyUnicode_READ_CHAR(unicode, 0);
    if (first >= 256)
        return 0;
    return unicode_latin1[first] == unicode;
}
#endif
1525
Alexander Belopolsky40018472011-02-26 01:02:56 +00001526static int
Victor Stinner488fa492011-12-12 00:01:39 +01001527unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001528{
Victor Stinner488fa492011-12-12 00:01:39 +01001529 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001530 if (Py_REFCNT(unicode) != 1)
1531 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001532 if (_PyUnicode_HASH(unicode) != -1)
1533 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001534 if (PyUnicode_CHECK_INTERNED(unicode))
1535 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001536 if (!PyUnicode_CheckExact(unicode))
1537 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001538#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001539 /* singleton refcount is greater than 1 */
1540 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001541#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001542 return 1;
1543}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001544
Victor Stinnerfe226c02011-10-03 03:52:20 +02001545static int
1546unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1547{
1548 PyObject *unicode;
1549 Py_ssize_t old_length;
1550
1551 assert(p_unicode != NULL);
1552 unicode = *p_unicode;
1553
1554 assert(unicode != NULL);
1555 assert(PyUnicode_Check(unicode));
1556 assert(0 <= length);
1557
Victor Stinner910337b2011-10-03 03:20:16 +02001558 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001559 old_length = PyUnicode_WSTR_LENGTH(unicode);
1560 else
1561 old_length = PyUnicode_GET_LENGTH(unicode);
1562 if (old_length == length)
1563 return 0;
1564
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001565 if (length == 0) {
1566 Py_DECREF(*p_unicode);
1567 *p_unicode = unicode_empty;
1568 Py_INCREF(*p_unicode);
1569 return 0;
1570 }
1571
Victor Stinner488fa492011-12-12 00:01:39 +01001572 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001573 PyObject *copy = resize_copy(unicode, length);
1574 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001575 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001576 Py_DECREF(*p_unicode);
1577 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001578 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001579 }
1580
Victor Stinnerfe226c02011-10-03 03:52:20 +02001581 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001582 PyObject *new_unicode = resize_compact(unicode, length);
1583 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001584 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001585 *p_unicode = new_unicode;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001586 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001587 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001588 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001589 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001590}
1591
Alexander Belopolsky40018472011-02-26 01:02:56 +00001592int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001593PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001594{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001595 PyObject *unicode;
1596 if (p_unicode == NULL) {
1597 PyErr_BadInternalCall();
1598 return -1;
1599 }
1600 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001601 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001602 {
1603 PyErr_BadInternalCall();
1604 return -1;
1605 }
1606 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001607}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001608
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001609static int
Victor Stinner0a045ef2011-11-09 00:02:42 +01001610unicode_widen(PyObject **p_unicode, unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001611{
1612 PyObject *result;
1613 assert(PyUnicode_IS_READY(*p_unicode));
1614 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1615 return 0;
1616 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1617 maxchar);
1618 if (result == NULL)
1619 return -1;
1620 PyUnicode_CopyCharacters(result, 0, *p_unicode, 0,
1621 PyUnicode_GET_LENGTH(*p_unicode));
1622 Py_DECREF(*p_unicode);
1623 *p_unicode = result;
1624 return 0;
1625}
1626
1627static int
1628unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1629 Py_UCS4 ch)
1630{
Victor Stinner15e9ed22012-02-22 13:36:20 +01001631 assert(ch <= MAX_UNICODE);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001632 if (unicode_widen(p_unicode, ch) < 0)
1633 return -1;
1634 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1635 PyUnicode_DATA(*p_unicode),
1636 (*pos)++, ch);
1637 return 0;
1638}
1639
Victor Stinnerc5166102012-02-22 13:55:02 +01001640/* Copy a ASCII or latin1 char* string into a Python Unicode string.
1641 Return the length of the input string.
1642
1643 WARNING: Don't copy the terminating null character and don't check the
1644 maximum character (may write a latin1 character in an ASCII string). */
1645static Py_ssize_t
1646unicode_write_cstr(PyObject *unicode, Py_ssize_t index, const char *str)
1647{
1648 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1649 void *data = PyUnicode_DATA(unicode);
1650
1651 switch (kind) {
1652 case PyUnicode_1BYTE_KIND: {
1653 Py_ssize_t len = strlen(str);
1654 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001655 memcpy((char *) data + index, str, len);
Victor Stinnerc5166102012-02-22 13:55:02 +01001656 return len;
1657 }
1658 case PyUnicode_2BYTE_KIND: {
1659 Py_UCS2 *start = (Py_UCS2 *)data + index;
1660 Py_UCS2 *ucs2 = start;
1661 assert(index <= PyUnicode_GET_LENGTH(unicode));
1662
1663 for (; *str; ++ucs2, ++str)
1664 *ucs2 = (Py_UCS2)*str;
1665
1666 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
1667 return ucs2 - start;
1668 }
1669 default: {
1670 Py_UCS4 *start = (Py_UCS4 *)data + index;
1671 Py_UCS4 *ucs4 = start;
1672 assert(kind == PyUnicode_4BYTE_KIND);
1673 assert(index <= PyUnicode_GET_LENGTH(unicode));
1674
1675 for (; *str; ++ucs4, ++str)
1676 *ucs4 = (Py_UCS4)*str;
1677
1678 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
1679 return ucs4 - start;
1680 }
1681 }
1682}
1683
1684
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001685static PyObject*
1686get_latin1_char(unsigned char ch)
1687{
Victor Stinnera464fc12011-10-02 20:39:30 +02001688 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001689 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001690 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001691 if (!unicode)
1692 return NULL;
1693 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001694 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001695 unicode_latin1[ch] = unicode;
1696 }
1697 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001698 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001699}
1700
Alexander Belopolsky40018472011-02-26 01:02:56 +00001701PyObject *
1702PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001703{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001704 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001705 Py_UCS4 maxchar = 0;
1706 Py_ssize_t num_surrogates;
1707
1708 if (u == NULL)
1709 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001710
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001711 /* If the Unicode data is known at construction time, we can apply
1712 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001713
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001714 /* Optimization for empty strings */
1715 if (size == 0 && unicode_empty != NULL) {
1716 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001717 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001718 }
Tim Petersced69f82003-09-16 20:30:58 +00001719
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001720 /* Single character Unicode objects in the Latin-1 range are
1721 shared when using this constructor */
1722 if (size == 1 && *u < 256)
1723 return get_latin1_char((unsigned char)*u);
1724
1725 /* If not empty and not single character, copy the Unicode data
1726 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001727 if (find_maxchar_surrogates(u, u + size,
1728 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001729 return NULL;
1730
Victor Stinner8faf8212011-12-08 22:14:11 +01001731 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001732 if (!unicode)
1733 return NULL;
1734
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001735 switch (PyUnicode_KIND(unicode)) {
1736 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001737 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001738 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1739 break;
1740 case PyUnicode_2BYTE_KIND:
1741#if Py_UNICODE_SIZE == 2
1742 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1743#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001744 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001745 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1746#endif
1747 break;
1748 case PyUnicode_4BYTE_KIND:
1749#if SIZEOF_WCHAR_T == 2
1750 /* This is the only case which has to process surrogates, thus
1751 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001752 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001753#else
1754 assert(num_surrogates == 0);
1755 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1756#endif
1757 break;
1758 default:
1759 assert(0 && "Impossible state");
1760 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001761
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001762 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001763}
1764
Alexander Belopolsky40018472011-02-26 01:02:56 +00001765PyObject *
1766PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001767{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001768 if (size < 0) {
1769 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001770 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001771 return NULL;
1772 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001773 if (u != NULL)
1774 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1775 else
1776 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001777}
1778
Alexander Belopolsky40018472011-02-26 01:02:56 +00001779PyObject *
1780PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001781{
1782 size_t size = strlen(u);
1783 if (size > PY_SSIZE_T_MAX) {
1784 PyErr_SetString(PyExc_OverflowError, "input too long");
1785 return NULL;
1786 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001787 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001788}
1789
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001790PyObject *
1791_PyUnicode_FromId(_Py_Identifier *id)
1792{
1793 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001794 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1795 strlen(id->string),
1796 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001797 if (!id->object)
1798 return NULL;
1799 PyUnicode_InternInPlace(&id->object);
1800 assert(!id->next);
1801 id->next = static_strings;
1802 static_strings = id;
1803 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001804 return id->object;
1805}
1806
1807void
1808_PyUnicode_ClearStaticStrings()
1809{
1810 _Py_Identifier *i;
1811 for (i = static_strings; i; i = i->next) {
1812 Py_DECREF(i->object);
1813 i->object = NULL;
1814 i->next = NULL;
1815 }
1816}
1817
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001818/* Internal function, don't check maximum character */
1819
Victor Stinnere57b1c02011-09-28 22:20:48 +02001820static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001821unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001822{
Victor Stinner785938e2011-12-11 20:09:03 +01001823 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001824 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001825#ifdef Py_DEBUG
Victor Stinnere6b2d442011-12-11 21:54:30 +01001826 assert(s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001827#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001828 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001829 }
Victor Stinner785938e2011-12-11 20:09:03 +01001830 unicode = PyUnicode_New(size, 127);
1831 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001832 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001833 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1834 assert(_PyUnicode_CheckConsistency(unicode, 1));
1835 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001836}
1837
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001838static Py_UCS4
1839kind_maxchar_limit(unsigned int kind)
1840{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001841 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001842 case PyUnicode_1BYTE_KIND:
1843 return 0x80;
1844 case PyUnicode_2BYTE_KIND:
1845 return 0x100;
1846 case PyUnicode_4BYTE_KIND:
1847 return 0x10000;
1848 default:
1849 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001850 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001851 }
1852}
1853
Victor Stinner702c7342011-10-05 13:50:52 +02001854static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001855_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001856{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001857 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001858 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001859
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001860 if (size == 0) {
1861 Py_INCREF(unicode_empty);
1862 return unicode_empty;
1863 }
1864 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001865 if (size == 1)
1866 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001867
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001868 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001869 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001870 if (!res)
1871 return NULL;
1872 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001873 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001874 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001875}
1876
Victor Stinnere57b1c02011-09-28 22:20:48 +02001877static PyObject*
1878_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001879{
1880 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001881 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001882
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001883 if (size == 0) {
1884 Py_INCREF(unicode_empty);
1885 return unicode_empty;
1886 }
1887 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001888 if (size == 1 && u[0] < 256)
Victor Stinner4e101002011-10-11 23:27:52 +02001889 return get_latin1_char((unsigned char)u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001890
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001891 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001892 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001893 if (!res)
1894 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001895 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001896 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001897 else {
1898 _PyUnicode_CONVERT_BYTES(
1899 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1900 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001901 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001902 return res;
1903}
1904
Victor Stinnere57b1c02011-09-28 22:20:48 +02001905static PyObject*
1906_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001907{
1908 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001909 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001910
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001911 if (size == 0) {
1912 Py_INCREF(unicode_empty);
1913 return unicode_empty;
1914 }
1915 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001916 if (size == 1 && u[0] < 256)
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001917 return get_latin1_char((unsigned char)u[0]);
1918
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001919 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001920 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001921 if (!res)
1922 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001923 if (max_char < 256)
1924 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1925 PyUnicode_1BYTE_DATA(res));
1926 else if (max_char < 0x10000)
1927 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1928 PyUnicode_2BYTE_DATA(res));
1929 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001930 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001931 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001932 return res;
1933}
1934
1935PyObject*
1936PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1937{
Victor Stinnercfed46e2011-11-22 01:29:14 +01001938 if (size < 0) {
1939 PyErr_SetString(PyExc_ValueError, "size must be positive");
1940 return NULL;
1941 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06001942 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001943 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001944 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001945 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001946 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001947 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001948 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001949 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02001950 PyErr_SetString(PyExc_SystemError, "invalid kind");
1951 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001952 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001953}
1954
Victor Stinner25a4b292011-10-06 12:31:55 +02001955/* Ensure that a string uses the most efficient storage, if it is not the
1956 case: create a new string with of the right kind. Write NULL into *p_unicode
1957 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001958static void
Victor Stinner25a4b292011-10-06 12:31:55 +02001959unicode_adjust_maxchar(PyObject **p_unicode)
1960{
1961 PyObject *unicode, *copy;
1962 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001963 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02001964 unsigned int kind;
1965
1966 assert(p_unicode != NULL);
1967 unicode = *p_unicode;
1968 assert(PyUnicode_IS_READY(unicode));
1969 if (PyUnicode_IS_ASCII(unicode))
1970 return;
1971
1972 len = PyUnicode_GET_LENGTH(unicode);
1973 kind = PyUnicode_KIND(unicode);
1974 if (kind == PyUnicode_1BYTE_KIND) {
1975 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001976 max_char = ucs1lib_find_max_char(u, u + len);
1977 if (max_char >= 128)
1978 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001979 }
1980 else if (kind == PyUnicode_2BYTE_KIND) {
1981 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001982 max_char = ucs2lib_find_max_char(u, u + len);
1983 if (max_char >= 256)
1984 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001985 }
1986 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001987 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02001988 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001989 max_char = ucs4lib_find_max_char(u, u + len);
1990 if (max_char >= 0x10000)
1991 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001992 }
Victor Stinner25a4b292011-10-06 12:31:55 +02001993 copy = PyUnicode_New(len, max_char);
1994 copy_characters(copy, 0, unicode, 0, len);
1995 Py_DECREF(unicode);
1996 *p_unicode = copy;
1997}
1998
Victor Stinner034f6cf2011-09-30 02:26:44 +02001999PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002000_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002001{
Victor Stinner87af4f22011-11-21 23:03:47 +01002002 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002003 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002004
Victor Stinner034f6cf2011-09-30 02:26:44 +02002005 if (!PyUnicode_Check(unicode)) {
2006 PyErr_BadInternalCall();
2007 return NULL;
2008 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002009 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002010 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002011
Victor Stinner87af4f22011-11-21 23:03:47 +01002012 length = PyUnicode_GET_LENGTH(unicode);
2013 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002014 if (!copy)
2015 return NULL;
2016 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2017
Victor Stinner87af4f22011-11-21 23:03:47 +01002018 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2019 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002020 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002021 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002022}
2023
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002024
Victor Stinnerbc603d12011-10-02 01:00:40 +02002025/* Widen Unicode objects to larger buffers. Don't write terminating null
2026 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002027
2028void*
2029_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2030{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002031 Py_ssize_t len;
2032 void *result;
2033 unsigned int skind;
2034
Benjamin Petersonbac79492012-01-14 13:34:47 -05002035 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002036 return NULL;
2037
2038 len = PyUnicode_GET_LENGTH(s);
2039 skind = PyUnicode_KIND(s);
2040 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002041 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002042 return NULL;
2043 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002044 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002045 case PyUnicode_2BYTE_KIND:
2046 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2047 if (!result)
2048 return PyErr_NoMemory();
2049 assert(skind == PyUnicode_1BYTE_KIND);
2050 _PyUnicode_CONVERT_BYTES(
2051 Py_UCS1, Py_UCS2,
2052 PyUnicode_1BYTE_DATA(s),
2053 PyUnicode_1BYTE_DATA(s) + len,
2054 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002055 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002056 case PyUnicode_4BYTE_KIND:
2057 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2058 if (!result)
2059 return PyErr_NoMemory();
2060 if (skind == PyUnicode_2BYTE_KIND) {
2061 _PyUnicode_CONVERT_BYTES(
2062 Py_UCS2, Py_UCS4,
2063 PyUnicode_2BYTE_DATA(s),
2064 PyUnicode_2BYTE_DATA(s) + len,
2065 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002066 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002067 else {
2068 assert(skind == PyUnicode_1BYTE_KIND);
2069 _PyUnicode_CONVERT_BYTES(
2070 Py_UCS1, Py_UCS4,
2071 PyUnicode_1BYTE_DATA(s),
2072 PyUnicode_1BYTE_DATA(s) + len,
2073 result);
2074 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002075 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002076 default:
2077 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002078 }
Victor Stinner01698042011-10-04 00:04:26 +02002079 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002080 return NULL;
2081}
2082
2083static Py_UCS4*
2084as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2085 int copy_null)
2086{
2087 int kind;
2088 void *data;
2089 Py_ssize_t len, targetlen;
2090 if (PyUnicode_READY(string) == -1)
2091 return NULL;
2092 kind = PyUnicode_KIND(string);
2093 data = PyUnicode_DATA(string);
2094 len = PyUnicode_GET_LENGTH(string);
2095 targetlen = len;
2096 if (copy_null)
2097 targetlen++;
2098 if (!target) {
2099 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2100 PyErr_NoMemory();
2101 return NULL;
2102 }
2103 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2104 if (!target) {
2105 PyErr_NoMemory();
2106 return NULL;
2107 }
2108 }
2109 else {
2110 if (targetsize < targetlen) {
2111 PyErr_Format(PyExc_SystemError,
2112 "string is longer than the buffer");
2113 if (copy_null && 0 < targetsize)
2114 target[0] = 0;
2115 return NULL;
2116 }
2117 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002118 if (kind == PyUnicode_1BYTE_KIND) {
2119 Py_UCS1 *start = (Py_UCS1 *) data;
2120 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002121 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002122 else if (kind == PyUnicode_2BYTE_KIND) {
2123 Py_UCS2 *start = (Py_UCS2 *) data;
2124 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2125 }
2126 else {
2127 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002128 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002129 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002130 if (copy_null)
2131 target[len] = 0;
2132 return target;
2133}
2134
2135Py_UCS4*
2136PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2137 int copy_null)
2138{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002139 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002140 PyErr_BadInternalCall();
2141 return NULL;
2142 }
2143 return as_ucs4(string, target, targetsize, copy_null);
2144}
2145
2146Py_UCS4*
2147PyUnicode_AsUCS4Copy(PyObject *string)
2148{
2149 return as_ucs4(string, NULL, 0, 1);
2150}
2151
2152#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002153
Alexander Belopolsky40018472011-02-26 01:02:56 +00002154PyObject *
2155PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002156{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002157 if (w == NULL) {
Victor Stinner382955f2011-12-11 21:44:00 +01002158 if (size == 0) {
2159 Py_INCREF(unicode_empty);
2160 return unicode_empty;
2161 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002162 PyErr_BadInternalCall();
2163 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002164 }
2165
Martin v. Löwis790465f2008-04-05 20:41:37 +00002166 if (size == -1) {
2167 size = wcslen(w);
2168 }
2169
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002170 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002171}
2172
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002173#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002174
Walter Dörwald346737f2007-05-31 10:44:43 +00002175static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002176makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2177 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002178{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002179 *fmt++ = '%';
2180 if (width) {
2181 if (zeropad)
2182 *fmt++ = '0';
2183 fmt += sprintf(fmt, "%d", width);
2184 }
2185 if (precision)
2186 fmt += sprintf(fmt, ".%d", precision);
2187 if (longflag)
2188 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002189 else if (longlongflag) {
2190 /* longlongflag should only ever be nonzero on machines with
2191 HAVE_LONG_LONG defined */
2192#ifdef HAVE_LONG_LONG
2193 char *f = PY_FORMAT_LONG_LONG;
2194 while (*f)
2195 *fmt++ = *f++;
2196#else
2197 /* we shouldn't ever get here */
2198 assert(0);
2199 *fmt++ = 'l';
2200#endif
2201 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002202 else if (size_tflag) {
2203 char *f = PY_FORMAT_SIZE_T;
2204 while (*f)
2205 *fmt++ = *f++;
2206 }
2207 *fmt++ = c;
2208 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002209}
2210
Victor Stinner96865452011-03-01 23:44:09 +00002211/* helper for PyUnicode_FromFormatV() */
2212
2213static const char*
2214parse_format_flags(const char *f,
2215 int *p_width, int *p_precision,
2216 int *p_longflag, int *p_longlongflag, int *p_size_tflag)
2217{
2218 int width, precision, longflag, longlongflag, size_tflag;
2219
2220 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2221 f++;
2222 width = 0;
2223 while (Py_ISDIGIT((unsigned)*f))
2224 width = (width*10) + *f++ - '0';
2225 precision = 0;
2226 if (*f == '.') {
2227 f++;
2228 while (Py_ISDIGIT((unsigned)*f))
2229 precision = (precision*10) + *f++ - '0';
2230 if (*f == '%') {
2231 /* "%.3%s" => f points to "3" */
2232 f--;
2233 }
2234 }
2235 if (*f == '\0') {
2236 /* bogus format "%.1" => go backward, f points to "1" */
2237 f--;
2238 }
2239 if (p_width != NULL)
2240 *p_width = width;
2241 if (p_precision != NULL)
2242 *p_precision = precision;
2243
2244 /* Handle %ld, %lu, %lld and %llu. */
2245 longflag = 0;
2246 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002247 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002248
2249 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002250 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002251 longflag = 1;
2252 ++f;
2253 }
2254#ifdef HAVE_LONG_LONG
2255 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002256 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002257 longlongflag = 1;
2258 f += 2;
2259 }
2260#endif
2261 }
2262 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002263 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002264 size_tflag = 1;
2265 ++f;
2266 }
2267 if (p_longflag != NULL)
2268 *p_longflag = longflag;
2269 if (p_longlongflag != NULL)
2270 *p_longlongflag = longlongflag;
2271 if (p_size_tflag != NULL)
2272 *p_size_tflag = size_tflag;
2273 return f;
2274}
2275
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002276/* maximum number of characters required for output of %ld. 21 characters
2277 allows for 64-bit integers (in decimal) and an optional sign. */
2278#define MAX_LONG_CHARS 21
2279/* maximum number of characters required for output of %lld.
2280 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2281 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2282#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2283
Walter Dörwaldd2034312007-05-18 16:29:38 +00002284PyObject *
2285PyUnicode_FromFormatV(const char *format, va_list vargs)
2286{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002287 va_list count;
2288 Py_ssize_t callcount = 0;
2289 PyObject **callresults = NULL;
2290 PyObject **callresult = NULL;
2291 Py_ssize_t n = 0;
2292 int width = 0;
2293 int precision = 0;
2294 int zeropad;
2295 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002296 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002297 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002298 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002299 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2300 Py_UCS4 argmaxchar;
2301 Py_ssize_t numbersize = 0;
2302 char *numberresults = NULL;
2303 char *numberresult = NULL;
2304 Py_ssize_t i;
2305 int kind;
2306 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002307
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002308 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002309 /* step 1: count the number of %S/%R/%A/%s format specifications
2310 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2311 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002312 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002313 * also estimate a upper bound for all the number formats in the string,
2314 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002315 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002316 for (f = format; *f; f++) {
2317 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002318 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002319 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2320 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2321 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2322 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002323
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002324 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002325#ifdef HAVE_LONG_LONG
2326 if (longlongflag) {
2327 if (width < MAX_LONG_LONG_CHARS)
2328 width = MAX_LONG_LONG_CHARS;
2329 }
2330 else
2331#endif
2332 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2333 including sign. Decimal takes the most space. This
2334 isn't enough for octal. If a width is specified we
2335 need more (which we allocate later). */
2336 if (width < MAX_LONG_CHARS)
2337 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002338
2339 /* account for the size + '\0' to separate numbers
2340 inside of the numberresults buffer */
2341 numbersize += (width + 1);
2342 }
2343 }
2344 else if ((unsigned char)*f > 127) {
2345 PyErr_Format(PyExc_ValueError,
2346 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2347 "string, got a non-ASCII byte: 0x%02x",
2348 (unsigned char)*f);
2349 return NULL;
2350 }
2351 }
2352 /* step 2: allocate memory for the results of
2353 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2354 if (callcount) {
2355 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2356 if (!callresults) {
2357 PyErr_NoMemory();
2358 return NULL;
2359 }
2360 callresult = callresults;
2361 }
2362 /* step 2.5: allocate memory for the results of formating numbers */
2363 if (numbersize) {
2364 numberresults = PyObject_Malloc(numbersize);
2365 if (!numberresults) {
2366 PyErr_NoMemory();
2367 goto fail;
2368 }
2369 numberresult = numberresults;
2370 }
2371
2372 /* step 3: format numbers and figure out how large a buffer we need */
2373 for (f = format; *f; f++) {
2374 if (*f == '%') {
2375 const char* p;
2376 int longflag;
2377 int longlongflag;
2378 int size_tflag;
2379 int numprinted;
2380
2381 p = f;
2382 zeropad = (f[1] == '0');
2383 f = parse_format_flags(f, &width, &precision,
2384 &longflag, &longlongflag, &size_tflag);
2385 switch (*f) {
2386 case 'c':
2387 {
2388 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002389 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002390 n++;
2391 break;
2392 }
2393 case '%':
2394 n++;
2395 break;
2396 case 'i':
2397 case 'd':
2398 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2399 width, precision, *f);
2400 if (longflag)
2401 numprinted = sprintf(numberresult, fmt,
2402 va_arg(count, long));
2403#ifdef HAVE_LONG_LONG
2404 else if (longlongflag)
2405 numprinted = sprintf(numberresult, fmt,
2406 va_arg(count, PY_LONG_LONG));
2407#endif
2408 else if (size_tflag)
2409 numprinted = sprintf(numberresult, fmt,
2410 va_arg(count, Py_ssize_t));
2411 else
2412 numprinted = sprintf(numberresult, fmt,
2413 va_arg(count, int));
2414 n += numprinted;
2415 /* advance by +1 to skip over the '\0' */
2416 numberresult += (numprinted + 1);
2417 assert(*(numberresult - 1) == '\0');
2418 assert(*(numberresult - 2) != '\0');
2419 assert(numprinted >= 0);
2420 assert(numberresult <= numberresults + numbersize);
2421 break;
2422 case 'u':
2423 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2424 width, precision, 'u');
2425 if (longflag)
2426 numprinted = sprintf(numberresult, fmt,
2427 va_arg(count, unsigned long));
2428#ifdef HAVE_LONG_LONG
2429 else if (longlongflag)
2430 numprinted = sprintf(numberresult, fmt,
2431 va_arg(count, unsigned PY_LONG_LONG));
2432#endif
2433 else if (size_tflag)
2434 numprinted = sprintf(numberresult, fmt,
2435 va_arg(count, size_t));
2436 else
2437 numprinted = sprintf(numberresult, fmt,
2438 va_arg(count, unsigned int));
2439 n += numprinted;
2440 numberresult += (numprinted + 1);
2441 assert(*(numberresult - 1) == '\0');
2442 assert(*(numberresult - 2) != '\0');
2443 assert(numprinted >= 0);
2444 assert(numberresult <= numberresults + numbersize);
2445 break;
2446 case 'x':
2447 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2448 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2449 n += numprinted;
2450 numberresult += (numprinted + 1);
2451 assert(*(numberresult - 1) == '\0');
2452 assert(*(numberresult - 2) != '\0');
2453 assert(numprinted >= 0);
2454 assert(numberresult <= numberresults + numbersize);
2455 break;
2456 case 'p':
2457 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2458 /* %p is ill-defined: ensure leading 0x. */
2459 if (numberresult[1] == 'X')
2460 numberresult[1] = 'x';
2461 else if (numberresult[1] != 'x') {
2462 memmove(numberresult + 2, numberresult,
2463 strlen(numberresult) + 1);
2464 numberresult[0] = '0';
2465 numberresult[1] = 'x';
2466 numprinted += 2;
2467 }
2468 n += numprinted;
2469 numberresult += (numprinted + 1);
2470 assert(*(numberresult - 1) == '\0');
2471 assert(*(numberresult - 2) != '\0');
2472 assert(numprinted >= 0);
2473 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002474 break;
2475 case 's':
2476 {
2477 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002478 const char *s = va_arg(count, const char*);
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002479 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002480 if (!str)
2481 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002482 /* since PyUnicode_DecodeUTF8 returns already flexible
2483 unicode objects, there is no need to call ready on them */
2484 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002485 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002486 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002487 /* Remember the str and switch to the next slot */
2488 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002489 break;
2490 }
2491 case 'U':
2492 {
2493 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002494 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002495 if (PyUnicode_READY(obj) == -1)
2496 goto fail;
2497 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002498 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002499 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002500 break;
2501 }
2502 case 'V':
2503 {
2504 PyObject *obj = va_arg(count, PyObject *);
2505 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002506 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002507 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002508 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002509 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002510 if (PyUnicode_READY(obj) == -1)
2511 goto fail;
2512 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002513 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002514 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002515 *callresult++ = NULL;
2516 }
2517 else {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002518 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002519 if (!str_obj)
2520 goto fail;
Benjamin Petersonbac79492012-01-14 13:34:47 -05002521 if (PyUnicode_READY(str_obj) == -1) {
Victor Stinnere1335c72011-10-04 20:53:03 +02002522 Py_DECREF(str_obj);
2523 goto fail;
2524 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002525 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002526 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002527 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002528 *callresult++ = str_obj;
2529 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002530 break;
2531 }
2532 case 'S':
2533 {
2534 PyObject *obj = va_arg(count, PyObject *);
2535 PyObject *str;
2536 assert(obj);
2537 str = PyObject_Str(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002538 if (!str)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002539 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002540 if (PyUnicode_READY(str) == -1) {
2541 Py_DECREF(str);
2542 goto fail;
2543 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002544 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002545 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002546 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002547 /* Remember the str and switch to the next slot */
2548 *callresult++ = str;
2549 break;
2550 }
2551 case 'R':
2552 {
2553 PyObject *obj = va_arg(count, PyObject *);
2554 PyObject *repr;
2555 assert(obj);
2556 repr = PyObject_Repr(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002557 if (!repr)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002558 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002559 if (PyUnicode_READY(repr) == -1) {
2560 Py_DECREF(repr);
2561 goto fail;
2562 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002563 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002564 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002565 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002566 /* Remember the repr and switch to the next slot */
2567 *callresult++ = repr;
2568 break;
2569 }
2570 case 'A':
2571 {
2572 PyObject *obj = va_arg(count, PyObject *);
2573 PyObject *ascii;
2574 assert(obj);
2575 ascii = PyObject_ASCII(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002576 if (!ascii)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002577 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002578 if (PyUnicode_READY(ascii) == -1) {
2579 Py_DECREF(ascii);
2580 goto fail;
2581 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002582 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002583 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002584 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002585 /* Remember the repr and switch to the next slot */
2586 *callresult++ = ascii;
2587 break;
2588 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002589 default:
2590 /* if we stumble upon an unknown
2591 formatting code, copy the rest of
2592 the format string to the output
2593 string. (we cannot just skip the
2594 code, since there's no way to know
2595 what's in the argument list) */
2596 n += strlen(p);
2597 goto expand;
2598 }
2599 } else
2600 n++;
2601 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002602 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002603 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002604 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002605 we don't have to resize the string.
2606 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002607 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002608 if (!string)
2609 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002610 kind = PyUnicode_KIND(string);
2611 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002612 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002613 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002614
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002615 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002616 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002617 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002618
2619 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002620 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2621 /* checking for == because the last argument could be a empty
2622 string, which causes i to point to end, the assert at the end of
2623 the loop */
2624 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002625
Benjamin Peterson14339b62009-01-31 16:36:08 +00002626 switch (*f) {
2627 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002628 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002629 const int ordinal = va_arg(vargs, int);
2630 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002631 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002632 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002633 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002634 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002635 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002636 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002637 case 'p':
Victor Stinnerc5166102012-02-22 13:55:02 +01002638 {
2639 Py_ssize_t written;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002640 /* unused, since we already have the result */
2641 if (*f == 'p')
2642 (void) va_arg(vargs, void *);
2643 else
2644 (void) va_arg(vargs, int);
2645 /* extract the result from numberresults and append. */
Victor Stinnerc5166102012-02-22 13:55:02 +01002646 written = unicode_write_cstr(string, i, numberresult);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002647 /* skip over the separating '\0' */
Victor Stinnerc5166102012-02-22 13:55:02 +01002648 i += written;
2649 numberresult += written;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002650 assert(*numberresult == '\0');
2651 numberresult++;
2652 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002653 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002654 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002655 case 's':
2656 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002657 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002658 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002659 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002660 size = PyUnicode_GET_LENGTH(*callresult);
2661 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002662 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002663 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002664 /* We're done with the unicode()/repr() => forget it */
2665 Py_DECREF(*callresult);
2666 /* switch to next unicode()/repr() result */
2667 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002668 break;
2669 }
2670 case 'U':
2671 {
2672 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002673 Py_ssize_t size;
2674 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2675 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002676 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002677 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002678 break;
2679 }
2680 case 'V':
2681 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002682 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002683 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002684 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002685 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002686 size = PyUnicode_GET_LENGTH(obj);
2687 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002688 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002689 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002690 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002691 size = PyUnicode_GET_LENGTH(*callresult);
2692 assert(PyUnicode_KIND(*callresult) <=
2693 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002694 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002695 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002696 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002697 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002698 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002699 break;
2700 }
2701 case 'S':
2702 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002703 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002704 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002705 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002706 /* unused, since we already have the result */
2707 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002708 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002709 copy_characters(string, i, *callresult, 0, size);
2710 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002711 /* We're done with the unicode()/repr() => forget it */
2712 Py_DECREF(*callresult);
2713 /* switch to next unicode()/repr() result */
2714 ++callresult;
2715 break;
2716 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002717 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002718 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002719 break;
2720 default:
Victor Stinnerc5166102012-02-22 13:55:02 +01002721 i += unicode_write_cstr(string, i, p);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002722 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002723 goto end;
2724 }
Victor Stinner1205f272010-09-11 00:54:47 +00002725 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002726 else {
2727 assert(i < PyUnicode_GET_LENGTH(string));
2728 PyUnicode_WRITE(kind, data, i++, *f);
2729 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002730 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002731 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002732
Benjamin Peterson29060642009-01-31 22:14:21 +00002733 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002734 if (callresults)
2735 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002736 if (numberresults)
2737 PyObject_Free(numberresults);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002738 return unicode_result(string);
Benjamin Peterson29060642009-01-31 22:14:21 +00002739 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002740 if (callresults) {
2741 PyObject **callresult2 = callresults;
2742 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002743 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002744 ++callresult2;
2745 }
2746 PyObject_Free(callresults);
2747 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002748 if (numberresults)
2749 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002750 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002751}
2752
Walter Dörwaldd2034312007-05-18 16:29:38 +00002753PyObject *
2754PyUnicode_FromFormat(const char *format, ...)
2755{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002756 PyObject* ret;
2757 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002758
2759#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002760 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002761#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002762 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002763#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002764 ret = PyUnicode_FromFormatV(format, vargs);
2765 va_end(vargs);
2766 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002767}
2768
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002769#ifdef HAVE_WCHAR_H
2770
Victor Stinner5593d8a2010-10-02 11:11:27 +00002771/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2772 convert a Unicode object to a wide character string.
2773
Victor Stinnerd88d9832011-09-06 02:00:05 +02002774 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002775 character) required to convert the unicode object. Ignore size argument.
2776
Victor Stinnerd88d9832011-09-06 02:00:05 +02002777 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002778 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002779 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002780static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002781unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002782 wchar_t *w,
2783 Py_ssize_t size)
2784{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002785 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002786 const wchar_t *wstr;
2787
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002788 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002789 if (wstr == NULL)
2790 return -1;
2791
Victor Stinner5593d8a2010-10-02 11:11:27 +00002792 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002793 if (size > res)
2794 size = res + 1;
2795 else
2796 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002797 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002798 return res;
2799 }
2800 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002801 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002802}
2803
2804Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002805PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002806 wchar_t *w,
2807 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002808{
2809 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002810 PyErr_BadInternalCall();
2811 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002812 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002813 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002814}
2815
Victor Stinner137c34c2010-09-29 10:25:54 +00002816wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002817PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002818 Py_ssize_t *size)
2819{
2820 wchar_t* buffer;
2821 Py_ssize_t buflen;
2822
2823 if (unicode == NULL) {
2824 PyErr_BadInternalCall();
2825 return NULL;
2826 }
2827
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002828 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002829 if (buflen == -1)
2830 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002831 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002832 PyErr_NoMemory();
2833 return NULL;
2834 }
2835
Victor Stinner137c34c2010-09-29 10:25:54 +00002836 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2837 if (buffer == NULL) {
2838 PyErr_NoMemory();
2839 return NULL;
2840 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002841 buflen = unicode_aswidechar(unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002842 if (buflen == -1)
2843 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002844 if (size != NULL)
2845 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002846 return buffer;
2847}
2848
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002849#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002850
Alexander Belopolsky40018472011-02-26 01:02:56 +00002851PyObject *
2852PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002853{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002854 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002855 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002856 PyErr_SetString(PyExc_ValueError,
2857 "chr() arg not in range(0x110000)");
2858 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002859 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002860
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002861 if (ordinal < 256)
2862 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002863
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002864 v = PyUnicode_New(1, ordinal);
2865 if (v == NULL)
2866 return NULL;
2867 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002868 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002869 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002870}
2871
Alexander Belopolsky40018472011-02-26 01:02:56 +00002872PyObject *
2873PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002874{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002875 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002876 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002877 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002878 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002879 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002880 Py_INCREF(obj);
2881 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002882 }
2883 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002884 /* For a Unicode subtype that's not a Unicode object,
2885 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002886 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002887 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002888 PyErr_Format(PyExc_TypeError,
2889 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002890 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002891 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002892}
2893
Alexander Belopolsky40018472011-02-26 01:02:56 +00002894PyObject *
2895PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002896 const char *encoding,
2897 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002898{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002899 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002900 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002901
Guido van Rossumd57fd912000-03-10 22:53:23 +00002902 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002903 PyErr_BadInternalCall();
2904 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002905 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002906
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002907 /* Decoding bytes objects is the most common case and should be fast */
2908 if (PyBytes_Check(obj)) {
2909 if (PyBytes_GET_SIZE(obj) == 0) {
2910 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002911 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002912 }
2913 else {
2914 v = PyUnicode_Decode(
2915 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2916 encoding, errors);
2917 }
2918 return v;
2919 }
2920
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002921 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002922 PyErr_SetString(PyExc_TypeError,
2923 "decoding str is not supported");
2924 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002925 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002926
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002927 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2928 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2929 PyErr_Format(PyExc_TypeError,
2930 "coercing to str: need bytes, bytearray "
2931 "or buffer-like object, %.80s found",
2932 Py_TYPE(obj)->tp_name);
2933 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002934 }
Tim Petersced69f82003-09-16 20:30:58 +00002935
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002936 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002937 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002938 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002939 }
Tim Petersced69f82003-09-16 20:30:58 +00002940 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002941 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002942
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002943 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002944 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002945}
2946
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   `lower` is the output buffer and `lower_len` its capacity in bytes,
   including the terminating NUL.

   Return 0 on error (the result does not fit in lower_len bytes),
   1 on success. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    /* With no room at all, &lower[lower_len - 1] below would point before
       the buffer. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* sizeof("utf-8") counts the terminating NUL. */
        if (lower_len < sizeof("utf-8"))
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }
    e = encoding;
    l = lower;
    /* Last writable slot: reserved for the terminating NUL. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2983
Alexander Belopolsky40018472011-02-26 01:02:56 +00002984PyObject *
2985PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002986 Py_ssize_t size,
2987 const char *encoding,
2988 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002989{
2990 PyObject *buffer = NULL, *unicode;
2991 Py_buffer info;
2992 char lower[11]; /* Enough for any encoding shortcut */
2993
Fred Drakee4315f52000-05-09 19:53:39 +00002994 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002995 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002996 if ((strcmp(lower, "utf-8") == 0) ||
2997 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002998 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00002999 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003000 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003001 (strcmp(lower, "iso-8859-1") == 0))
3002 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003003#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003004 else if (strcmp(lower, "mbcs") == 0)
3005 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003006#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003007 else if (strcmp(lower, "ascii") == 0)
3008 return PyUnicode_DecodeASCII(s, size, errors);
3009 else if (strcmp(lower, "utf-16") == 0)
3010 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3011 else if (strcmp(lower, "utf-32") == 0)
3012 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3013 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003014
3015 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003016 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003017 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003018 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003019 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003020 if (buffer == NULL)
3021 goto onError;
3022 unicode = PyCodec_Decode(buffer, encoding, errors);
3023 if (unicode == NULL)
3024 goto onError;
3025 if (!PyUnicode_Check(unicode)) {
3026 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003027 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00003028 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003029 Py_DECREF(unicode);
3030 goto onError;
3031 }
3032 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003033 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003034
Benjamin Peterson29060642009-01-31 22:14:21 +00003035 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003036 Py_XDECREF(buffer);
3037 return NULL;
3038}
3039
Alexander Belopolsky40018472011-02-26 01:02:56 +00003040PyObject *
3041PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003042 const char *encoding,
3043 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003044{
3045 PyObject *v;
3046
3047 if (!PyUnicode_Check(unicode)) {
3048 PyErr_BadArgument();
3049 goto onError;
3050 }
3051
3052 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003053 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003054
3055 /* Decode via the codec registry */
3056 v = PyCodec_Decode(unicode, encoding, errors);
3057 if (v == NULL)
3058 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003059 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003060
Benjamin Peterson29060642009-01-31 22:14:21 +00003061 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003062 return NULL;
3063}
3064
Alexander Belopolsky40018472011-02-26 01:02:56 +00003065PyObject *
3066PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003067 const char *encoding,
3068 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003069{
3070 PyObject *v;
3071
3072 if (!PyUnicode_Check(unicode)) {
3073 PyErr_BadArgument();
3074 goto onError;
3075 }
3076
3077 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003078 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003079
3080 /* Decode via the codec registry */
3081 v = PyCodec_Decode(unicode, encoding, errors);
3082 if (v == NULL)
3083 goto onError;
3084 if (!PyUnicode_Check(v)) {
3085 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003086 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003087 Py_TYPE(v)->tp_name);
3088 Py_DECREF(v);
3089 goto onError;
3090 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003091 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003092
Benjamin Peterson29060642009-01-31 22:14:21 +00003093 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003094 return NULL;
3095}
3096
Alexander Belopolsky40018472011-02-26 01:02:56 +00003097PyObject *
3098PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003099 Py_ssize_t size,
3100 const char *encoding,
3101 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003102{
3103 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003104
Guido van Rossumd57fd912000-03-10 22:53:23 +00003105 unicode = PyUnicode_FromUnicode(s, size);
3106 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003107 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003108 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3109 Py_DECREF(unicode);
3110 return v;
3111}
3112
Alexander Belopolsky40018472011-02-26 01:02:56 +00003113PyObject *
3114PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003115 const char *encoding,
3116 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003117{
3118 PyObject *v;
3119
3120 if (!PyUnicode_Check(unicode)) {
3121 PyErr_BadArgument();
3122 goto onError;
3123 }
3124
3125 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003126 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003127
3128 /* Encode via the codec registry */
3129 v = PyCodec_Encode(unicode, encoding, errors);
3130 if (v == NULL)
3131 goto onError;
3132 return v;
3133
Benjamin Peterson29060642009-01-31 22:14:21 +00003134 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003135 return NULL;
3136}
3137
/* Locate the first wide character of `wstr` that wcstombs() rejects.
   The string is converted one character at a time (one surrogate pair at a
   time when wchar_t is 16 bits wide); the offset, in wchar_t units, of the
   first unit that fails is returned.  Returns 0 when no failure is found. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    const wchar_t *const origin = wstr;
    char scratch[MB_LEN_MAX];
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3] = {0, 0, 0};
#else
    wchar_t unit[2] = {0, 0};
#endif

    for (;;) {
        const wchar_t *current = wstr;

        if (*current == L'\0')
            break;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(current[0])
            && Py_UNICODE_IS_LOW_SURROGATE(current[1]))
        {
            /* Keep a surrogate pair together for the conversion. */
            unit[0] = current[0];
            unit[1] = current[1];
            wstr = current + 2;
        }
        else {
            unit[0] = current[0];
            unit[1] = 0;
            wstr = current + 1;
        }
#else
        unit[0] = current[0];
        wstr = current + 1;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1)
            return current - origin;
    }

    /* failed to find the unencodable character */
    return 0;
}
3184
Victor Stinner1b579672011-12-17 05:47:23 +01003185static int
3186locale_error_handler(const char *errors, int *surrogateescape)
3187{
3188 if (errors == NULL) {
3189 *surrogateescape = 0;
3190 return 0;
3191 }
3192
3193 if (strcmp(errors, "strict") == 0) {
3194 *surrogateescape = 0;
3195 return 0;
3196 }
3197 if (strcmp(errors, "surrogateescape") == 0) {
3198 *surrogateescape = 1;
3199 return 0;
3200 }
3201 PyErr_Format(PyExc_ValueError,
3202 "only 'strict' and 'surrogateescape' error handlers "
3203 "are supported, not '%s'",
3204 errors);
3205 return -1;
3206}
3207
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003208PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003209PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003210{
3211 Py_ssize_t wlen, wlen2;
3212 wchar_t *wstr;
3213 PyObject *bytes = NULL;
3214 char *errmsg;
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003215 PyObject *reason;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003216 PyObject *exc;
3217 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003218 int surrogateescape;
3219
3220 if (locale_error_handler(errors, &surrogateescape) < 0)
3221 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003222
3223 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3224 if (wstr == NULL)
3225 return NULL;
3226
3227 wlen2 = wcslen(wstr);
3228 if (wlen2 != wlen) {
3229 PyMem_Free(wstr);
3230 PyErr_SetString(PyExc_TypeError, "embedded null character");
3231 return NULL;
3232 }
3233
3234 if (surrogateescape) {
3235 /* locale encoding with surrogateescape */
3236 char *str;
3237
3238 str = _Py_wchar2char(wstr, &error_pos);
3239 if (str == NULL) {
3240 if (error_pos == (size_t)-1) {
3241 PyErr_NoMemory();
3242 PyMem_Free(wstr);
3243 return NULL;
3244 }
3245 else {
3246 goto encode_error;
3247 }
3248 }
3249 PyMem_Free(wstr);
3250
3251 bytes = PyBytes_FromString(str);
3252 PyMem_Free(str);
3253 }
3254 else {
3255 size_t len, len2;
3256
3257 len = wcstombs(NULL, wstr, 0);
3258 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003259 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003260 goto encode_error;
3261 }
3262
3263 bytes = PyBytes_FromStringAndSize(NULL, len);
3264 if (bytes == NULL) {
3265 PyMem_Free(wstr);
3266 return NULL;
3267 }
3268
3269 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3270 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003271 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003272 goto encode_error;
3273 }
3274 PyMem_Free(wstr);
3275 }
3276 return bytes;
3277
3278encode_error:
3279 errmsg = strerror(errno);
3280 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003281
3282 if (error_pos == (size_t)-1)
3283 error_pos = wcstombs_errorpos(wstr);
3284
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003285 PyMem_Free(wstr);
3286 Py_XDECREF(bytes);
3287
Victor Stinner2f197072011-12-17 07:08:30 +01003288 if (errmsg != NULL) {
3289 size_t errlen;
3290 wstr = _Py_char2wchar(errmsg, &errlen);
3291 if (wstr != NULL) {
3292 reason = PyUnicode_FromWideChar(wstr, errlen);
3293 PyMem_Free(wstr);
3294 } else
3295 errmsg = NULL;
3296 }
3297 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003298 reason = PyUnicode_FromString(
3299 "wcstombs() encountered an unencodable "
3300 "wide character");
3301 if (reason == NULL)
3302 return NULL;
3303
3304 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3305 "locale", unicode,
3306 (Py_ssize_t)error_pos,
3307 (Py_ssize_t)(error_pos+1),
3308 reason);
3309 Py_DECREF(reason);
3310 if (exc != NULL) {
3311 PyCodec_StrictErrors(exc);
3312 Py_XDECREF(exc);
3313 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003314 return NULL;
3315}
3316
Victor Stinnerad158722010-10-27 00:25:46 +00003317PyObject *
3318PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003319{
Victor Stinner99b95382011-07-04 14:23:54 +02003320#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003321 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003322#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003323 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003324#else
Victor Stinner793b5312011-04-27 00:24:21 +02003325 PyInterpreterState *interp = PyThreadState_GET()->interp;
3326 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3327 cannot use it to encode and decode filenames before it is loaded. Load
3328 the Python codec requires to encode at least its own filename. Use the C
3329 version of the locale codec until the codec registry is initialized and
3330 the Python codec is loaded.
3331
3332 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3333 cannot only rely on it: check also interp->fscodec_initialized for
3334 subinterpreters. */
3335 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003336 return PyUnicode_AsEncodedString(unicode,
3337 Py_FileSystemDefaultEncoding,
3338 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003339 }
3340 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003341 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003342 }
Victor Stinnerad158722010-10-27 00:25:46 +00003343#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003344}
3345
Alexander Belopolsky40018472011-02-26 01:02:56 +00003346PyObject *
3347PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003348 const char *encoding,
3349 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003350{
3351 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003352 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003353
Guido van Rossumd57fd912000-03-10 22:53:23 +00003354 if (!PyUnicode_Check(unicode)) {
3355 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003356 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003357 }
Fred Drakee4315f52000-05-09 19:53:39 +00003358
Fred Drakee4315f52000-05-09 19:53:39 +00003359 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003360 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003361 if ((strcmp(lower, "utf-8") == 0) ||
3362 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003363 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003364 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003365 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003366 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003367 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003368 }
Victor Stinner37296e82010-06-10 13:36:23 +00003369 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003370 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003371 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003372 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003373#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003374 else if (strcmp(lower, "mbcs") == 0)
3375 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003376#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003377 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003378 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003379 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003380
3381 /* Encode via the codec registry */
3382 v = PyCodec_Encode(unicode, encoding, errors);
3383 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003384 return NULL;
3385
3386 /* The normal path */
3387 if (PyBytes_Check(v))
3388 return v;
3389
3390 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003391 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003392 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003393 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003394
3395 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3396 "encoder %s returned bytearray instead of bytes",
3397 encoding);
3398 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003399 Py_DECREF(v);
3400 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003401 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003402
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003403 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3404 Py_DECREF(v);
3405 return b;
3406 }
3407
3408 PyErr_Format(PyExc_TypeError,
3409 "encoder did not return a bytes object (type=%.400s)",
3410 Py_TYPE(v)->tp_name);
3411 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003412 return NULL;
3413}
3414
Alexander Belopolsky40018472011-02-26 01:02:56 +00003415PyObject *
3416PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003417 const char *encoding,
3418 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003419{
3420 PyObject *v;
3421
3422 if (!PyUnicode_Check(unicode)) {
3423 PyErr_BadArgument();
3424 goto onError;
3425 }
3426
3427 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003428 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003429
3430 /* Encode via the codec registry */
3431 v = PyCodec_Encode(unicode, encoding, errors);
3432 if (v == NULL)
3433 goto onError;
3434 if (!PyUnicode_Check(v)) {
3435 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003436 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003437 Py_TYPE(v)->tp_name);
3438 Py_DECREF(v);
3439 goto onError;
3440 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003441 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003442
Benjamin Peterson29060642009-01-31 22:14:21 +00003443 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003444 return NULL;
3445}
3446
/* Locate the first byte sequence in str[0..len) that mbrtowc() cannot
   decode, starting from the initial conversion state.  Returns the byte
   offset of that sequence, or 0 when none is found.  Without HAVE_MBRTOWC
   the function always returns 0. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t decoded;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t consumed = mbrtowc(&decoded, cursor, remaining, &state);

        if (consumed == 0)
            /* Reached end of string */
            break;
        if (consumed == (size_t)-1 || consumed == (size_t)-2)
            /* Conversion error or incomplete character */
            return cursor - str;

        cursor += consumed;
        remaining -= consumed;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3477
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003478PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003479PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003480 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003481{
3482 wchar_t smallbuf[256];
3483 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3484 wchar_t *wstr;
3485 size_t wlen, wlen2;
3486 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003487 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003488 size_t error_pos;
3489 char *errmsg;
3490 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003491
3492 if (locale_error_handler(errors, &surrogateescape) < 0)
3493 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003494
3495 if (str[len] != '\0' || len != strlen(str)) {
3496 PyErr_SetString(PyExc_TypeError, "embedded null character");
3497 return NULL;
3498 }
3499
3500 if (surrogateescape)
3501 {
3502 wstr = _Py_char2wchar(str, &wlen);
3503 if (wstr == NULL) {
3504 if (wlen == (size_t)-1)
3505 PyErr_NoMemory();
3506 else
3507 PyErr_SetFromErrno(PyExc_OSError);
3508 return NULL;
3509 }
3510
3511 unicode = PyUnicode_FromWideChar(wstr, wlen);
3512 PyMem_Free(wstr);
3513 }
3514 else {
3515#ifndef HAVE_BROKEN_MBSTOWCS
3516 wlen = mbstowcs(NULL, str, 0);
3517#else
3518 wlen = len;
3519#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003520 if (wlen == (size_t)-1)
3521 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003522 if (wlen+1 <= smallbuf_len) {
3523 wstr = smallbuf;
3524 }
3525 else {
3526 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3527 return PyErr_NoMemory();
3528
3529 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3530 if (!wstr)
3531 return PyErr_NoMemory();
3532 }
3533
3534 /* This shouldn't fail now */
3535 wlen2 = mbstowcs(wstr, str, wlen+1);
3536 if (wlen2 == (size_t)-1) {
3537 if (wstr != smallbuf)
3538 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003539 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003540 }
3541#ifdef HAVE_BROKEN_MBSTOWCS
3542 assert(wlen2 == wlen);
3543#endif
3544 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3545 if (wstr != smallbuf)
3546 PyMem_Free(wstr);
3547 }
3548 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003549
3550decode_error:
3551 errmsg = strerror(errno);
3552 assert(errmsg != NULL);
3553
3554 error_pos = mbstowcs_errorpos(str, len);
3555 if (errmsg != NULL) {
3556 size_t errlen;
3557 wstr = _Py_char2wchar(errmsg, &errlen);
3558 if (wstr != NULL) {
3559 reason = PyUnicode_FromWideChar(wstr, errlen);
3560 PyMem_Free(wstr);
3561 } else
3562 errmsg = NULL;
3563 }
3564 if (errmsg == NULL)
3565 reason = PyUnicode_FromString(
3566 "mbstowcs() encountered an invalid multibyte sequence");
3567 if (reason == NULL)
3568 return NULL;
3569
3570 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3571 "locale", str, len,
3572 (Py_ssize_t)error_pos,
3573 (Py_ssize_t)(error_pos+1),
3574 reason);
3575 Py_DECREF(reason);
3576 if (exc != NULL) {
3577 PyCodec_StrictErrors(exc);
3578 Py_XDECREF(exc);
3579 }
3580 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003581}
3582
3583PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003584PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003585{
3586 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003587 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003588}
3589
3590
3591PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003592PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003593 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003594 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3595}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003596
Christian Heimes5894ba72007-11-04 11:43:14 +00003597PyObject*
3598PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3599{
Victor Stinner99b95382011-07-04 14:23:54 +02003600#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003601 return PyUnicode_DecodeMBCS(s, size, NULL);
3602#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003603 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003604#else
Victor Stinner793b5312011-04-27 00:24:21 +02003605 PyInterpreterState *interp = PyThreadState_GET()->interp;
3606 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3607 cannot use it to encode and decode filenames before it is loaded. Load
3608 the Python codec requires to encode at least its own filename. Use the C
3609 version of the locale codec until the codec registry is initialized and
3610 the Python codec is loaded.
3611
3612 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3613 cannot only rely on it: check also interp->fscodec_initialized for
3614 subinterpreters. */
3615 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003616 return PyUnicode_Decode(s, size,
3617 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003618 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003619 }
3620 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003621 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003622 }
Victor Stinnerad158722010-10-27 00:25:46 +00003623#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003624}
3625
Martin v. Löwis011e8422009-05-05 04:43:17 +00003626
3627int
Antoine Pitrou13348842012-01-29 18:36:34 +01003628_PyUnicode_HasNULChars(PyObject* s)
3629{
3630 static PyObject *nul = NULL;
3631
3632 if (nul == NULL)
3633 nul = PyUnicode_FromStringAndSize("\0", 1);
3634 if (nul == NULL)
3635 return -1;
3636 return PyUnicode_Contains(s, nul);
3637}
3638
3639
3640int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003641PyUnicode_FSConverter(PyObject* arg, void* addr)
3642{
3643 PyObject *output = NULL;
3644 Py_ssize_t size;
3645 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003646 if (arg == NULL) {
3647 Py_DECREF(*(PyObject**)addr);
3648 return 1;
3649 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003650 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003651 output = arg;
3652 Py_INCREF(output);
3653 }
3654 else {
3655 arg = PyUnicode_FromObject(arg);
3656 if (!arg)
3657 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003658 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003659 Py_DECREF(arg);
3660 if (!output)
3661 return 0;
3662 if (!PyBytes_Check(output)) {
3663 Py_DECREF(output);
3664 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3665 return 0;
3666 }
3667 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003668 size = PyBytes_GET_SIZE(output);
3669 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003670 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003671 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003672 Py_DECREF(output);
3673 return 0;
3674 }
3675 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003676 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003677}
3678
3679
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003680int
3681PyUnicode_FSDecoder(PyObject* arg, void* addr)
3682{
3683 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003684 if (arg == NULL) {
3685 Py_DECREF(*(PyObject**)addr);
3686 return 1;
3687 }
3688 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003689 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003690 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003691 output = arg;
3692 Py_INCREF(output);
3693 }
3694 else {
3695 arg = PyBytes_FromObject(arg);
3696 if (!arg)
3697 return 0;
3698 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3699 PyBytes_GET_SIZE(arg));
3700 Py_DECREF(arg);
3701 if (!output)
3702 return 0;
3703 if (!PyUnicode_Check(output)) {
3704 Py_DECREF(output);
3705 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3706 return 0;
3707 }
3708 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003709 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003710 Py_DECREF(output);
3711 return 0;
3712 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003713 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003714 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003715 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3716 Py_DECREF(output);
3717 return 0;
3718 }
3719 *(PyObject**)addr = output;
3720 return Py_CLEANUP_SUPPORTED;
3721}
3722
3723
Martin v. Löwis5b222132007-06-10 09:51:05 +00003724char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003725PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003726{
Christian Heimesf3863112007-11-22 07:46:41 +00003727 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003728
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003729 if (!PyUnicode_Check(unicode)) {
3730 PyErr_BadArgument();
3731 return NULL;
3732 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003733 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003734 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003735
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003736 if (PyUnicode_UTF8(unicode) == NULL) {
3737 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003738 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3739 if (bytes == NULL)
3740 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003741 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3742 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003743 Py_DECREF(bytes);
3744 return NULL;
3745 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003746 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3747 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3748 PyBytes_AS_STRING(bytes),
3749 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003750 Py_DECREF(bytes);
3751 }
3752
3753 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003754 *psize = PyUnicode_UTF8_LENGTH(unicode);
3755 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003756}
3757
3758char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003759PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003760{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003761 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3762}
3763
3764#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02003765static int unicode_as_unicode_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003766#endif
3767
3768
3769Py_UNICODE *
3770PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3771{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003772 const unsigned char *one_byte;
3773#if SIZEOF_WCHAR_T == 4
3774 const Py_UCS2 *two_bytes;
3775#else
3776 const Py_UCS4 *four_bytes;
3777 const Py_UCS4 *ucs4_end;
3778 Py_ssize_t num_surrogates;
3779#endif
3780 wchar_t *w;
3781 wchar_t *wchar_end;
3782
3783 if (!PyUnicode_Check(unicode)) {
3784 PyErr_BadArgument();
3785 return NULL;
3786 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003787 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003788 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003789 assert(_PyUnicode_KIND(unicode) != 0);
3790 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003791
3792#ifdef Py_DEBUG
3793 ++unicode_as_unicode_calls;
3794#endif
3795
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003796 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003797#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003798 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3799 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003800 num_surrogates = 0;
3801
3802 for (; four_bytes < ucs4_end; ++four_bytes) {
3803 if (*four_bytes > 0xFFFF)
3804 ++num_surrogates;
3805 }
3806
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003807 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3808 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3809 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003810 PyErr_NoMemory();
3811 return NULL;
3812 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003813 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003814
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003815 w = _PyUnicode_WSTR(unicode);
3816 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3817 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003818 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3819 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003820 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003821 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003822 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3823 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003824 }
3825 else
3826 *w = *four_bytes;
3827
3828 if (w > wchar_end) {
3829 assert(0 && "Miscalculated string end");
3830 }
3831 }
3832 *w = 0;
3833#else
3834 /* sizeof(wchar_t) == 4 */
3835 Py_FatalError("Impossible unicode object state, wstr and str "
3836 "should share memory already.");
3837 return NULL;
3838#endif
3839 }
3840 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003841 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3842 (_PyUnicode_LENGTH(unicode) + 1));
3843 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003844 PyErr_NoMemory();
3845 return NULL;
3846 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003847 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3848 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3849 w = _PyUnicode_WSTR(unicode);
3850 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003851
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003852 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3853 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003854 for (; w < wchar_end; ++one_byte, ++w)
3855 *w = *one_byte;
3856 /* null-terminate the wstr */
3857 *w = 0;
3858 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003859 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003860#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003861 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003862 for (; w < wchar_end; ++two_bytes, ++w)
3863 *w = *two_bytes;
3864 /* null-terminate the wstr */
3865 *w = 0;
3866#else
3867 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003868 PyObject_FREE(_PyUnicode_WSTR(unicode));
3869 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003870 Py_FatalError("Impossible unicode object state, wstr "
3871 "and str should share memory already.");
3872 return NULL;
3873#endif
3874 }
3875 else {
3876 assert(0 && "This should never happen.");
3877 }
3878 }
3879 }
3880 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003881 *size = PyUnicode_WSTR_LENGTH(unicode);
3882 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003883}
3884
Alexander Belopolsky40018472011-02-26 01:02:56 +00003885Py_UNICODE *
3886PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003887{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003888 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003889}
3890
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003891
Alexander Belopolsky40018472011-02-26 01:02:56 +00003892Py_ssize_t
3893PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003894{
3895 if (!PyUnicode_Check(unicode)) {
3896 PyErr_BadArgument();
3897 goto onError;
3898 }
3899 return PyUnicode_GET_SIZE(unicode);
3900
Benjamin Peterson29060642009-01-31 22:14:21 +00003901 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003902 return -1;
3903}
3904
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003905Py_ssize_t
3906PyUnicode_GetLength(PyObject *unicode)
3907{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003908 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003909 PyErr_BadArgument();
3910 return -1;
3911 }
3912
3913 return PyUnicode_GET_LENGTH(unicode);
3914}
3915
3916Py_UCS4
3917PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3918{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003919 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3920 PyErr_BadArgument();
3921 return (Py_UCS4)-1;
3922 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003923 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003924 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003925 return (Py_UCS4)-1;
3926 }
3927 return PyUnicode_READ_CHAR(unicode, index);
3928}
3929
3930int
3931PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3932{
3933 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003934 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003935 return -1;
3936 }
Victor Stinner488fa492011-12-12 00:01:39 +01003937 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003938 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003939 PyErr_SetString(PyExc_IndexError, "string index out of range");
3940 return -1;
3941 }
Victor Stinner488fa492011-12-12 00:01:39 +01003942 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003943 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003944 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3945 index, ch);
3946 return 0;
3947}
3948
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    const char *default_encoding = "utf-8";

    return default_encoding;
}
3954
Victor Stinner554f3f02010-06-16 23:33:54 +00003955/* create or adjust a UnicodeDecodeError */
3956static void
3957make_decode_exception(PyObject **exceptionObject,
3958 const char *encoding,
3959 const char *input, Py_ssize_t length,
3960 Py_ssize_t startpos, Py_ssize_t endpos,
3961 const char *reason)
3962{
3963 if (*exceptionObject == NULL) {
3964 *exceptionObject = PyUnicodeDecodeError_Create(
3965 encoding, input, length, startpos, endpos, reason);
3966 }
3967 else {
3968 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3969 goto onError;
3970 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3971 goto onError;
3972 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3973 goto onError;
3974 }
3975 return;
3976
3977onError:
3978 Py_DECREF(*exceptionObject);
3979 *exceptionObject = NULL;
3980}
3981
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003982/* error handling callback helper:
3983 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003984 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003985 and adjust various state variables.
3986 return 0 on success, -1 on error
3987*/
3988
Alexander Belopolsky40018472011-02-26 01:02:56 +00003989static int
3990unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003991 const char *encoding, const char *reason,
3992 const char **input, const char **inend, Py_ssize_t *startinpos,
3993 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003994 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003995{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003996 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003997
3998 PyObject *restuple = NULL;
3999 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004000 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004001 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004002 Py_ssize_t requiredsize;
4003 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004004 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004005 int res = -1;
4006
Victor Stinner596a6c42011-11-09 00:02:18 +01004007 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
4008 outsize = PyUnicode_GET_LENGTH(*output);
4009 else
4010 outsize = _PyUnicode_WSTR_LENGTH(*output);
4011
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004012 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004013 *errorHandler = PyCodec_LookupError(errors);
4014 if (*errorHandler == NULL)
4015 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004016 }
4017
Victor Stinner554f3f02010-06-16 23:33:54 +00004018 make_decode_exception(exceptionObject,
4019 encoding,
4020 *input, *inend - *input,
4021 *startinpos, *endinpos,
4022 reason);
4023 if (*exceptionObject == NULL)
4024 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004025
4026 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4027 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004028 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004029 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00004030 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004031 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004032 }
4033 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004034 goto onError;
Benjamin Petersonbac79492012-01-14 13:34:47 -05004035 if (PyUnicode_READY(repunicode) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004036 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004037
4038 /* Copy back the bytes variables, which might have been modified by the
4039 callback */
4040 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4041 if (!inputobj)
4042 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004043 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004044 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004045 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004046 *input = PyBytes_AS_STRING(inputobj);
4047 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004048 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004049 /* we can DECREF safely, as the exception has another reference,
4050 so the object won't go away. */
4051 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004052
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004053 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004054 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004055 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004056 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4057 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004058 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004059
Victor Stinner596a6c42011-11-09 00:02:18 +01004060 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
4061 /* need more space? (at least enough for what we
4062 have+the replacement+the rest of the string (starting
4063 at the new input position), so we won't have to check space
4064 when there are no errors in the rest of the string) */
4065 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
4066 requiredsize = *outpos + replen + insize-newpos;
4067 if (requiredsize > outsize) {
4068 if (requiredsize<2*outsize)
4069 requiredsize = 2*outsize;
4070 if (unicode_resize(output, requiredsize) < 0)
4071 goto onError;
4072 }
4073 if (unicode_widen(output, PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004074 goto onError;
Victor Stinner596a6c42011-11-09 00:02:18 +01004075 copy_characters(*output, *outpos, repunicode, 0, replen);
4076 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004077 }
Victor Stinner596a6c42011-11-09 00:02:18 +01004078 else {
4079 wchar_t *repwstr;
4080 Py_ssize_t repwlen;
4081 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4082 if (repwstr == NULL)
4083 goto onError;
4084 /* need more space? (at least enough for what we
4085 have+the replacement+the rest of the string (starting
4086 at the new input position), so we won't have to check space
4087 when there are no errors in the rest of the string) */
4088 requiredsize = *outpos + repwlen + insize-newpos;
4089 if (requiredsize > outsize) {
4090 if (requiredsize < 2*outsize)
4091 requiredsize = 2*outsize;
4092 if (unicode_resize(output, requiredsize) < 0)
4093 goto onError;
4094 }
4095 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4096 *outpos += repwlen;
4097 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004098 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004099 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004100
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004101 /* we made it! */
4102 res = 0;
4103
Benjamin Peterson29060642009-01-31 22:14:21 +00004104 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004105 Py_XDECREF(restuple);
4106 return res;
4107}
4108
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004109/* --- UTF-7 Codec -------------------------------------------------------- */
4110
Antoine Pitrou244651a2009-05-04 18:56:13 +00004111/* See RFC2152 for details. We encode conservatively and decode liberally. */
4112
/* Three small macros implementing the base-64 alphabet. */

/* IS_BASE64: true when c is one of A-Z, a-z, 0-9, '+' or '/'. */

#define IS_BASE64(c)                        \
    (((c) >= 'A' && (c) <= 'Z') ||          \
     ((c) >= 'a' && (c) <= 'z') ||          \
     ((c) >= '0' && (c) <= '9') ||          \
     (c) == '+' || (c) == '/')

/* FROM_BASE64: the 6-bit value (0..63) of c.  Only meaningful when
   IS_BASE64(c) holds; any other character maps to 63. */

#define FROM_BASE64(c)                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)

/* TO_BASE64: the base-64 character for the low 6 bits of n. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: true when a byte found in a UTF-7 string stands for
 * itself.  Decoding is permissive: every ASCII byte decodes to itself
 * except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c)                    \
    ((c) <= 127 && (c) != '+')
4143
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c must be in 1..127 to be encoded directly; the range test comes first
 * so the table is never indexed out of bounds.  directO and directWS are
 * parenthesized so that any expression can be passed for them. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004189
Alexander Belopolsky40018472011-02-26 01:02:56 +00004190PyObject *
4191PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004192 Py_ssize_t size,
4193 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004194{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004195 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4196}
4197
Antoine Pitrou244651a2009-05-04 18:56:13 +00004198/* The decoder. The only state we preserve is our read position,
4199 * i.e. how many characters we have consumed. So if we end in the
4200 * middle of a shift sequence we have to back off the read position
4201 * and the output to the beginning of the sequence, otherwise we lose
4202 * all the shift state (seen bits, number of bits seen, high
4203 * surrogate). */
4204
Alexander Belopolsky40018472011-02-26 01:02:56 +00004205PyObject *
4206PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004207 Py_ssize_t size,
4208 const char *errors,
4209 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004210{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004211 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004212 Py_ssize_t startinpos;
4213 Py_ssize_t endinpos;
4214 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004215 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004216 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004217 const char *errmsg = "";
4218 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004219 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004220 unsigned int base64bits = 0;
4221 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004222 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004223 PyObject *errorHandler = NULL;
4224 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004225
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004226 /* Start off assuming it's all ASCII. Widen later as necessary. */
4227 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004228 if (!unicode)
4229 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004230 if (size == 0) {
4231 if (consumed)
4232 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004233 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004234 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004235
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004236 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004237 e = s + size;
4238
4239 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004240 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004241 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004242 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004243
Antoine Pitrou244651a2009-05-04 18:56:13 +00004244 if (inShift) { /* in a base-64 section */
4245 if (IS_BASE64(ch)) { /* consume a base-64 character */
4246 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4247 base64bits += 6;
4248 s++;
4249 if (base64bits >= 16) {
4250 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004251 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004252 base64bits -= 16;
4253 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4254 if (surrogate) {
4255 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004256 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4257 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004258 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
4259 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004260 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004261 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004262 }
4263 else {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004264 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4265 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004266 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004267 }
4268 }
Victor Stinner551ac952011-11-29 22:58:13 +01004269 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004270 /* first surrogate */
4271 surrogate = outCh;
4272 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004273 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004274 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
4275 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004276 }
4277 }
4278 }
4279 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004280 inShift = 0;
4281 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004282 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004283 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4284 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004285 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004286 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004287 if (base64bits > 0) { /* left-over bits */
4288 if (base64bits >= 6) {
4289 /* We've seen at least one base-64 character */
4290 errmsg = "partial character in shift sequence";
4291 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004292 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004293 else {
4294 /* Some bits remain; they should be zero */
4295 if (base64buffer != 0) {
4296 errmsg = "non-zero padding bits in shift sequence";
4297 goto utf7Error;
4298 }
4299 }
4300 }
4301 if (ch != '-') {
4302 /* '-' is absorbed; other terminating
4303 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004304 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4305 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004306 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004307 }
4308 }
4309 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004310 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004311 s++; /* consume '+' */
4312 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004313 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004314 if (unicode_putchar(&unicode, &outpos, '+') < 0)
4315 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004316 }
4317 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004318 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004319 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004320 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004321 }
4322 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004323 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004324 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4325 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004326 s++;
4327 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004328 else {
4329 startinpos = s-starts;
4330 s++;
4331 errmsg = "unexpected special character";
4332 goto utf7Error;
4333 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004334 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004335utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004336 endinpos = s-starts;
4337 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00004338 errors, &errorHandler,
4339 "utf7", errmsg,
4340 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004341 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004342 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004343 }
4344
Antoine Pitrou244651a2009-05-04 18:56:13 +00004345 /* end of string */
4346
4347 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4348 /* if we're in an inconsistent state, that's an error */
4349 if (surrogate ||
4350 (base64bits >= 6) ||
4351 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004352 endinpos = size;
4353 if (unicode_decode_call_errorhandler(
4354 errors, &errorHandler,
4355 "utf7", "unterminated shift sequence",
4356 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004357 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004358 goto onError;
4359 if (s < e)
4360 goto restart;
4361 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004362 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004363
4364 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004365 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004366 if (inShift) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004367 outpos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004368 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004369 }
4370 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004371 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004372 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004373 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004374
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004375 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004376 goto onError;
4377
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004378 Py_XDECREF(errorHandler);
4379 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004380 return unicode_result(unicode);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004381
Benjamin Peterson29060642009-01-31 22:14:21 +00004382 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004383 Py_XDECREF(errorHandler);
4384 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004385 Py_DECREF(unicode);
4386 return NULL;
4387}
4388
4389
Alexander Belopolsky40018472011-02-26 01:02:56 +00004390PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004391_PyUnicode_EncodeUTF7(PyObject *str,
4392 int base64SetO,
4393 int base64WhiteSpace,
4394 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004395{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004396 int kind;
4397 void *data;
4398 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004399 PyObject *v;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004400 Py_ssize_t allocated;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004401 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004402 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004403 unsigned int base64bits = 0;
4404 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004405 char * out;
4406 char * start;
4407
Benjamin Petersonbac79492012-01-14 13:34:47 -05004408 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004409 return NULL;
4410 kind = PyUnicode_KIND(str);
4411 data = PyUnicode_DATA(str);
4412 len = PyUnicode_GET_LENGTH(str);
4413
4414 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004415 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004416
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004417 /* It might be possible to tighten this worst case */
4418 allocated = 8 * len;
4419 if (allocated / 8 != len)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004420 return PyErr_NoMemory();
4421
Antoine Pitrou244651a2009-05-04 18:56:13 +00004422 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004423 if (v == NULL)
4424 return NULL;
4425
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004426 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004427 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004428 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004429
Antoine Pitrou244651a2009-05-04 18:56:13 +00004430 if (inShift) {
4431 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4432 /* shifting out */
4433 if (base64bits) { /* output remaining bits */
4434 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4435 base64buffer = 0;
4436 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004437 }
4438 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004439 /* Characters not in the BASE64 set implicitly unshift the sequence
4440 so no '-' is required, except if the character is itself a '-' */
4441 if (IS_BASE64(ch) || ch == '-') {
4442 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004443 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004444 *out++ = (char) ch;
4445 }
4446 else {
4447 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004448 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004449 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004450 else { /* not in a shift sequence */
4451 if (ch == '+') {
4452 *out++ = '+';
4453 *out++ = '-';
4454 }
4455 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4456 *out++ = (char) ch;
4457 }
4458 else {
4459 *out++ = '+';
4460 inShift = 1;
4461 goto encode_char;
4462 }
4463 }
4464 continue;
4465encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004466 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004467 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004468
Antoine Pitrou244651a2009-05-04 18:56:13 +00004469 /* code first surrogate */
4470 base64bits += 16;
4471 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4472 while (base64bits >= 6) {
4473 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4474 base64bits -= 6;
4475 }
4476 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004477 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004478 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004479 base64bits += 16;
4480 base64buffer = (base64buffer << 16) | ch;
4481 while (base64bits >= 6) {
4482 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4483 base64bits -= 6;
4484 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004485 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004486 if (base64bits)
4487 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4488 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004489 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004490 if (_PyBytes_Resize(&v, out - start) < 0)
4491 return NULL;
4492 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004493}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004494PyObject *
4495PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4496 Py_ssize_t size,
4497 int base64SetO,
4498 int base64WhiteSpace,
4499 const char *errors)
4500{
4501 PyObject *result;
4502 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4503 if (tmp == NULL)
4504 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004505 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004506 base64WhiteSpace, errors);
4507 Py_DECREF(tmp);
4508 return result;
4509}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004510
Antoine Pitrou244651a2009-05-04 18:56:13 +00004511#undef IS_BASE64
4512#undef FROM_BASE64
4513#undef TO_BASE64
4514#undef DECODE_DIRECT
4515#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004516
Guido van Rossumd57fd912000-03-10 22:53:23 +00004517/* --- UTF-8 Codec -------------------------------------------------------- */
4518
/* Map a UTF-8 lead byte to the length of the sequence it introduces.
   Zero means the byte is not a legal lead byte (continuation bytes 80-BF,
   the overlong prefixes C0-C1, and F5-FF).  See RFC 3629 for details.
   The table is only ever read, so it is declared const. */
static
const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4540
Alexander Belopolsky40018472011-02-26 01:02:56 +00004541PyObject *
4542PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004543 Py_ssize_t size,
4544 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004545{
Walter Dörwald69652032004-09-07 20:24:22 +00004546 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4547}
4548
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004549#include "stringlib/ucs1lib.h"
4550#include "stringlib/codecs.h"
4551#include "stringlib/undef.h"
4552
4553#include "stringlib/ucs2lib.h"
4554#include "stringlib/codecs.h"
4555#include "stringlib/undef.h"
4556
4557#include "stringlib/ucs4lib.h"
4558#include "stringlib/codecs.h"
4559#include "stringlib/undef.h"
4560
Antoine Pitrouab868312009-01-10 15:40:25 +00004561/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4562#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4563
4564/* Mask to quickly check whether a C 'long' contains a
4565 non-ASCII, UTF8-encoded char. */
4566#if (SIZEOF_LONG == 8)
4567# define ASCII_CHAR_MASK 0x8080808080808080L
4568#elif (SIZEOF_LONG == 4)
4569# define ASCII_CHAR_MASK 0x80808080L
4570#else
4571# error C 'long' size should be either 4 or 8!
4572#endif
4573
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004574/* Scans a UTF-8 string and returns the maximum character to be expected
4575 and the size of the decoded unicode string.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004576
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004577 This function doesn't check for errors, these checks are performed in
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004578 PyUnicode_DecodeUTF8Stateful.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004579 */
4580static Py_UCS4
Victor Stinnera1d12bb2011-12-11 21:53:09 +01004581utf8_scanner(const unsigned char *p, Py_ssize_t string_size, Py_ssize_t *unicode_size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004582{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004583 Py_ssize_t char_count = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004584 const unsigned char *end = p + string_size;
4585 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004586
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004587 assert(unicode_size != NULL);
4588
4589 /* By having a cascade of independent loops which fallback onto each
4590 other, we minimize the amount of work done in the average loop
4591 iteration, and we also maximize the CPU's ability to predict
4592 branches correctly (because a given condition will have always the
4593 same boolean outcome except perhaps in the last iteration of the
4594 corresponding loop).
4595 In the general case this brings us rather close to decoding
4596 performance pre-PEP 393, despite the two-pass decoding.
4597
4598 Note that the pure ASCII loop is not duplicated once a non-ASCII
4599 character has been encountered. It is actually a pessimization (by
4600 a significant factor) to use this loop on text with many non-ASCII
4601 characters, and it is important to avoid bad performance on valid
4602 utf-8 data (invalid utf-8 being a different can of worms).
4603 */
4604
4605 /* ASCII */
4606 for (; p < end; ++p) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004607 /* Only check value if it's not a ASCII char... */
4608 if (*p < 0x80) {
4609 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4610 an explanation. */
4611 if (!((size_t) p & LONG_PTR_MASK)) {
4612 /* Help register allocation */
4613 register const unsigned char *_p = p;
4614 while (_p < aligned_end) {
4615 unsigned long value = *(unsigned long *) _p;
4616 if (value & ASCII_CHAR_MASK)
4617 break;
4618 _p += SIZEOF_LONG;
4619 char_count += SIZEOF_LONG;
4620 }
4621 p = _p;
4622 if (p == end)
4623 break;
4624 }
4625 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004626 if (*p < 0x80)
4627 ++char_count;
4628 else
4629 goto _ucs1loop;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004630 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004631 *unicode_size = char_count;
4632 return 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004633
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004634_ucs1loop:
4635 for (; p < end; ++p) {
4636 if (*p < 0xc4)
4637 char_count += ((*p & 0xc0) != 0x80);
4638 else
4639 goto _ucs2loop;
4640 }
4641 *unicode_size = char_count;
4642 return 255;
4643
4644_ucs2loop:
4645 for (; p < end; ++p) {
4646 if (*p < 0xf0)
4647 char_count += ((*p & 0xc0) != 0x80);
4648 else
4649 goto _ucs4loop;
4650 }
4651 *unicode_size = char_count;
4652 return 65535;
4653
4654_ucs4loop:
4655 for (; p < end; ++p) {
4656 char_count += ((*p & 0xc0) != 0x80);
4657 }
4658 *unicode_size = char_count;
4659 return 65537;
4660}
4661
/* Similar to PyUnicode_WRITE but may attempt to widen and resize the string
   in case of errors. Implicit parameters: the variable `unicode` and the
   label `onError`, both of which must exist in the expanding function.
   `index` is evaluated exactly once.
   Potential resizing overallocates (by one eighth of the index), so the
   result needs to shrink at the end.
*/
#define WRITE_MAYBE_FAIL(index, value)                                  \
    do {                                                                \
        /* private name: must not capture a variable used in `index` */ \
        Py_ssize_t _wmf_pos = (index);                                  \
        if (_wmf_pos > PyUnicode_GET_LENGTH(unicode) &&                 \
            unicode_resize(&unicode, _wmf_pos + _wmf_pos/8) < 0)        \
            goto onError;                                               \
        if (unicode_putchar(&unicode, &_wmf_pos, (value)) < 0)          \
            goto onError;                                               \
    } while (0)
4675
Victor Stinnerbf6e5602011-12-12 01:53:47 +01004676static PyObject *
Victor Stinner785938e2011-12-11 20:09:03 +01004677decode_utf8_errors(const char *starts,
4678 Py_ssize_t size,
4679 const char *errors,
4680 Py_ssize_t *consumed,
4681 const char *s,
4682 PyObject *unicode,
4683 Py_ssize_t i)
Walter Dörwald69652032004-09-07 20:24:22 +00004684{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004685 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004686 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004687 Py_ssize_t startinpos;
4688 Py_ssize_t endinpos;
Victor Stinner785938e2011-12-11 20:09:03 +01004689 const char *e = starts + size;
4690 const char *aligned_end;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004691 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004692 PyObject *errorHandler = NULL;
4693 PyObject *exc = NULL;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004694
Antoine Pitrouab868312009-01-10 15:40:25 +00004695 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004696
4697 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004698 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004699
4700 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004701 /* Fast path for runs of ASCII characters. Given that common UTF-8
4702 input will consist of an overwhelming majority of ASCII
4703 characters, we try to optimize for this case by checking
4704 as many characters as a C 'long' can contain.
4705 First, check if we can do an aligned read, as most CPUs have
4706 a penalty for unaligned reads.
4707 */
4708 if (!((size_t) s & LONG_PTR_MASK)) {
4709 /* Help register allocation */
4710 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004711 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004712 while (_s < aligned_end) {
4713 /* Read a whole long at a time (either 4 or 8 bytes),
4714 and do a fast unrolled copy if it only contains ASCII
4715 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004716 unsigned long value = *(unsigned long *) _s;
4717 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004718 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004719 WRITE_MAYBE_FAIL(_i+0, _s[0]);
4720 WRITE_MAYBE_FAIL(_i+1, _s[1]);
4721 WRITE_MAYBE_FAIL(_i+2, _s[2]);
4722 WRITE_MAYBE_FAIL(_i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004723#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004724 WRITE_MAYBE_FAIL(_i+4, _s[4]);
4725 WRITE_MAYBE_FAIL(_i+5, _s[5]);
4726 WRITE_MAYBE_FAIL(_i+6, _s[6]);
4727 WRITE_MAYBE_FAIL(_i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004728#endif
4729 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004730 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004731 }
4732 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004733 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004734 if (s == e)
4735 break;
4736 ch = (unsigned char)*s;
4737 }
4738 }
4739
4740 if (ch < 0x80) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004741 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004742 s++;
4743 continue;
4744 }
4745
4746 n = utf8_code_length[ch];
4747
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004748 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004749 if (consumed)
4750 break;
4751 else {
4752 errmsg = "unexpected end of data";
4753 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004754 endinpos = startinpos+1;
4755 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4756 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004757 goto utf8Error;
4758 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004759 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004760
4761 switch (n) {
4762
4763 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004764 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004765 startinpos = s-starts;
4766 endinpos = startinpos+1;
4767 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004768
4769 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004770 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004771 startinpos = s-starts;
4772 endinpos = startinpos+1;
4773 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004774
4775 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004776 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004777 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004778 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004779 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004780 goto utf8Error;
4781 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004782 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004783 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004784 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004785 break;
4786
4787 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004788 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4789 will result in surrogates in range d800-dfff. Surrogates are
4790 not valid UTF-8 so they are rejected.
4791 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4792 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004793 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004794 (s[2] & 0xc0) != 0x80 ||
4795 ((unsigned char)s[0] == 0xE0 &&
4796 (unsigned char)s[1] < 0xA0) ||
4797 ((unsigned char)s[0] == 0xED &&
4798 (unsigned char)s[1] > 0x9F)) {
4799 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004800 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004801 endinpos = startinpos + 1;
4802
4803 /* if s[1] first two bits are 1 and 0, then the invalid
4804 continuation byte is s[2], so increment endinpos by 1,
4805 if not, s[1] is invalid and endinpos doesn't need to
4806 be incremented. */
4807 if ((s[1] & 0xC0) == 0x80)
4808 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004809 goto utf8Error;
4810 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004811 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004812 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004813 WRITE_MAYBE_FAIL(i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004814 break;
4815
4816 case 4:
4817 if ((s[1] & 0xc0) != 0x80 ||
4818 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004819 (s[3] & 0xc0) != 0x80 ||
4820 ((unsigned char)s[0] == 0xF0 &&
4821 (unsigned char)s[1] < 0x90) ||
4822 ((unsigned char)s[0] == 0xF4 &&
4823 (unsigned char)s[1] > 0x8F)) {
4824 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004825 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004826 endinpos = startinpos + 1;
4827 if ((s[1] & 0xC0) == 0x80) {
4828 endinpos++;
4829 if ((s[2] & 0xC0) == 0x80)
4830 endinpos++;
4831 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004832 goto utf8Error;
4833 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004834 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004835 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Victor Stinner8faf8212011-12-08 22:14:11 +01004836 assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
Ezio Melotti57221d02010-07-01 07:32:02 +00004837
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004838 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004839 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004840 }
4841 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004842 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004843
Benjamin Peterson29060642009-01-31 22:14:21 +00004844 utf8Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00004845 if (unicode_decode_call_errorhandler(
4846 errors, &errorHandler,
Victor Stinnercbe01342012-02-14 01:17:45 +01004847 "utf-8", errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00004848 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004849 &unicode, &i))
Benjamin Peterson29060642009-01-31 22:14:21 +00004850 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004851 /* Update data because unicode_decode_call_errorhandler might have
4852 re-created or resized the unicode object. */
Benjamin Peterson29060642009-01-31 22:14:21 +00004853 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004854 }
Walter Dörwald69652032004-09-07 20:24:22 +00004855 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004856 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004857
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004858 /* Adjust length and ready string when it contained errors and
4859 is of the old resizable kind. */
Victor Stinner785938e2011-12-11 20:09:03 +01004860 if (unicode_resize(&unicode, i) < 0)
4861 goto onError;
4862 unicode_adjust_maxchar(&unicode);
4863 if (unicode == NULL)
4864 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004865
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004866 Py_XDECREF(errorHandler);
4867 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004868 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01004869 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004870
Benjamin Peterson29060642009-01-31 22:14:21 +00004871 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004872 Py_XDECREF(errorHandler);
4873 Py_XDECREF(exc);
Victor Stinner785938e2011-12-11 20:09:03 +01004874 Py_XDECREF(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004875 return NULL;
4876}
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004877#undef WRITE_MAYBE_FAIL
Antoine Pitrouab868312009-01-10 15:40:25 +00004878
Victor Stinner785938e2011-12-11 20:09:03 +01004879PyObject *
4880PyUnicode_DecodeUTF8Stateful(const char *s,
4881 Py_ssize_t size,
4882 const char *errors,
4883 Py_ssize_t *consumed)
4884{
4885 Py_UCS4 maxchar = 0;
4886 Py_ssize_t unicode_size;
4887 int has_errors = 0;
4888 PyObject *unicode;
4889 int kind;
4890 void *data;
4891 const char *starts = s;
4892 const char *e;
4893 Py_ssize_t i;
4894
4895 if (size == 0) {
4896 if (consumed)
4897 *consumed = 0;
Victor Stinner382955f2011-12-11 21:44:00 +01004898 Py_INCREF(unicode_empty);
4899 return unicode_empty;
Victor Stinner785938e2011-12-11 20:09:03 +01004900 }
4901
Victor Stinnera1d12bb2011-12-11 21:53:09 +01004902 maxchar = utf8_scanner((const unsigned char *)s, size, &unicode_size);
Victor Stinner785938e2011-12-11 20:09:03 +01004903
4904 /* When the string is ASCII only, just use memcpy and return.
4905 unicode_size may be != size if there is an incomplete UTF-8
4906 sequence at the end of the ASCII block. */
4907 if (maxchar < 128 && size == unicode_size) {
4908 if (consumed)
4909 *consumed = size;
Victor Stinnerab870212011-12-17 22:39:43 +01004910 return unicode_fromascii((const unsigned char *)s, size);
Victor Stinner785938e2011-12-11 20:09:03 +01004911 }
4912
4913 unicode = PyUnicode_New(unicode_size, maxchar);
4914 if (!unicode)
4915 return NULL;
4916 kind = PyUnicode_KIND(unicode);
4917 data = PyUnicode_DATA(unicode);
4918
4919 /* Unpack UTF-8 encoded data */
4920 i = 0;
4921 e = starts + size;
4922 switch (kind) {
4923 case PyUnicode_1BYTE_KIND:
4924 has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
4925 break;
4926 case PyUnicode_2BYTE_KIND:
4927 has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
4928 break;
4929 case PyUnicode_4BYTE_KIND:
4930 has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
4931 break;
4932 }
4933 if (!has_errors) {
4934 /* Ensure the unicode size calculation was correct */
4935 assert(i == unicode_size);
4936 assert(s == e);
4937 if (consumed)
4938 *consumed = size;
4939 return unicode;
4940 }
4941
4942 /* In case of errors, maxchar and size computation might be incorrect;
4943 code below refits and resizes as necessary. */
4944 return decode_utf8_errors(starts, size, errors, consumed, s, unicode, i);
4945}
4946
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Returns a NUL-terminated wchar_t buffer obtained from PyMem_Malloc(),
   or NULL if the buffer cannot be allocated.  Every byte that is not part
   of a well-formed sequence is emitted as 0xDC00 + byte and decoding
   resumes at the following byte. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *end;
    wchar_t *buffer, *out;

    /* Note: the decoded text never has more units than the input has
       bytes, so size + 1 elements are always enough. */
    if ((size + 1) > PY_SSIZE_T_MAX / sizeof(wchar_t)) {
        PyErr_NoMemory();
        return NULL;
    }
    buffer = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!buffer)
        return NULL;

    /* Unpack UTF-8 encoded data */
    out = buffer;
    end = s + size;
    while (s < end) {
        const Py_UCS4 lead = (unsigned char)*s;
        Py_UCS4 cp;
        int n;
        int decoded = 0;

        if (lead < 0x80) {
            *out++ = (wchar_t)lead;
            s++;
            continue;
        }

        n = utf8_code_length[lead];
        if (s + n <= end) {
            if (n == 2) {
                if ((s[1] & 0xc0) == 0x80) {
                    cp = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
                    assert ((cp > 0x007F) && (cp <= 0x07FF));
                    *out++ = (wchar_t)cp;
                    decoded = 1;
                }
            }
            else if (n == 3) {
                /* Decoding UTF-8 sequences in range
                   \xed\xa0\x80-\xed\xbf\xbf would result in surrogates in
                   range d800-dfff. Surrogates are not valid UTF-8 so they
                   are rejected.
                   See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
                   (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
                if ((s[1] & 0xc0) == 0x80 &&
                    (s[2] & 0xc0) == 0x80 &&
                    !((unsigned char)s[0] == 0xE0 &&
                      (unsigned char)s[1] < 0xA0) &&
                    !((unsigned char)s[0] == 0xED &&
                      (unsigned char)s[1] > 0x9F)) {
                    cp = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6)
                         + (s[2] & 0x3f);
                    assert ((cp > 0x07FF) && (cp <= 0xFFFF));
                    *out++ = (wchar_t)cp;
                    decoded = 1;
                }
            }
            else if (n == 4) {
                if ((s[1] & 0xc0) == 0x80 &&
                    (s[2] & 0xc0) == 0x80 &&
                    (s[3] & 0xc0) == 0x80 &&
                    !((unsigned char)s[0] == 0xF0 &&
                      (unsigned char)s[1] < 0x90) &&
                    !((unsigned char)s[0] == 0xF4 &&
                      (unsigned char)s[1] > 0x8F)) {
                    cp = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                         ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
                    assert ((cp > 0xFFFF) && (cp <= MAX_UNICODE));

#if SIZEOF_WCHAR_T == 4
                    *out++ = (wchar_t)cp;
#else
                    /* compute and append the two surrogates: */
                    *out++ = (wchar_t)Py_UNICODE_HIGH_SURROGATE(cp);
                    *out++ = (wchar_t)Py_UNICODE_LOW_SURROGATE(cp);
#endif
                    decoded = 1;
                }
            }
            /* n == 0 (illegal lead byte) and n == 1 fall through to the
               escape below */
        }

        if (decoded) {
            s += n;
        }
        else {
            /* surrogateescape: smuggle the offending byte through */
            *out++ = 0xDC00 + lead;
            s++;
        }
    }
    *out = L'\0';
    return buffer;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00005054
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005055/* Primary internal function which creates utf8 encoded bytes objects.
5056
5057 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005058 and allocate exactly as much space needed at the end. Else allocate the
5059 maximum possible needed (4 result bytes per Unicode character), and return
5060 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005061*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005062PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005063_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005064{
Victor Stinner6099a032011-12-18 14:22:26 +01005065 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005066 void *data;
5067 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005068
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005069 if (!PyUnicode_Check(unicode)) {
5070 PyErr_BadArgument();
5071 return NULL;
5072 }
5073
5074 if (PyUnicode_READY(unicode) == -1)
5075 return NULL;
5076
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005077 if (PyUnicode_UTF8(unicode))
5078 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5079 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005080
5081 kind = PyUnicode_KIND(unicode);
5082 data = PyUnicode_DATA(unicode);
5083 size = PyUnicode_GET_LENGTH(unicode);
5084
Benjamin Petersonead6b532011-12-20 17:23:42 -06005085 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005086 default:
5087 assert(0);
5088 case PyUnicode_1BYTE_KIND:
5089 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5090 assert(!PyUnicode_IS_ASCII(unicode));
5091 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5092 case PyUnicode_2BYTE_KIND:
5093 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5094 case PyUnicode_4BYTE_KIND:
5095 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005096 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005097}
5098
Alexander Belopolsky40018472011-02-26 01:02:56 +00005099PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005100PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5101 Py_ssize_t size,
5102 const char *errors)
5103{
5104 PyObject *v, *unicode;
5105
5106 unicode = PyUnicode_FromUnicode(s, size);
5107 if (unicode == NULL)
5108 return NULL;
5109 v = _PyUnicode_AsUTF8String(unicode, errors);
5110 Py_DECREF(unicode);
5111 return v;
5112}
5113
5114PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005115PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005116{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005117 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005118}
5119
Walter Dörwald41980ca2007-08-16 21:55:45 +00005120/* --- UTF-32 Codec ------------------------------------------------------- */
5121
5122PyObject *
5123PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005124 Py_ssize_t size,
5125 const char *errors,
5126 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005127{
5128 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5129}
5130
5131PyObject *
5132PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005133 Py_ssize_t size,
5134 const char *errors,
5135 int *byteorder,
5136 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005137{
5138 const char *starts = s;
5139 Py_ssize_t startinpos;
5140 Py_ssize_t endinpos;
5141 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005142 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005143 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005144 int bo = 0; /* assume native ordering by default */
5145 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005146 /* Offsets from q for retrieving bytes in the right order. */
5147#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5148 int iorder[] = {0, 1, 2, 3};
5149#else
5150 int iorder[] = {3, 2, 1, 0};
5151#endif
5152 PyObject *errorHandler = NULL;
5153 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005154
Walter Dörwald41980ca2007-08-16 21:55:45 +00005155 q = (unsigned char *)s;
5156 e = q + size;
5157
5158 if (byteorder)
5159 bo = *byteorder;
5160
5161 /* Check for BOM marks (U+FEFF) in the input and adjust current
5162 byte order setting accordingly. In native mode, the leading BOM
5163 mark is skipped, in all other modes, it is copied to the output
5164 stream as-is (giving a ZWNBSP character). */
5165 if (bo == 0) {
5166 if (size >= 4) {
5167 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00005168 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005169#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005170 if (bom == 0x0000FEFF) {
5171 q += 4;
5172 bo = -1;
5173 }
5174 else if (bom == 0xFFFE0000) {
5175 q += 4;
5176 bo = 1;
5177 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005178#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005179 if (bom == 0x0000FEFF) {
5180 q += 4;
5181 bo = 1;
5182 }
5183 else if (bom == 0xFFFE0000) {
5184 q += 4;
5185 bo = -1;
5186 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005187#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005188 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005189 }
5190
5191 if (bo == -1) {
5192 /* force LE */
5193 iorder[0] = 0;
5194 iorder[1] = 1;
5195 iorder[2] = 2;
5196 iorder[3] = 3;
5197 }
5198 else if (bo == 1) {
5199 /* force BE */
5200 iorder[0] = 3;
5201 iorder[1] = 2;
5202 iorder[2] = 1;
5203 iorder[3] = 0;
5204 }
5205
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005206 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005207 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005208 if (!unicode)
5209 return NULL;
5210 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005211 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005212 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005213
Walter Dörwald41980ca2007-08-16 21:55:45 +00005214 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005215 Py_UCS4 ch;
5216 /* remaining bytes at the end? (size should be divisible by 4) */
5217 if (e-q<4) {
5218 if (consumed)
5219 break;
5220 errmsg = "truncated data";
5221 startinpos = ((const char *)q)-starts;
5222 endinpos = ((const char *)e)-starts;
5223 goto utf32Error;
5224 /* The remaining input chars are ignored if the callback
5225 chooses to skip the input */
5226 }
5227 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5228 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005229
Benjamin Peterson29060642009-01-31 22:14:21 +00005230 if (ch >= 0x110000)
5231 {
5232 errmsg = "codepoint not in range(0x110000)";
5233 startinpos = ((const char *)q)-starts;
5234 endinpos = startinpos+4;
5235 goto utf32Error;
5236 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005237 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5238 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005239 q += 4;
5240 continue;
5241 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005242 if (unicode_decode_call_errorhandler(
5243 errors, &errorHandler,
5244 "utf32", errmsg,
5245 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005246 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005247 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005248 }
5249
5250 if (byteorder)
5251 *byteorder = bo;
5252
5253 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005254 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005255
5256 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005257 if (unicode_resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005258 goto onError;
5259
5260 Py_XDECREF(errorHandler);
5261 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005262 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005263
Benjamin Peterson29060642009-01-31 22:14:21 +00005264 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005265 Py_DECREF(unicode);
5266 Py_XDECREF(errorHandler);
5267 Py_XDECREF(exc);
5268 return NULL;
5269}
5270
5271PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005272_PyUnicode_EncodeUTF32(PyObject *str,
5273 const char *errors,
5274 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005275{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005276 int kind;
5277 void *data;
5278 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005279 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005280 unsigned char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005281 Py_ssize_t nsize, bytesize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005282 /* Offsets from p for storing byte pairs in the right order. */
5283#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5284 int iorder[] = {0, 1, 2, 3};
5285#else
5286 int iorder[] = {3, 2, 1, 0};
5287#endif
5288
Benjamin Peterson29060642009-01-31 22:14:21 +00005289#define STORECHAR(CH) \
5290 do { \
5291 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5292 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5293 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5294 p[iorder[0]] = (CH) & 0xff; \
5295 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005296 } while(0)
5297
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005298 if (!PyUnicode_Check(str)) {
5299 PyErr_BadArgument();
5300 return NULL;
5301 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005302 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005303 return NULL;
5304 kind = PyUnicode_KIND(str);
5305 data = PyUnicode_DATA(str);
5306 len = PyUnicode_GET_LENGTH(str);
5307
5308 nsize = len + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005309 bytesize = nsize * 4;
5310 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005311 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005312 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005313 if (v == NULL)
5314 return NULL;
5315
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005316 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005317 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005318 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005319 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005320 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005321
5322 if (byteorder == -1) {
5323 /* force LE */
5324 iorder[0] = 0;
5325 iorder[1] = 1;
5326 iorder[2] = 2;
5327 iorder[3] = 3;
5328 }
5329 else if (byteorder == 1) {
5330 /* force BE */
5331 iorder[0] = 3;
5332 iorder[1] = 2;
5333 iorder[2] = 1;
5334 iorder[3] = 0;
5335 }
5336
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005337 for (i = 0; i < len; i++)
5338 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005339
5340 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005341 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005342#undef STORECHAR
5343}
5344
Alexander Belopolsky40018472011-02-26 01:02:56 +00005345PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005346PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5347 Py_ssize_t size,
5348 const char *errors,
5349 int byteorder)
5350{
5351 PyObject *result;
5352 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5353 if (tmp == NULL)
5354 return NULL;
5355 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5356 Py_DECREF(tmp);
5357 return result;
5358}
5359
5360PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005361PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005362{
Victor Stinnerb960b342011-11-20 19:12:52 +01005363 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005364}
5365
Guido van Rossumd57fd912000-03-10 22:53:23 +00005366/* --- UTF-16 Codec ------------------------------------------------------- */
5367
Tim Peters772747b2001-08-09 22:21:55 +00005368PyObject *
5369PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005370 Py_ssize_t size,
5371 const char *errors,
5372 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005373{
Walter Dörwald69652032004-09-07 20:24:22 +00005374 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5375}
5376
Antoine Pitrouab868312009-01-10 15:40:25 +00005377/* Two masks for fast checking of whether a C 'long' may contain
5378 UTF16-encoded surrogate characters. This is an efficient heuristic,
5379 assuming that non-surrogate characters with a code point >= 0x8000 are
5380 rare in most input.
5381 FAST_CHAR_MASK is used when the input is in native byte ordering,
5382 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00005383*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005384#if (SIZEOF_LONG == 8)
5385# define FAST_CHAR_MASK 0x8000800080008000L
5386# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
5387#elif (SIZEOF_LONG == 4)
5388# define FAST_CHAR_MASK 0x80008000L
5389# define SWAPPED_FAST_CHAR_MASK 0x00800080L
5390#else
5391# error C 'long' size should be either 4 or 8!
5392#endif
5393
Walter Dörwald69652032004-09-07 20:24:22 +00005394PyObject *
5395PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005396 Py_ssize_t size,
5397 const char *errors,
5398 int *byteorder,
5399 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005400{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005401 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005402 Py_ssize_t startinpos;
5403 Py_ssize_t endinpos;
5404 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005405 PyObject *unicode;
Antoine Pitrouab868312009-01-10 15:40:25 +00005406 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005407 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005408 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005409 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005410 /* Offsets from q for retrieving byte pairs in the right order. */
5411#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5412 int ihi = 1, ilo = 0;
5413#else
5414 int ihi = 0, ilo = 1;
5415#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005416 PyObject *errorHandler = NULL;
5417 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005418
5419 /* Note: size will always be longer than the resulting Unicode
5420 character count */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005421 unicode = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005422 if (!unicode)
5423 return NULL;
5424 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005425 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005426 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005427
Tim Peters772747b2001-08-09 22:21:55 +00005428 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005429 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005430
5431 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005432 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005433
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005434 /* Check for BOM marks (U+FEFF) in the input and adjust current
5435 byte order setting accordingly. In native mode, the leading BOM
5436 mark is skipped, in all other modes, it is copied to the output
5437 stream as-is (giving a ZWNBSP character). */
5438 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005439 if (size >= 2) {
Victor Stinner24729f32011-11-10 20:31:37 +01005440 const Py_UCS4 bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005441#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005442 if (bom == 0xFEFF) {
5443 q += 2;
5444 bo = -1;
5445 }
5446 else if (bom == 0xFFFE) {
5447 q += 2;
5448 bo = 1;
5449 }
Tim Petersced69f82003-09-16 20:30:58 +00005450#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005451 if (bom == 0xFEFF) {
5452 q += 2;
5453 bo = 1;
5454 }
5455 else if (bom == 0xFFFE) {
5456 q += 2;
5457 bo = -1;
5458 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005459#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005460 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005461 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005462
Tim Peters772747b2001-08-09 22:21:55 +00005463 if (bo == -1) {
5464 /* force LE */
5465 ihi = 1;
5466 ilo = 0;
5467 }
5468 else if (bo == 1) {
5469 /* force BE */
5470 ihi = 0;
5471 ilo = 1;
5472 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005473#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5474 native_ordering = ilo < ihi;
5475#else
5476 native_ordering = ilo > ihi;
5477#endif
Tim Peters772747b2001-08-09 22:21:55 +00005478
Antoine Pitrouab868312009-01-10 15:40:25 +00005479 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005480 while (q < e) {
Victor Stinner24729f32011-11-10 20:31:37 +01005481 Py_UCS4 ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005482 /* First check for possible aligned read of a C 'long'. Unaligned
5483 reads are more expensive, better to defer to another iteration. */
5484 if (!((size_t) q & LONG_PTR_MASK)) {
5485 /* Fast path for runs of non-surrogate chars. */
5486 register const unsigned char *_q = q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005487 int kind = PyUnicode_KIND(unicode);
5488 void *data = PyUnicode_DATA(unicode);
5489 while (_q < aligned_end) {
5490 unsigned long block = * (unsigned long *) _q;
5491 unsigned short *pblock = (unsigned short*)&block;
5492 Py_UCS4 maxch;
5493 if (native_ordering) {
5494 /* Can use buffer directly */
5495 if (block & FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005496 break;
Antoine Pitrouab868312009-01-10 15:40:25 +00005497 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005498 else {
5499 /* Need to byte-swap */
5500 unsigned char *_p = (unsigned char*)pblock;
5501 if (block & SWAPPED_FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005502 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005503 _p[0] = _q[1];
5504 _p[1] = _q[0];
5505 _p[2] = _q[3];
5506 _p[3] = _q[2];
Antoine Pitrouab868312009-01-10 15:40:25 +00005507#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005508 _p[4] = _q[5];
5509 _p[5] = _q[4];
5510 _p[6] = _q[7];
5511 _p[7] = _q[6];
Antoine Pitrouab868312009-01-10 15:40:25 +00005512#endif
Antoine Pitrouab868312009-01-10 15:40:25 +00005513 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005514 maxch = Py_MAX(pblock[0], pblock[1]);
5515#if SIZEOF_LONG == 8
5516 maxch = Py_MAX(maxch, Py_MAX(pblock[2], pblock[3]));
5517#endif
5518 if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
5519 if (unicode_widen(&unicode, maxch) < 0)
5520 goto onError;
5521 kind = PyUnicode_KIND(unicode);
5522 data = PyUnicode_DATA(unicode);
5523 }
5524 PyUnicode_WRITE(kind, data, outpos++, pblock[0]);
5525 PyUnicode_WRITE(kind, data, outpos++, pblock[1]);
5526#if SIZEOF_LONG == 8
5527 PyUnicode_WRITE(kind, data, outpos++, pblock[2]);
5528 PyUnicode_WRITE(kind, data, outpos++, pblock[3]);
5529#endif
5530 _q += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00005531 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005532 q = _q;
5533 if (q >= e)
5534 break;
5535 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005536 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005537
Benjamin Peterson14339b62009-01-31 16:36:08 +00005538 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005539
Victor Stinner551ac952011-11-29 22:58:13 +01005540 if (!Py_UNICODE_IS_SURROGATE(ch)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005541 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5542 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005543 continue;
5544 }
5545
5546 /* UTF-16 code pair: */
5547 if (q > e) {
5548 errmsg = "unexpected end of data";
5549 startinpos = (((const char *)q) - 2) - starts;
5550 endinpos = ((const char *)e) + 1 - starts;
5551 goto utf16Error;
5552 }
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005553 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
5554 Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
Benjamin Peterson29060642009-01-31 22:14:21 +00005555 q += 2;
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005556 if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
Victor Stinner62aa4d02011-11-09 00:03:45 +01005557 if (unicode_putchar(&unicode, &outpos,
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005558 Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005559 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005560 continue;
5561 }
5562 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005563 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005564 startinpos = (((const char *)q)-4)-starts;
5565 endinpos = startinpos+2;
5566 goto utf16Error;
5567 }
5568
Benjamin Peterson14339b62009-01-31 16:36:08 +00005569 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005570 errmsg = "illegal encoding";
5571 startinpos = (((const char *)q)-2)-starts;
5572 endinpos = startinpos+2;
5573 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005574
Benjamin Peterson29060642009-01-31 22:14:21 +00005575 utf16Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005576 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005577 errors,
5578 &errorHandler,
5579 "utf16", errmsg,
5580 &starts,
5581 (const char **)&e,
5582 &startinpos,
5583 &endinpos,
5584 &exc,
5585 (const char **)&q,
5586 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005587 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005588 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005589 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005590 /* remaining byte at the end? (size should be even) */
5591 if (e == q) {
5592 if (!consumed) {
5593 errmsg = "truncated data";
5594 startinpos = ((const char *)q) - starts;
5595 endinpos = ((const char *)e) + 1 - starts;
Antoine Pitrouab868312009-01-10 15:40:25 +00005596 if (unicode_decode_call_errorhandler(
5597 errors,
5598 &errorHandler,
5599 "utf16", errmsg,
5600 &starts,
5601 (const char **)&e,
5602 &startinpos,
5603 &endinpos,
5604 &exc,
5605 (const char **)&q,
5606 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005607 &outpos))
Antoine Pitrouab868312009-01-10 15:40:25 +00005608 goto onError;
5609 /* The remaining input chars are ignored if the callback
5610 chooses to skip the input */
5611 }
5612 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005613
5614 if (byteorder)
5615 *byteorder = bo;
5616
Walter Dörwald69652032004-09-07 20:24:22 +00005617 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005618 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005619
Guido van Rossumd57fd912000-03-10 22:53:23 +00005620 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005621 if (unicode_resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005622 goto onError;
5623
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005624 Py_XDECREF(errorHandler);
5625 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005626 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005627
Benjamin Peterson29060642009-01-31 22:14:21 +00005628 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005629 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005630 Py_XDECREF(errorHandler);
5631 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005632 return NULL;
5633}
5634
Antoine Pitrouab868312009-01-10 15:40:25 +00005635#undef FAST_CHAR_MASK
5636#undef SWAPPED_FAST_CHAR_MASK
5637
Tim Peters772747b2001-08-09 22:21:55 +00005638PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005639_PyUnicode_EncodeUTF16(PyObject *str,
5640 const char *errors,
5641 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005642{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005643 int kind;
5644 void *data;
5645 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005646 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005647 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005648 Py_ssize_t nsize, bytesize;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005649 Py_ssize_t i, pairs;
Tim Peters772747b2001-08-09 22:21:55 +00005650 /* Offsets from p for storing byte pairs in the right order. */
5651#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5652 int ihi = 1, ilo = 0;
5653#else
5654 int ihi = 0, ilo = 1;
5655#endif
5656
Benjamin Peterson29060642009-01-31 22:14:21 +00005657#define STORECHAR(CH) \
5658 do { \
5659 p[ihi] = ((CH) >> 8) & 0xff; \
5660 p[ilo] = (CH) & 0xff; \
5661 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005662 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005663
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005664 if (!PyUnicode_Check(str)) {
5665 PyErr_BadArgument();
5666 return NULL;
5667 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005668 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005669 return NULL;
5670 kind = PyUnicode_KIND(str);
5671 data = PyUnicode_DATA(str);
5672 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005673
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005674 pairs = 0;
5675 if (kind == PyUnicode_4BYTE_KIND)
5676 for (i = 0; i < len; i++)
5677 if (PyUnicode_READ(kind, data, i) >= 0x10000)
5678 pairs++;
5679 /* 2 * (len + pairs + (byteorder == 0)) */
5680 if (len > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005681 return PyErr_NoMemory();
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005682 nsize = len + pairs + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005683 bytesize = nsize * 2;
5684 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005685 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005686 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005687 if (v == NULL)
5688 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005689
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005690 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005691 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005692 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005693 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005694 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005695
5696 if (byteorder == -1) {
5697 /* force LE */
5698 ihi = 1;
5699 ilo = 0;
5700 }
5701 else if (byteorder == 1) {
5702 /* force BE */
5703 ihi = 0;
5704 ilo = 1;
5705 }
5706
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005707 for (i = 0; i < len; i++) {
5708 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5709 Py_UCS4 ch2 = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +00005710 if (ch >= 0x10000) {
Victor Stinner551ac952011-11-29 22:58:13 +01005711 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
5712 ch = Py_UNICODE_HIGH_SURROGATE(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00005713 }
Tim Peters772747b2001-08-09 22:21:55 +00005714 STORECHAR(ch);
5715 if (ch2)
5716 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005717 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005718
5719 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005720 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005721#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005722}
5723
Alexander Belopolsky40018472011-02-26 01:02:56 +00005724PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005725PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5726 Py_ssize_t size,
5727 const char *errors,
5728 int byteorder)
5729{
5730 PyObject *result;
5731 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5732 if (tmp == NULL)
5733 return NULL;
5734 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5735 Py_DECREF(tmp);
5736 return result;
5737}
5738
5739PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005740PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005741{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005742 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005743}
5744
5745/* --- Unicode Escape Codec ----------------------------------------------- */
5746
/* Helper function for PyUnicode_DecodeUnicodeEscape: decides whether the
   escaped input s[0..size) still decodes to a pure ASCII string.

   Returns the number of characters needed to hold the decoded string,
   or -1 if
     - size is negative,
     - a byte above 127 occurs,
     - a backslash is the last byte of the input, or
     - an escape occurs that does not guarantee an ASCII result
       (octal digits, \x, \u, \U, \N).

   Backslash + newline contributes no character, the simple escapes
   (\\ \' \" \b \f \t \n \r \v \a) contribute one, and an unrecognised
   escape contributes two (the backslash and the following character).
 */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Reject a negative size before any pointer is derived from it. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5801
Fredrik Lundh06d12682001-01-24 07:59:11 +00005802static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005803
Alexander Belopolsky40018472011-02-26 01:02:56 +00005804PyObject *
5805PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005806 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005807 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005808{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005809 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005810 Py_ssize_t startinpos;
5811 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005812 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005813 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005814 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005815 char* message;
5816 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005817 PyObject *errorHandler = NULL;
5818 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005819 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005820 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005821
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005822 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005823
5824 /* After length_of_escaped_ascii_string() there are two alternatives,
5825 either the string is pure ASCII with named escapes like \n, etc.
5826 and we determined it's exact size (common case)
5827 or it contains \x, \u, ... escape sequences. then we create a
5828 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005829 if (len >= 0) {
5830 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005831 if (!v)
5832 goto onError;
5833 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005834 }
5835 else {
5836 /* Escaped strings will always be longer than the resulting
5837 Unicode string, so we start with size here and then reduce the
5838 length after conversion to the true value.
5839 (but if the error callback returns a long replacement string
5840 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005841 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005842 if (!v)
5843 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005844 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005845 }
5846
Guido van Rossumd57fd912000-03-10 22:53:23 +00005847 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005848 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005849 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005850 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005851
Guido van Rossumd57fd912000-03-10 22:53:23 +00005852 while (s < end) {
5853 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005854 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005855 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005856
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005857 /* The only case in which i == ascii_length is a backslash
5858 followed by a newline. */
5859 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005860
Guido van Rossumd57fd912000-03-10 22:53:23 +00005861 /* Non-escape characters are interpreted as Unicode ordinals */
5862 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005863 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5864 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005865 continue;
5866 }
5867
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005868 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869 /* \ - Escapes */
5870 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005871 c = *s++;
5872 if (s > end)
5873 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005874
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005875 /* The only case in which i == ascii_length is a backslash
5876 followed by a newline. */
5877 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005878
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005879 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005880
Benjamin Peterson29060642009-01-31 22:14:21 +00005881 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005882#define WRITECHAR(ch) \
5883 do { \
5884 if (unicode_putchar(&v, &i, ch) < 0) \
5885 goto onError; \
5886 }while(0)
5887
Guido van Rossumd57fd912000-03-10 22:53:23 +00005888 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005889 case '\\': WRITECHAR('\\'); break;
5890 case '\'': WRITECHAR('\''); break;
5891 case '\"': WRITECHAR('\"'); break;
5892 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005893 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005894 case 'f': WRITECHAR('\014'); break;
5895 case 't': WRITECHAR('\t'); break;
5896 case 'n': WRITECHAR('\n'); break;
5897 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005898 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005899 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005900 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005901 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005902
Benjamin Peterson29060642009-01-31 22:14:21 +00005903 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005904 case '0': case '1': case '2': case '3':
5905 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005906 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005907 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005908 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005909 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005910 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005911 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005912 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005913 break;
5914
Benjamin Peterson29060642009-01-31 22:14:21 +00005915 /* hex escapes */
5916 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005917 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005918 digits = 2;
5919 message = "truncated \\xXX escape";
5920 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005921
Benjamin Peterson29060642009-01-31 22:14:21 +00005922 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005923 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005924 digits = 4;
5925 message = "truncated \\uXXXX escape";
5926 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005927
Benjamin Peterson29060642009-01-31 22:14:21 +00005928 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005929 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005930 digits = 8;
5931 message = "truncated \\UXXXXXXXX escape";
5932 hexescape:
5933 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005934 if (s+digits>end) {
5935 endinpos = size;
5936 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005937 errors, &errorHandler,
5938 "unicodeescape", "end of string in escape sequence",
5939 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005940 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005941 goto onError;
5942 goto nextByte;
5943 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005944 for (j = 0; j < digits; ++j) {
5945 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005946 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005947 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005948 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005949 errors, &errorHandler,
5950 "unicodeescape", message,
5951 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005952 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005953 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005954 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005955 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005956 }
5957 chr = (chr<<4) & ~0xF;
5958 if (c >= '0' && c <= '9')
5959 chr += c - '0';
5960 else if (c >= 'a' && c <= 'f')
5961 chr += 10 + c - 'a';
5962 else
5963 chr += 10 + c - 'A';
5964 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005965 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005966 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005967 /* _decoding_error will have already written into the
5968 target buffer. */
5969 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005970 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005971 /* when we get here, chr is a 32-bit unicode character */
Victor Stinner8faf8212011-12-08 22:14:11 +01005972 if (chr <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005973 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005974 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005975 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005976 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005977 errors, &errorHandler,
5978 "unicodeescape", "illegal Unicode character",
5979 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005980 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005981 goto onError;
5982 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005983 break;
5984
Benjamin Peterson29060642009-01-31 22:14:21 +00005985 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005986 case 'N':
5987 message = "malformed \\N character escape";
5988 if (ucnhash_CAPI == NULL) {
5989 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005990 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5991 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005992 if (ucnhash_CAPI == NULL)
5993 goto ucnhashError;
5994 }
5995 if (*s == '{') {
5996 const char *start = s+1;
5997 /* look for the closing brace */
5998 while (*s != '}' && s < end)
5999 s++;
6000 if (s > start && s < end && *s == '}') {
6001 /* found a name. look it up in the unicode database */
6002 message = "unknown Unicode character name";
6003 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006004 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03006005 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00006006 goto store;
6007 }
6008 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006009 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006010 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006011 errors, &errorHandler,
6012 "unicodeescape", message,
6013 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006014 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00006015 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006016 break;
6017
6018 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00006019 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006020 message = "\\ at end of string";
6021 s--;
6022 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006023 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006024 errors, &errorHandler,
6025 "unicodeescape", message,
6026 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006027 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00006028 goto onError;
6029 }
6030 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006031 WRITECHAR('\\');
6032 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00006033 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006034 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006035 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006036 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006037 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006038 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006039#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006040
Victor Stinner16e6a802011-12-12 13:24:15 +01006041 if (unicode_resize(&v, i) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006042 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006043 Py_XDECREF(errorHandler);
6044 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006045 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00006046
Benjamin Peterson29060642009-01-31 22:14:21 +00006047 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00006048 PyErr_SetString(
6049 PyExc_UnicodeError,
6050 "\\N escapes not supported (can't load unicodedata module)"
6051 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006052 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006053 Py_XDECREF(errorHandler);
6054 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00006055 return NULL;
6056
Benjamin Peterson29060642009-01-31 22:14:21 +00006057 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006058 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006059 Py_XDECREF(errorHandler);
6060 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006061 return NULL;
6062}
6063
6064/* Return a Unicode-Escape string version of the Unicode object.
6065
6066 If quotes is true, the string is enclosed in u"" or u'' quotes as
6067 appropriate.
6068
6069*/
6070
Alexander Belopolsky40018472011-02-26 01:02:56 +00006071PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006072PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006073{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006074 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006075 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006076 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006077 int kind;
6078 void *data;
6079 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080
Thomas Wouters89f507f2006-12-13 04:49:30 +00006081 /* Initial allocation is based on the longest-possible unichr
6082 escape.
6083
6084 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
6085 unichr, so in this case it's the longest unichr escape. In
6086 narrow (UTF-16) builds this is five chars per source unichr
6087 since there are two unichrs in the surrogate pair, so in narrow
6088 (UTF-16) builds it's not the longest unichr escape.
6089
6090 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
6091 so in the narrow (UTF-16) build case it's the longest unichr
6092 escape.
6093 */
6094
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006095 if (!PyUnicode_Check(unicode)) {
6096 PyErr_BadArgument();
6097 return NULL;
6098 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006099 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006100 return NULL;
6101 len = PyUnicode_GET_LENGTH(unicode);
6102 kind = PyUnicode_KIND(unicode);
6103 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06006104 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006105 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
6106 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
6107 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
6108 }
6109
6110 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006111 return PyBytes_FromStringAndSize(NULL, 0);
6112
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006113 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006114 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006115
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006116 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00006117 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006118 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00006119 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006120 if (repr == NULL)
6121 return NULL;
6122
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006123 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006124
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006125 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006126 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006127
Walter Dörwald79e913e2007-05-12 11:08:06 +00006128 /* Escape backslashes */
6129 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006130 *p++ = '\\';
6131 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00006132 continue;
Tim Petersced69f82003-09-16 20:30:58 +00006133 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006134
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006135 /* Map 21-bit characters to '\U00xxxxxx' */
6136 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006137 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006138 *p++ = '\\';
6139 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006140 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
6141 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
6142 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6143 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6144 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6145 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6146 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6147 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00006148 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006149 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006150
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006152 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006153 *p++ = '\\';
6154 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006155 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6156 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6157 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6158 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006159 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006160
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006161 /* Map special whitespace to '\t', \n', '\r' */
6162 else if (ch == '\t') {
6163 *p++ = '\\';
6164 *p++ = 't';
6165 }
6166 else if (ch == '\n') {
6167 *p++ = '\\';
6168 *p++ = 'n';
6169 }
6170 else if (ch == '\r') {
6171 *p++ = '\\';
6172 *p++ = 'r';
6173 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006174
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006175 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006176 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006177 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006178 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006179 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6180 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006181 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006182
Guido van Rossumd57fd912000-03-10 22:53:23 +00006183 /* Copy everything else as-is */
6184 else
6185 *p++ = (char) ch;
6186 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006187
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006188 assert(p - PyBytes_AS_STRING(repr) > 0);
6189 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6190 return NULL;
6191 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006192}
6193
Alexander Belopolsky40018472011-02-26 01:02:56 +00006194PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006195PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6196 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006197{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006198 PyObject *result;
6199 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6200 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006201 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006202 result = PyUnicode_AsUnicodeEscapeString(tmp);
6203 Py_DECREF(tmp);
6204 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006205}
6206
6207/* --- Raw Unicode Escape Codec ------------------------------------------- */
6208
Alexander Belopolsky40018472011-02-26 01:02:56 +00006209PyObject *
6210PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006211 Py_ssize_t size,
6212 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006214 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006215 Py_ssize_t startinpos;
6216 Py_ssize_t endinpos;
6217 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006218 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006219 const char *end;
6220 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006221 PyObject *errorHandler = NULL;
6222 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006223
Guido van Rossumd57fd912000-03-10 22:53:23 +00006224 /* Escaped strings will always be longer than the resulting
6225 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006226 length after conversion to the true value. (But decoding error
6227 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006228 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006229 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006230 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006231 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006232 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006233 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006234 end = s + size;
6235 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006236 unsigned char c;
6237 Py_UCS4 x;
6238 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006239 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006240
Benjamin Peterson29060642009-01-31 22:14:21 +00006241 /* Non-escape characters are interpreted as Unicode ordinals */
6242 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006243 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6244 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006245 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006246 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006247 startinpos = s-starts;
6248
6249 /* \u-escapes are only interpreted iff the number of leading
6250 backslashes if odd */
6251 bs = s;
6252 for (;s < end;) {
6253 if (*s != '\\')
6254 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006255 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6256 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006257 }
6258 if (((s - bs) & 1) == 0 ||
6259 s >= end ||
6260 (*s != 'u' && *s != 'U')) {
6261 continue;
6262 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006263 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006264 count = *s=='u' ? 4 : 8;
6265 s++;
6266
6267 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006268 for (x = 0, i = 0; i < count; ++i, ++s) {
6269 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006270 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006271 endinpos = s-starts;
6272 if (unicode_decode_call_errorhandler(
6273 errors, &errorHandler,
6274 "rawunicodeescape", "truncated \\uXXXX",
6275 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006276 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006277 goto onError;
6278 goto nextByte;
6279 }
6280 x = (x<<4) & ~0xF;
6281 if (c >= '0' && c <= '9')
6282 x += c - '0';
6283 else if (c >= 'a' && c <= 'f')
6284 x += 10 + c - 'a';
6285 else
6286 x += 10 + c - 'A';
6287 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006288 if (x <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006289 if (unicode_putchar(&v, &outpos, x) < 0)
6290 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006291 } else {
6292 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006293 if (unicode_decode_call_errorhandler(
6294 errors, &errorHandler,
6295 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006296 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006297 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006298 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006299 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006300 nextByte:
6301 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006302 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006303 if (unicode_resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006304 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006305 Py_XDECREF(errorHandler);
6306 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006307 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00006308
Benjamin Peterson29060642009-01-31 22:14:21 +00006309 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006310 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006311 Py_XDECREF(errorHandler);
6312 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006313 return NULL;
6314}
6315
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006316
Alexander Belopolsky40018472011-02-26 01:02:56 +00006317PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006318PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006319{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006320 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006321 char *p;
6322 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006323 Py_ssize_t expandsize, pos;
6324 int kind;
6325 void *data;
6326 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006327
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006328 if (!PyUnicode_Check(unicode)) {
6329 PyErr_BadArgument();
6330 return NULL;
6331 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006332 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006333 return NULL;
6334 kind = PyUnicode_KIND(unicode);
6335 data = PyUnicode_DATA(unicode);
6336 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006337 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6338 bytes, and 1 byte characters 4. */
6339 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006340
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006341 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006342 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006343
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006344 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006345 if (repr == NULL)
6346 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006347 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006348 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006349
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006350 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006351 for (pos = 0; pos < len; pos++) {
6352 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006353 /* Map 32-bit characters to '\Uxxxxxxxx' */
6354 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006355 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006356 *p++ = '\\';
6357 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006358 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6359 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6360 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6361 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6362 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6363 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6364 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6365 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006366 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006367 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006368 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006369 *p++ = '\\';
6370 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006371 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6372 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6373 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6374 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006375 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006376 /* Copy everything else as-is */
6377 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006378 *p++ = (char) ch;
6379 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006380
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006381 assert(p > q);
6382 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006383 return NULL;
6384 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006385}
6386
Alexander Belopolsky40018472011-02-26 01:02:56 +00006387PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006388PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6389 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006390{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006391 PyObject *result;
6392 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6393 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006394 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006395 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6396 Py_DECREF(tmp);
6397 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006398}
6399
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006400/* --- Unicode Internal Codec ------------------------------------------- */
6401
Alexander Belopolsky40018472011-02-26 01:02:56 +00006402PyObject *
6403_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006404 Py_ssize_t size,
6405 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006406{
6407 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006408 Py_ssize_t startinpos;
6409 Py_ssize_t endinpos;
6410 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006411 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006412 const char *end;
6413 const char *reason;
6414 PyObject *errorHandler = NULL;
6415 PyObject *exc = NULL;
6416
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006417 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006418 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006419 1))
6420 return NULL;
6421
Thomas Wouters89f507f2006-12-13 04:49:30 +00006422 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006423 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006424 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006425 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006426 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006427 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006428 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006429 end = s + size;
6430
6431 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006432 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006433 Py_UCS4 ch;
6434 /* We copy the raw representation one byte at a time because the
6435 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006436 ((char *) &uch)[0] = s[0];
6437 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006438#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006439 ((char *) &uch)[2] = s[2];
6440 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006441#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006442 ch = uch;
6443
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006444 /* We have to sanity check the raw data, otherwise doom looms for
6445 some malformed UCS-4 data. */
6446 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006447#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006448 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006449#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006450 end-s < Py_UNICODE_SIZE
6451 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006452 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006453 startinpos = s - starts;
6454 if (end-s < Py_UNICODE_SIZE) {
6455 endinpos = end-starts;
6456 reason = "truncated input";
6457 }
6458 else {
6459 endinpos = s - starts + Py_UNICODE_SIZE;
6460 reason = "illegal code point (> 0x10FFFF)";
6461 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006462 if (unicode_decode_call_errorhandler(
6463 errors, &errorHandler,
6464 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006465 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006466 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006467 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006468 continue;
6469 }
6470
6471 s += Py_UNICODE_SIZE;
6472#ifndef Py_UNICODE_WIDE
Victor Stinner551ac952011-11-29 22:58:13 +01006473 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006474 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006475 Py_UNICODE uch2;
6476 ((char *) &uch2)[0] = s[0];
6477 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006478 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006479 {
Victor Stinner551ac952011-11-29 22:58:13 +01006480 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006481 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006482 }
6483 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006484#endif
6485
6486 if (unicode_putchar(&v, &outpos, ch) < 0)
6487 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006488 }
6489
Victor Stinner16e6a802011-12-12 13:24:15 +01006490 if (unicode_resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006491 goto onError;
6492 Py_XDECREF(errorHandler);
6493 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006494 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006495
Benjamin Peterson29060642009-01-31 22:14:21 +00006496 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006497 Py_XDECREF(v);
6498 Py_XDECREF(errorHandler);
6499 Py_XDECREF(exc);
6500 return NULL;
6501}
6502
Guido van Rossumd57fd912000-03-10 22:53:23 +00006503/* --- Latin-1 Codec ------------------------------------------------------ */
6504
Alexander Belopolsky40018472011-02-26 01:02:56 +00006505PyObject *
6506PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006507 Py_ssize_t size,
6508 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006509{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006510 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006511 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006512}
6513
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006514/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006515static void
6516make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006517 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006518 PyObject *unicode,
6519 Py_ssize_t startpos, Py_ssize_t endpos,
6520 const char *reason)
6521{
6522 if (*exceptionObject == NULL) {
6523 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006524 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006525 encoding, unicode, startpos, endpos, reason);
6526 }
6527 else {
6528 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6529 goto onError;
6530 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6531 goto onError;
6532 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6533 goto onError;
6534 return;
6535 onError:
6536 Py_DECREF(*exceptionObject);
6537 *exceptionObject = NULL;
6538 }
6539}
6540
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006541/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006542static void
6543raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006544 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006545 PyObject *unicode,
6546 Py_ssize_t startpos, Py_ssize_t endpos,
6547 const char *reason)
6548{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006549 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006550 encoding, unicode, startpos, endpos, reason);
6551 if (*exceptionObject != NULL)
6552 PyCodec_StrictErrors(*exceptionObject);
6553}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006554
6555/* error handling callback helper:
6556 build arguments, call the callback and check the arguments,
6557 put the result into newpos and return the replacement string, which
6558 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006559static PyObject *
6560unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006561 PyObject **errorHandler,
6562 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006563 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006564 Py_ssize_t startpos, Py_ssize_t endpos,
6565 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006566{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006567 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006568 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006569 PyObject *restuple;
6570 PyObject *resunicode;
6571
6572 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006573 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006574 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006575 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006576 }
6577
Benjamin Petersonbac79492012-01-14 13:34:47 -05006578 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006579 return NULL;
6580 len = PyUnicode_GET_LENGTH(unicode);
6581
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006582 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006583 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006584 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006585 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006586
6587 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006588 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006589 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006590 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006591 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006592 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006593 Py_DECREF(restuple);
6594 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006595 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006596 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006597 &resunicode, newpos)) {
6598 Py_DECREF(restuple);
6599 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006600 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006601 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6602 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6603 Py_DECREF(restuple);
6604 return NULL;
6605 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006606 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006607 *newpos = len + *newpos;
6608 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006609 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6610 Py_DECREF(restuple);
6611 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006612 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006613 Py_INCREF(resunicode);
6614 Py_DECREF(restuple);
6615 return resunicode;
6616}
6617
Alexander Belopolsky40018472011-02-26 01:02:56 +00006618static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006619unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006620 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006621 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006622{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006623 /* input state */
6624 Py_ssize_t pos=0, size;
6625 int kind;
6626 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006627 /* output object */
6628 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006629 /* pointer into the output */
6630 char *str;
6631 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006632 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006633 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6634 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006635 PyObject *errorHandler = NULL;
6636 PyObject *exc = NULL;
6637 /* the following variable is used for caching string comparisons
6638 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6639 int known_errorHandler = -1;
6640
Benjamin Petersonbac79492012-01-14 13:34:47 -05006641 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006642 return NULL;
6643 size = PyUnicode_GET_LENGTH(unicode);
6644 kind = PyUnicode_KIND(unicode);
6645 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006646 /* allocate enough for a simple encoding without
6647 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006648 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006649 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006650 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006651 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006652 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006653 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006654 ressize = size;
6655
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006656 while (pos < size) {
6657 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006658
Benjamin Peterson29060642009-01-31 22:14:21 +00006659 /* can we encode this? */
6660 if (c<limit) {
6661 /* no overflow check, because we know that the space is enough */
6662 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006663 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006664 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006665 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006666 Py_ssize_t requiredsize;
6667 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006668 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006669 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006670 Py_ssize_t collstart = pos;
6671 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006672 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006673 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006674 ++collend;
6675 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6676 if (known_errorHandler==-1) {
6677 if ((errors==NULL) || (!strcmp(errors, "strict")))
6678 known_errorHandler = 1;
6679 else if (!strcmp(errors, "replace"))
6680 known_errorHandler = 2;
6681 else if (!strcmp(errors, "ignore"))
6682 known_errorHandler = 3;
6683 else if (!strcmp(errors, "xmlcharrefreplace"))
6684 known_errorHandler = 4;
6685 else
6686 known_errorHandler = 0;
6687 }
6688 switch (known_errorHandler) {
6689 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006690 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006691 goto onError;
6692 case 2: /* replace */
6693 while (collstart++<collend)
6694 *str++ = '?'; /* fall through */
6695 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006696 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006697 break;
6698 case 4: /* xmlcharrefreplace */
6699 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006700 /* determine replacement size */
6701 for (i = collstart, repsize = 0; i < collend; ++i) {
6702 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6703 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006704 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006705 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006706 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006707 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006708 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006709 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006710 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006711 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006712 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006713 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006714 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006715 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006716 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006717 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006718 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006719 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006720 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006721 if (requiredsize > ressize) {
6722 if (requiredsize<2*ressize)
6723 requiredsize = 2*ressize;
6724 if (_PyBytes_Resize(&res, requiredsize))
6725 goto onError;
6726 str = PyBytes_AS_STRING(res) + respos;
6727 ressize = requiredsize;
6728 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006729 /* generate replacement */
6730 for (i = collstart; i < collend; ++i) {
6731 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006732 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006733 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006734 break;
6735 default:
6736 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006737 encoding, reason, unicode, &exc,
6738 collstart, collend, &newpos);
6739 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006740 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006741 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006742 if (PyBytes_Check(repunicode)) {
6743 /* Directly copy bytes result to output. */
6744 repsize = PyBytes_Size(repunicode);
6745 if (repsize > 1) {
6746 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006747 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006748 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6749 Py_DECREF(repunicode);
6750 goto onError;
6751 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006752 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006753 ressize += repsize-1;
6754 }
6755 memcpy(str, PyBytes_AsString(repunicode), repsize);
6756 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006757 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006758 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006759 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006760 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006761 /* need more space? (at least enough for what we
6762 have+the replacement+the rest of the string, so
6763 we won't have to check space for encodable characters) */
6764 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006765 repsize = PyUnicode_GET_LENGTH(repunicode);
6766 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006767 if (requiredsize > ressize) {
6768 if (requiredsize<2*ressize)
6769 requiredsize = 2*ressize;
6770 if (_PyBytes_Resize(&res, requiredsize)) {
6771 Py_DECREF(repunicode);
6772 goto onError;
6773 }
6774 str = PyBytes_AS_STRING(res) + respos;
6775 ressize = requiredsize;
6776 }
6777 /* check if there is anything unencodable in the replacement
6778 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006779 for (i = 0; repsize-->0; ++i, ++str) {
6780 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006781 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006782 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006783 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006784 Py_DECREF(repunicode);
6785 goto onError;
6786 }
6787 *str = (char)c;
6788 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006789 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006790 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006791 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006792 }
6793 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006794 /* Resize if we allocated to much */
6795 size = str - PyBytes_AS_STRING(res);
6796 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006797 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006798 if (_PyBytes_Resize(&res, size) < 0)
6799 goto onError;
6800 }
6801
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006802 Py_XDECREF(errorHandler);
6803 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006804 return res;
6805
6806 onError:
6807 Py_XDECREF(res);
6808 Py_XDECREF(errorHandler);
6809 Py_XDECREF(exc);
6810 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006811}
6812
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006813/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006814PyObject *
6815PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006816 Py_ssize_t size,
6817 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006818{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006819 PyObject *result;
6820 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6821 if (unicode == NULL)
6822 return NULL;
6823 result = unicode_encode_ucs1(unicode, errors, 256);
6824 Py_DECREF(unicode);
6825 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006826}
6827
Alexander Belopolsky40018472011-02-26 01:02:56 +00006828PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006829_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006830{
6831 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006832 PyErr_BadArgument();
6833 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006834 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006835 if (PyUnicode_READY(unicode) == -1)
6836 return NULL;
6837 /* Fast path: if it is a one-byte string, construct
6838 bytes object directly. */
6839 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6840 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6841 PyUnicode_GET_LENGTH(unicode));
6842 /* Non-Latin-1 characters present. Defer to above function to
6843 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006844 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006845}
6846
6847PyObject*
6848PyUnicode_AsLatin1String(PyObject *unicode)
6849{
6850 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006851}
6852
6853/* --- 7-bit ASCII Codec -------------------------------------------------- */
6854
Alexander Belopolsky40018472011-02-26 01:02:56 +00006855PyObject *
6856PyUnicode_DecodeASCII(const char *s,
6857 Py_ssize_t size,
6858 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006859{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006860 const char *starts = s;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006861 PyObject *v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006862 int kind;
6863 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006864 Py_ssize_t startinpos;
6865 Py_ssize_t endinpos;
6866 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006867 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006868 int has_error;
6869 const unsigned char *p = (const unsigned char *)s;
6870 const unsigned char *end = p + size;
6871 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006872 PyObject *errorHandler = NULL;
6873 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006874
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006875 if (size == 0) {
6876 Py_INCREF(unicode_empty);
6877 return unicode_empty;
6878 }
6879
Guido van Rossumd57fd912000-03-10 22:53:23 +00006880 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006881 if (size == 1 && (unsigned char)s[0] < 128)
6882 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006883
Victor Stinner702c7342011-10-05 13:50:52 +02006884 has_error = 0;
6885 while (p < end && !has_error) {
6886 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6887 an explanation. */
6888 if (!((size_t) p & LONG_PTR_MASK)) {
6889 /* Help register allocation */
6890 register const unsigned char *_p = p;
6891 while (_p < aligned_end) {
6892 unsigned long value = *(unsigned long *) _p;
6893 if (value & ASCII_CHAR_MASK) {
6894 has_error = 1;
6895 break;
6896 }
6897 _p += SIZEOF_LONG;
6898 }
6899 if (_p == end)
6900 break;
6901 if (has_error)
6902 break;
6903 p = _p;
6904 }
6905 if (*p & 0x80) {
6906 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006907 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006908 }
6909 else {
6910 ++p;
6911 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006912 }
Victor Stinner702c7342011-10-05 13:50:52 +02006913 if (!has_error)
6914 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006915
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006916 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006917 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006918 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006919 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006920 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006921 kind = PyUnicode_KIND(v);
6922 data = PyUnicode_DATA(v);
6923 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006924 e = s + size;
6925 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006926 register unsigned char c = (unsigned char)*s;
6927 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006928 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006929 ++s;
6930 }
6931 else {
6932 startinpos = s-starts;
6933 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006934 if (unicode_decode_call_errorhandler(
6935 errors, &errorHandler,
6936 "ascii", "ordinal not in range(128)",
6937 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006938 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006939 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006940 kind = PyUnicode_KIND(v);
6941 data = PyUnicode_DATA(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006942 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006943 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006944 if (unicode_resize(&v, outpos) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006945 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006946 Py_XDECREF(errorHandler);
6947 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006948 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01006949 return v;
Tim Petersced69f82003-09-16 20:30:58 +00006950
Benjamin Peterson29060642009-01-31 22:14:21 +00006951 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006952 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006953 Py_XDECREF(errorHandler);
6954 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006955 return NULL;
6956}
6957
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006958/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006959PyObject *
6960PyUnicode_EncodeASCII(const Py_UNICODE *p,
6961 Py_ssize_t size,
6962 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006963{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006964 PyObject *result;
6965 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6966 if (unicode == NULL)
6967 return NULL;
6968 result = unicode_encode_ucs1(unicode, errors, 128);
6969 Py_DECREF(unicode);
6970 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006971}
6972
Alexander Belopolsky40018472011-02-26 01:02:56 +00006973PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006974_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006975{
6976 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006977 PyErr_BadArgument();
6978 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006979 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006980 if (PyUnicode_READY(unicode) == -1)
6981 return NULL;
6982 /* Fast path: if it is an ASCII-only string, construct bytes object
6983 directly. Else defer to above function to raise the exception. */
6984 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6985 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6986 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006987 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006988}
6989
6990PyObject *
6991PyUnicode_AsASCIIString(PyObject *unicode)
6992{
6993 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006994}
6995
Victor Stinner99b95382011-07-04 14:23:54 +02006996#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006997
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006998/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006999
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007000#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007001#define NEED_RETRY
7002#endif
7003
Victor Stinner3a50e702011-10-18 21:21:00 +02007004#ifndef WC_ERR_INVALID_CHARS
7005# define WC_ERR_INVALID_CHARS 0x0080
7006#endif
7007
7008static char*
7009code_page_name(UINT code_page, PyObject **obj)
7010{
7011 *obj = NULL;
7012 if (code_page == CP_ACP)
7013 return "mbcs";
7014 if (code_page == CP_UTF7)
7015 return "CP_UTF7";
7016 if (code_page == CP_UTF8)
7017 return "CP_UTF8";
7018
7019 *obj = PyBytes_FromFormat("cp%u", code_page);
7020 if (*obj == NULL)
7021 return NULL;
7022 return PyBytes_AS_STRING(*obj);
7023}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007024
Alexander Belopolsky40018472011-02-26 01:02:56 +00007025static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007026is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007027{
7028 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02007029 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007030
Victor Stinner3a50e702011-10-18 21:21:00 +02007031 if (!IsDBCSLeadByteEx(code_page, *curr))
7032 return 0;
7033
7034 prev = CharPrevExA(code_page, s, curr, 0);
7035 if (prev == curr)
7036 return 1;
7037 /* FIXME: This code is limited to "true" double-byte encodings,
7038 as it assumes an incomplete character consists of a single
7039 byte. */
7040 if (curr - prev == 2)
7041 return 1;
7042 if (!IsDBCSLeadByteEx(code_page, *prev))
7043 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007044 return 0;
7045}
7046
Victor Stinner3a50e702011-10-18 21:21:00 +02007047static DWORD
7048decode_code_page_flags(UINT code_page)
7049{
7050 if (code_page == CP_UTF7) {
7051 /* The CP_UTF7 decoder only supports flags=0 */
7052 return 0;
7053 }
7054 else
7055 return MB_ERR_INVALID_CHARS;
7056}
7057
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007058/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007059 * Decode a byte string from a Windows code page into unicode object in strict
7060 * mode.
7061 *
7062 * Returns consumed size if succeed, returns -2 on decode error, or raise a
7063 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007064 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007065static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007066decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007067 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007068 const char *in,
7069 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007070{
Victor Stinner3a50e702011-10-18 21:21:00 +02007071 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007072 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007073 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007074
7075 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007076 assert(insize > 0);
7077 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7078 if (outsize <= 0)
7079 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007080
7081 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007082 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007083 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007084 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007085 if (*v == NULL)
7086 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007087 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007088 }
7089 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007090 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007091 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007092 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007093 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007094 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007095 }
7096
7097 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007098 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7099 if (outsize <= 0)
7100 goto error;
7101 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007102
Victor Stinner3a50e702011-10-18 21:21:00 +02007103error:
7104 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7105 return -2;
7106 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007107 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007108}
7109
Victor Stinner3a50e702011-10-18 21:21:00 +02007110/*
7111 * Decode a byte string from a code page into unicode object with an error
7112 * handler.
7113 *
7114 * Returns consumed size if succeed, or raise a WindowsError or
7115 * UnicodeDecodeError exception and returns -1 on error.
7116 */
7117static int
7118decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007119 PyObject **v,
7120 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007121 const char *errors)
7122{
7123 const char *startin = in;
7124 const char *endin = in + size;
7125 const DWORD flags = decode_code_page_flags(code_page);
7126 /* Ideally, we should get reason from FormatMessage. This is the Windows
7127 2000 English version of the message. */
7128 const char *reason = "No mapping for the Unicode character exists "
7129 "in the target code page.";
7130 /* each step cannot decode more than 1 character, but a character can be
7131 represented as a surrogate pair */
7132 wchar_t buffer[2], *startout, *out;
7133 int insize, outsize;
7134 PyObject *errorHandler = NULL;
7135 PyObject *exc = NULL;
7136 PyObject *encoding_obj = NULL;
7137 char *encoding;
7138 DWORD err;
7139 int ret = -1;
7140
7141 assert(size > 0);
7142
7143 encoding = code_page_name(code_page, &encoding_obj);
7144 if (encoding == NULL)
7145 return -1;
7146
7147 if (errors == NULL || strcmp(errors, "strict") == 0) {
7148 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7149 UnicodeDecodeError. */
7150 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7151 if (exc != NULL) {
7152 PyCodec_StrictErrors(exc);
7153 Py_CLEAR(exc);
7154 }
7155 goto error;
7156 }
7157
7158 if (*v == NULL) {
7159 /* Create unicode object */
7160 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7161 PyErr_NoMemory();
7162 goto error;
7163 }
Victor Stinnerab595942011-12-17 04:59:06 +01007164 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007165 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007166 if (*v == NULL)
7167 goto error;
7168 startout = PyUnicode_AS_UNICODE(*v);
7169 }
7170 else {
7171 /* Extend unicode object */
7172 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7173 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7174 PyErr_NoMemory();
7175 goto error;
7176 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007177 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007178 goto error;
7179 startout = PyUnicode_AS_UNICODE(*v) + n;
7180 }
7181
7182 /* Decode the byte string character per character */
7183 out = startout;
7184 while (in < endin)
7185 {
7186 /* Decode a character */
7187 insize = 1;
7188 do
7189 {
7190 outsize = MultiByteToWideChar(code_page, flags,
7191 in, insize,
7192 buffer, Py_ARRAY_LENGTH(buffer));
7193 if (outsize > 0)
7194 break;
7195 err = GetLastError();
7196 if (err != ERROR_NO_UNICODE_TRANSLATION
7197 && err != ERROR_INSUFFICIENT_BUFFER)
7198 {
7199 PyErr_SetFromWindowsErr(0);
7200 goto error;
7201 }
7202 insize++;
7203 }
7204 /* 4=maximum length of a UTF-8 sequence */
7205 while (insize <= 4 && (in + insize) <= endin);
7206
7207 if (outsize <= 0) {
7208 Py_ssize_t startinpos, endinpos, outpos;
7209
7210 startinpos = in - startin;
7211 endinpos = startinpos + 1;
7212 outpos = out - PyUnicode_AS_UNICODE(*v);
7213 if (unicode_decode_call_errorhandler(
7214 errors, &errorHandler,
7215 encoding, reason,
7216 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007217 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007218 {
7219 goto error;
7220 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007221 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007222 }
7223 else {
7224 in += insize;
7225 memcpy(out, buffer, outsize * sizeof(wchar_t));
7226 out += outsize;
7227 }
7228 }
7229
7230 /* write a NUL character at the end */
7231 *out = 0;
7232
7233 /* Extend unicode object */
7234 outsize = out - startout;
7235 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007236 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007237 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01007238 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007239
7240error:
7241 Py_XDECREF(encoding_obj);
7242 Py_XDECREF(errorHandler);
7243 Py_XDECREF(exc);
7244 return ret;
7245}
7246
Victor Stinner3a50e702011-10-18 21:21:00 +02007247static PyObject *
7248decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007249 const char *s, Py_ssize_t size,
7250 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007251{
Victor Stinner76a31a62011-11-04 00:05:13 +01007252 PyObject *v = NULL;
7253 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007254
Victor Stinner3a50e702011-10-18 21:21:00 +02007255 if (code_page < 0) {
7256 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7257 return NULL;
7258 }
7259
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007260 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007261 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007262
Victor Stinner76a31a62011-11-04 00:05:13 +01007263 do
7264 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007265#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007266 if (size > INT_MAX) {
7267 chunk_size = INT_MAX;
7268 final = 0;
7269 done = 0;
7270 }
7271 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007272#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007273 {
7274 chunk_size = (int)size;
7275 final = (consumed == NULL);
7276 done = 1;
7277 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007278
Victor Stinner76a31a62011-11-04 00:05:13 +01007279 /* Skip trailing lead-byte unless 'final' is set */
7280 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7281 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007282
Victor Stinner76a31a62011-11-04 00:05:13 +01007283 if (chunk_size == 0 && done) {
7284 if (v != NULL)
7285 break;
7286 Py_INCREF(unicode_empty);
7287 return unicode_empty;
7288 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007289
Victor Stinner76a31a62011-11-04 00:05:13 +01007290
7291 converted = decode_code_page_strict(code_page, &v,
7292 s, chunk_size);
7293 if (converted == -2)
7294 converted = decode_code_page_errors(code_page, &v,
7295 s, chunk_size,
7296 errors);
7297 assert(converted != 0);
7298
7299 if (converted < 0) {
7300 Py_XDECREF(v);
7301 return NULL;
7302 }
7303
7304 if (consumed)
7305 *consumed += converted;
7306
7307 s += converted;
7308 size -= converted;
7309 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007310
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007311 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007312}
7313
Alexander Belopolsky40018472011-02-26 01:02:56 +00007314PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007315PyUnicode_DecodeCodePageStateful(int code_page,
7316 const char *s,
7317 Py_ssize_t size,
7318 const char *errors,
7319 Py_ssize_t *consumed)
7320{
7321 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7322}
7323
7324PyObject *
7325PyUnicode_DecodeMBCSStateful(const char *s,
7326 Py_ssize_t size,
7327 const char *errors,
7328 Py_ssize_t *consumed)
7329{
7330 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7331}
7332
7333PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007334PyUnicode_DecodeMBCS(const char *s,
7335 Py_ssize_t size,
7336 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007337{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007338 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7339}
7340
Victor Stinner3a50e702011-10-18 21:21:00 +02007341static DWORD
7342encode_code_page_flags(UINT code_page, const char *errors)
7343{
7344 if (code_page == CP_UTF8) {
7345 if (winver.dwMajorVersion >= 6)
7346 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7347 and later */
7348 return WC_ERR_INVALID_CHARS;
7349 else
7350 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7351 return 0;
7352 }
7353 else if (code_page == CP_UTF7) {
7354 /* CP_UTF7 only supports flags=0 */
7355 return 0;
7356 }
7357 else {
7358 if (errors != NULL && strcmp(errors, "replace") == 0)
7359 return 0;
7360 else
7361 return WC_NO_BEST_FIT_CHARS;
7362 }
7363}
7364
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007365/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007366 * Encode a Unicode string to a Windows code page into a byte string in strict
7367 * mode.
7368 *
7369 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7370 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007371 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007372static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007373encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007374 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007375 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007376{
Victor Stinner554f3f02010-06-16 23:33:54 +00007377 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007378 BOOL *pusedDefaultChar = &usedDefaultChar;
7379 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007380 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007381 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007382 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007383 const DWORD flags = encode_code_page_flags(code_page, NULL);
7384 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007385 /* Create a substring so that we can get the UTF-16 representation
7386 of just the slice under consideration. */
7387 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007388
Martin v. Löwis3d325192011-11-04 18:23:06 +01007389 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007390
Victor Stinner3a50e702011-10-18 21:21:00 +02007391 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007392 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007393 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007394 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007395
Victor Stinner2fc507f2011-11-04 20:06:39 +01007396 substring = PyUnicode_Substring(unicode, offset, offset+len);
7397 if (substring == NULL)
7398 return -1;
7399 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7400 if (p == NULL) {
7401 Py_DECREF(substring);
7402 return -1;
7403 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007404
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007405 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007406 outsize = WideCharToMultiByte(code_page, flags,
7407 p, size,
7408 NULL, 0,
7409 NULL, pusedDefaultChar);
7410 if (outsize <= 0)
7411 goto error;
7412 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007413 if (pusedDefaultChar && *pusedDefaultChar) {
7414 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007415 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007416 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007417
Victor Stinner3a50e702011-10-18 21:21:00 +02007418 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007419 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007420 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007421 if (*outbytes == NULL) {
7422 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007423 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007424 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007425 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007426 }
7427 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007428 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007429 const Py_ssize_t n = PyBytes_Size(*outbytes);
7430 if (outsize > PY_SSIZE_T_MAX - n) {
7431 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007432 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007433 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007434 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007435 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7436 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007437 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007438 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007439 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007440 }
7441
7442 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007443 outsize = WideCharToMultiByte(code_page, flags,
7444 p, size,
7445 out, outsize,
7446 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007447 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007448 if (outsize <= 0)
7449 goto error;
7450 if (pusedDefaultChar && *pusedDefaultChar)
7451 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007452 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007453
Victor Stinner3a50e702011-10-18 21:21:00 +02007454error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007455 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007456 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7457 return -2;
7458 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007459 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007460}
7461
Victor Stinner3a50e702011-10-18 21:21:00 +02007462/*
7463 * Encode a Unicode string to a Windows code page into a byte string using a
7464 * error handler.
7465 *
7466 * Returns consumed characters if succeed, or raise a WindowsError and returns
7467 * -1 on other error.
7468 */
7469static int
7470encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007471 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007472 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007473{
Victor Stinner3a50e702011-10-18 21:21:00 +02007474 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007475 Py_ssize_t pos = unicode_offset;
7476 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007477 /* Ideally, we should get reason from FormatMessage. This is the Windows
7478 2000 English version of the message. */
7479 const char *reason = "invalid character";
7480 /* 4=maximum length of a UTF-8 sequence */
7481 char buffer[4];
7482 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7483 Py_ssize_t outsize;
7484 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007485 PyObject *errorHandler = NULL;
7486 PyObject *exc = NULL;
7487 PyObject *encoding_obj = NULL;
7488 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007489 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007490 PyObject *rep;
7491 int ret = -1;
7492
7493 assert(insize > 0);
7494
7495 encoding = code_page_name(code_page, &encoding_obj);
7496 if (encoding == NULL)
7497 return -1;
7498
7499 if (errors == NULL || strcmp(errors, "strict") == 0) {
7500 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7501 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007502 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007503 if (exc != NULL) {
7504 PyCodec_StrictErrors(exc);
7505 Py_DECREF(exc);
7506 }
7507 Py_XDECREF(encoding_obj);
7508 return -1;
7509 }
7510
7511 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7512 pusedDefaultChar = &usedDefaultChar;
7513 else
7514 pusedDefaultChar = NULL;
7515
7516 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7517 PyErr_NoMemory();
7518 goto error;
7519 }
7520 outsize = insize * Py_ARRAY_LENGTH(buffer);
7521
7522 if (*outbytes == NULL) {
7523 /* Create string object */
7524 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7525 if (*outbytes == NULL)
7526 goto error;
7527 out = PyBytes_AS_STRING(*outbytes);
7528 }
7529 else {
7530 /* Extend string object */
7531 Py_ssize_t n = PyBytes_Size(*outbytes);
7532 if (n > PY_SSIZE_T_MAX - outsize) {
7533 PyErr_NoMemory();
7534 goto error;
7535 }
7536 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7537 goto error;
7538 out = PyBytes_AS_STRING(*outbytes) + n;
7539 }
7540
7541 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007542 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007543 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007544 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7545 wchar_t chars[2];
7546 int charsize;
7547 if (ch < 0x10000) {
7548 chars[0] = (wchar_t)ch;
7549 charsize = 1;
7550 }
7551 else {
7552 ch -= 0x10000;
7553 chars[0] = 0xd800 + (ch >> 10);
7554 chars[1] = 0xdc00 + (ch & 0x3ff);
7555 charsize = 2;
7556 }
7557
Victor Stinner3a50e702011-10-18 21:21:00 +02007558 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007559 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007560 buffer, Py_ARRAY_LENGTH(buffer),
7561 NULL, pusedDefaultChar);
7562 if (outsize > 0) {
7563 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7564 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007565 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007566 memcpy(out, buffer, outsize);
7567 out += outsize;
7568 continue;
7569 }
7570 }
7571 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7572 PyErr_SetFromWindowsErr(0);
7573 goto error;
7574 }
7575
Victor Stinner3a50e702011-10-18 21:21:00 +02007576 rep = unicode_encode_call_errorhandler(
7577 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007578 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007579 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007580 if (rep == NULL)
7581 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007582 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007583
7584 if (PyBytes_Check(rep)) {
7585 outsize = PyBytes_GET_SIZE(rep);
7586 if (outsize != 1) {
7587 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7588 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7589 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7590 Py_DECREF(rep);
7591 goto error;
7592 }
7593 out = PyBytes_AS_STRING(*outbytes) + offset;
7594 }
7595 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7596 out += outsize;
7597 }
7598 else {
7599 Py_ssize_t i;
7600 enum PyUnicode_Kind kind;
7601 void *data;
7602
Benjamin Petersonbac79492012-01-14 13:34:47 -05007603 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007604 Py_DECREF(rep);
7605 goto error;
7606 }
7607
7608 outsize = PyUnicode_GET_LENGTH(rep);
7609 if (outsize != 1) {
7610 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7611 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7612 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7613 Py_DECREF(rep);
7614 goto error;
7615 }
7616 out = PyBytes_AS_STRING(*outbytes) + offset;
7617 }
7618 kind = PyUnicode_KIND(rep);
7619 data = PyUnicode_DATA(rep);
7620 for (i=0; i < outsize; i++) {
7621 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7622 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007623 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007624 encoding, unicode,
7625 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007626 "unable to encode error handler result to ASCII");
7627 Py_DECREF(rep);
7628 goto error;
7629 }
7630 *out = (unsigned char)ch;
7631 out++;
7632 }
7633 }
7634 Py_DECREF(rep);
7635 }
7636 /* write a NUL byte */
7637 *out = 0;
7638 outsize = out - PyBytes_AS_STRING(*outbytes);
7639 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7640 if (_PyBytes_Resize(outbytes, outsize) < 0)
7641 goto error;
7642 ret = 0;
7643
7644error:
7645 Py_XDECREF(encoding_obj);
7646 Py_XDECREF(errorHandler);
7647 Py_XDECREF(exc);
7648 return ret;
7649}
7650
Victor Stinner3a50e702011-10-18 21:21:00 +02007651static PyObject *
7652encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007653 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007654 const char *errors)
7655{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007656 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007657 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007658 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007659 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007660
Benjamin Petersonbac79492012-01-14 13:34:47 -05007661 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007662 return NULL;
7663 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007664
Victor Stinner3a50e702011-10-18 21:21:00 +02007665 if (code_page < 0) {
7666 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7667 return NULL;
7668 }
7669
Martin v. Löwis3d325192011-11-04 18:23:06 +01007670 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007671 return PyBytes_FromStringAndSize(NULL, 0);
7672
Victor Stinner7581cef2011-11-03 22:32:33 +01007673 offset = 0;
7674 do
7675 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007676#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007677 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007678 chunks. */
7679 if (len > INT_MAX/2) {
7680 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007681 done = 0;
7682 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007683 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007684#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007685 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007686 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007687 done = 1;
7688 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007689
Victor Stinner76a31a62011-11-04 00:05:13 +01007690 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007691 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007692 errors);
7693 if (ret == -2)
7694 ret = encode_code_page_errors(code_page, &outbytes,
7695 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007696 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007697 if (ret < 0) {
7698 Py_XDECREF(outbytes);
7699 return NULL;
7700 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007701
Victor Stinner7581cef2011-11-03 22:32:33 +01007702 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007703 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007704 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007705
Victor Stinner3a50e702011-10-18 21:21:00 +02007706 return outbytes;
7707}
7708
7709PyObject *
7710PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7711 Py_ssize_t size,
7712 const char *errors)
7713{
Victor Stinner7581cef2011-11-03 22:32:33 +01007714 PyObject *unicode, *res;
7715 unicode = PyUnicode_FromUnicode(p, size);
7716 if (unicode == NULL)
7717 return NULL;
7718 res = encode_code_page(CP_ACP, unicode, errors);
7719 Py_DECREF(unicode);
7720 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007721}
7722
7723PyObject *
7724PyUnicode_EncodeCodePage(int code_page,
7725 PyObject *unicode,
7726 const char *errors)
7727{
Victor Stinner7581cef2011-11-03 22:32:33 +01007728 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007729}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007730
Alexander Belopolsky40018472011-02-26 01:02:56 +00007731PyObject *
7732PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007733{
7734 if (!PyUnicode_Check(unicode)) {
7735 PyErr_BadArgument();
7736 return NULL;
7737 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007738 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007739}
7740
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007741#undef NEED_RETRY
7742
Victor Stinner99b95382011-07-04 14:23:54 +02007743#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007744
Guido van Rossumd57fd912000-03-10 22:53:23 +00007745/* --- Character Mapping Codec -------------------------------------------- */
7746
Alexander Belopolsky40018472011-02-26 01:02:56 +00007747PyObject *
7748PyUnicode_DecodeCharmap(const char *s,
7749 Py_ssize_t size,
7750 PyObject *mapping,
7751 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007752{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007753 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007754 Py_ssize_t startinpos;
7755 Py_ssize_t endinpos;
7756 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007757 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007758 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007759 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007760 PyObject *errorHandler = NULL;
7761 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007762
Guido van Rossumd57fd912000-03-10 22:53:23 +00007763 /* Default to Latin-1 */
7764 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007765 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007766
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007767 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007768 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007769 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007770 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007771 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007772 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007773 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007774 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007775 Py_ssize_t maplen;
7776 enum PyUnicode_Kind kind;
7777 void *data;
7778 Py_UCS4 x;
7779
Benjamin Petersonbac79492012-01-14 13:34:47 -05007780 if (PyUnicode_READY(mapping) == -1)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007781 return NULL;
7782
7783 maplen = PyUnicode_GET_LENGTH(mapping);
7784 data = PyUnicode_DATA(mapping);
7785 kind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007786 while (s < e) {
7787 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007788
Benjamin Peterson29060642009-01-31 22:14:21 +00007789 if (ch < maplen)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007790 x = PyUnicode_READ(kind, data, ch);
7791 else
7792 x = 0xfffe; /* invalid value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007793
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007794 if (x == 0xfffe)
7795 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007796 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007797 startinpos = s-starts;
7798 endinpos = startinpos+1;
7799 if (unicode_decode_call_errorhandler(
7800 errors, &errorHandler,
7801 "charmap", "character maps to <undefined>",
7802 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007803 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007804 goto onError;
7805 }
7806 continue;
7807 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007808
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007809 if (unicode_putchar(&v, &outpos, x) < 0)
7810 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007811 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007812 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007813 }
7814 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007815 while (s < e) {
7816 unsigned char ch = *s;
7817 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007818
Benjamin Peterson29060642009-01-31 22:14:21 +00007819 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7820 w = PyLong_FromLong((long)ch);
7821 if (w == NULL)
7822 goto onError;
7823 x = PyObject_GetItem(mapping, w);
7824 Py_DECREF(w);
7825 if (x == NULL) {
7826 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7827 /* No mapping found means: mapping is undefined. */
7828 PyErr_Clear();
7829 x = Py_None;
7830 Py_INCREF(x);
7831 } else
7832 goto onError;
7833 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007834
Benjamin Peterson29060642009-01-31 22:14:21 +00007835 /* Apply mapping */
7836 if (PyLong_Check(x)) {
7837 long value = PyLong_AS_LONG(x);
7838 if (value < 0 || value > 65535) {
7839 PyErr_SetString(PyExc_TypeError,
7840 "character mapping must be in range(65536)");
7841 Py_DECREF(x);
7842 goto onError;
7843 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007844 if (unicode_putchar(&v, &outpos, value) < 0)
7845 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007846 }
7847 else if (x == Py_None) {
7848 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007849 startinpos = s-starts;
7850 endinpos = startinpos+1;
7851 if (unicode_decode_call_errorhandler(
7852 errors, &errorHandler,
7853 "charmap", "character maps to <undefined>",
7854 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007855 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007856 Py_DECREF(x);
7857 goto onError;
7858 }
7859 Py_DECREF(x);
7860 continue;
7861 }
7862 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007863 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007864
Benjamin Petersonbac79492012-01-14 13:34:47 -05007865 if (PyUnicode_READY(x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007866 goto onError;
7867 targetsize = PyUnicode_GET_LENGTH(x);
7868
7869 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007870 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007871 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007872 PyUnicode_READ_CHAR(x, 0)) < 0)
7873 goto onError;
7874 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007875 else if (targetsize > 1) {
7876 /* 1-n mapping */
7877 if (targetsize > extrachars) {
7878 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007879 Py_ssize_t needed = (targetsize - extrachars) + \
7880 (targetsize << 2);
7881 extrachars += needed;
7882 /* XXX overflow detection missing */
Victor Stinner16e6a802011-12-12 13:24:15 +01007883 if (unicode_resize(&v,
7884 PyUnicode_GET_LENGTH(v) + needed) < 0)
7885 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007886 Py_DECREF(x);
7887 goto onError;
7888 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007889 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007890 if (unicode_widen(&v, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
7891 goto onError;
7892 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7893 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007894 extrachars -= targetsize;
7895 }
7896 /* 1-0 mapping: skip the character */
7897 }
7898 else {
7899 /* wrong return value */
7900 PyErr_SetString(PyExc_TypeError,
7901 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007902 Py_DECREF(x);
7903 goto onError;
7904 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007905 Py_DECREF(x);
7906 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007907 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007908 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007909 if (unicode_resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007910 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007911 Py_XDECREF(errorHandler);
7912 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007913 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00007914
Benjamin Peterson29060642009-01-31 22:14:21 +00007915 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007916 Py_XDECREF(errorHandler);
7917 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007918 Py_XDECREF(v);
7919 return NULL;
7920}
7921
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007922/* Charmap encoding: the lookup table */
7923
Alexander Belopolsky40018472011-02-26 01:02:56 +00007924struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007925 PyObject_HEAD
7926 unsigned char level1[32];
7927 int count2, count3;
7928 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007929};
7930
7931static PyObject*
7932encoding_map_size(PyObject *obj, PyObject* args)
7933{
7934 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007935 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007936 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007937}
7938
7939static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007940 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007941 PyDoc_STR("Return the size (in bytes) of this object") },
7942 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007943};
7944
7945static void
7946encoding_map_dealloc(PyObject* o)
7947{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007948 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007949}
7950
7951static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007952 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007953 "EncodingMap", /*tp_name*/
7954 sizeof(struct encoding_map), /*tp_basicsize*/
7955 0, /*tp_itemsize*/
7956 /* methods */
7957 encoding_map_dealloc, /*tp_dealloc*/
7958 0, /*tp_print*/
7959 0, /*tp_getattr*/
7960 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007961 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007962 0, /*tp_repr*/
7963 0, /*tp_as_number*/
7964 0, /*tp_as_sequence*/
7965 0, /*tp_as_mapping*/
7966 0, /*tp_hash*/
7967 0, /*tp_call*/
7968 0, /*tp_str*/
7969 0, /*tp_getattro*/
7970 0, /*tp_setattro*/
7971 0, /*tp_as_buffer*/
7972 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7973 0, /*tp_doc*/
7974 0, /*tp_traverse*/
7975 0, /*tp_clear*/
7976 0, /*tp_richcompare*/
7977 0, /*tp_weaklistoffset*/
7978 0, /*tp_iter*/
7979 0, /*tp_iternext*/
7980 encoding_map_methods, /*tp_methods*/
7981 0, /*tp_members*/
7982 0, /*tp_getset*/
7983 0, /*tp_base*/
7984 0, /*tp_dict*/
7985 0, /*tp_descr_get*/
7986 0, /*tp_descr_set*/
7987 0, /*tp_dictoffset*/
7988 0, /*tp_init*/
7989 0, /*tp_alloc*/
7990 0, /*tp_new*/
7991 0, /*tp_free*/
7992 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007993};
7994
7995PyObject*
7996PyUnicode_BuildEncodingMap(PyObject* string)
7997{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007998 PyObject *result;
7999 struct encoding_map *mresult;
8000 int i;
8001 int need_dict = 0;
8002 unsigned char level1[32];
8003 unsigned char level2[512];
8004 unsigned char *mlevel1, *mlevel2, *mlevel3;
8005 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008006 int kind;
8007 void *data;
8008 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008009
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008010 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008011 PyErr_BadArgument();
8012 return NULL;
8013 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008014 kind = PyUnicode_KIND(string);
8015 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008016 memset(level1, 0xFF, sizeof level1);
8017 memset(level2, 0xFF, sizeof level2);
8018
8019 /* If there isn't a one-to-one mapping of NULL to \0,
8020 or if there are non-BMP characters, we need to use
8021 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008022 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008023 need_dict = 1;
8024 for (i = 1; i < 256; i++) {
8025 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008026 ch = PyUnicode_READ(kind, data, i);
8027 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008028 need_dict = 1;
8029 break;
8030 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008031 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008032 /* unmapped character */
8033 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008034 l1 = ch >> 11;
8035 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008036 if (level1[l1] == 0xFF)
8037 level1[l1] = count2++;
8038 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008039 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008040 }
8041
8042 if (count2 >= 0xFF || count3 >= 0xFF)
8043 need_dict = 1;
8044
8045 if (need_dict) {
8046 PyObject *result = PyDict_New();
8047 PyObject *key, *value;
8048 if (!result)
8049 return NULL;
8050 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008051 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008052 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008053 if (!key || !value)
8054 goto failed1;
8055 if (PyDict_SetItem(result, key, value) == -1)
8056 goto failed1;
8057 Py_DECREF(key);
8058 Py_DECREF(value);
8059 }
8060 return result;
8061 failed1:
8062 Py_XDECREF(key);
8063 Py_XDECREF(value);
8064 Py_DECREF(result);
8065 return NULL;
8066 }
8067
8068 /* Create a three-level trie */
8069 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8070 16*count2 + 128*count3 - 1);
8071 if (!result)
8072 return PyErr_NoMemory();
8073 PyObject_Init(result, &EncodingMapType);
8074 mresult = (struct encoding_map*)result;
8075 mresult->count2 = count2;
8076 mresult->count3 = count3;
8077 mlevel1 = mresult->level1;
8078 mlevel2 = mresult->level23;
8079 mlevel3 = mresult->level23 + 16*count2;
8080 memcpy(mlevel1, level1, 32);
8081 memset(mlevel2, 0xFF, 16*count2);
8082 memset(mlevel3, 0, 128*count3);
8083 count3 = 0;
8084 for (i = 1; i < 256; i++) {
8085 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008086 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008087 /* unmapped character */
8088 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008089 o1 = PyUnicode_READ(kind, data, i)>>11;
8090 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008091 i2 = 16*mlevel1[o1] + o2;
8092 if (mlevel2[i2] == 0xFF)
8093 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008094 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008095 i3 = 128*mlevel2[i2] + o3;
8096 mlevel3[i3] = i;
8097 }
8098 return result;
8099}
8100
8101static int
Victor Stinner22168992011-11-20 17:09:18 +01008102encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008103{
8104 struct encoding_map *map = (struct encoding_map*)mapping;
8105 int l1 = c>>11;
8106 int l2 = (c>>7) & 0xF;
8107 int l3 = c & 0x7F;
8108 int i;
8109
Victor Stinner22168992011-11-20 17:09:18 +01008110 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008111 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008112 if (c == 0)
8113 return 0;
8114 /* level 1*/
8115 i = map->level1[l1];
8116 if (i == 0xFF) {
8117 return -1;
8118 }
8119 /* level 2*/
8120 i = map->level23[16*i+l2];
8121 if (i == 0xFF) {
8122 return -1;
8123 }
8124 /* level 3 */
8125 i = map->level23[16*map->count2 + 128*i + l3];
8126 if (i == 0) {
8127 return -1;
8128 }
8129 return i;
8130}
8131
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008132/* Lookup the character ch in the mapping. If the character
8133 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008134 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008135static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008136charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008137{
Christian Heimes217cfd12007-12-02 14:31:20 +00008138 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008139 PyObject *x;
8140
8141 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008142 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008143 x = PyObject_GetItem(mapping, w);
8144 Py_DECREF(w);
8145 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008146 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8147 /* No mapping found means: mapping is undefined. */
8148 PyErr_Clear();
8149 x = Py_None;
8150 Py_INCREF(x);
8151 return x;
8152 } else
8153 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008154 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008155 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008156 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008157 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008158 long value = PyLong_AS_LONG(x);
8159 if (value < 0 || value > 255) {
8160 PyErr_SetString(PyExc_TypeError,
8161 "character mapping must be in range(256)");
8162 Py_DECREF(x);
8163 return NULL;
8164 }
8165 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008166 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008167 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008168 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008169 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008170 /* wrong return value */
8171 PyErr_Format(PyExc_TypeError,
8172 "character mapping must return integer, bytes or None, not %.400s",
8173 x->ob_type->tp_name);
8174 Py_DECREF(x);
8175 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008176 }
8177}
8178
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008179static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008180charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008181{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008182 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8183 /* exponentially overallocate to minimize reallocations */
8184 if (requiredsize < 2*outsize)
8185 requiredsize = 2*outsize;
8186 if (_PyBytes_Resize(outobj, requiredsize))
8187 return -1;
8188 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008189}
8190
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was encoded and written */
    enc_FAILED,         /* the mapping has no entry for the character */
    enc_EXCEPTION       /* a lookup or resize error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008194/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008195 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008196 space is available. Return a new reference to the object that
8197 was put in the output buffer, or Py_None, if the mapping was undefined
8198 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008199 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008200static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008201charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008202 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008203{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008204 PyObject *rep;
8205 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008206 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008207
Christian Heimes90aa7642007-12-19 02:45:37 +00008208 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008209 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008210 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008211 if (res == -1)
8212 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008213 if (outsize<requiredsize)
8214 if (charmapencode_resize(outobj, outpos, requiredsize))
8215 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008216 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008217 outstart[(*outpos)++] = (char)res;
8218 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008219 }
8220
8221 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008222 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008223 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008224 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008225 Py_DECREF(rep);
8226 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008227 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008228 if (PyLong_Check(rep)) {
8229 Py_ssize_t requiredsize = *outpos+1;
8230 if (outsize<requiredsize)
8231 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8232 Py_DECREF(rep);
8233 return enc_EXCEPTION;
8234 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008235 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008236 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008237 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008238 else {
8239 const char *repchars = PyBytes_AS_STRING(rep);
8240 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8241 Py_ssize_t requiredsize = *outpos+repsize;
8242 if (outsize<requiredsize)
8243 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8244 Py_DECREF(rep);
8245 return enc_EXCEPTION;
8246 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008247 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008248 memcpy(outstart + *outpos, repchars, repsize);
8249 *outpos += repsize;
8250 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008251 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008252 Py_DECREF(rep);
8253 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008254}
8255
8256/* handle an error in PyUnicode_EncodeCharmap
8257 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008258static int
8259charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008260 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008261 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008262 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008263 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008264{
8265 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008266 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008267 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008268 enum PyUnicode_Kind kind;
8269 void *data;
8270 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008271 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008272 Py_ssize_t collstartpos = *inpos;
8273 Py_ssize_t collendpos = *inpos+1;
8274 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008275 char *encoding = "charmap";
8276 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008277 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008278 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008279 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008280
Benjamin Petersonbac79492012-01-14 13:34:47 -05008281 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008282 return -1;
8283 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008284 /* find all unencodable characters */
8285 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008286 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008287 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008288 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008289 val = encoding_map_lookup(ch, mapping);
8290 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008291 break;
8292 ++collendpos;
8293 continue;
8294 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008295
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008296 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8297 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008298 if (rep==NULL)
8299 return -1;
8300 else if (rep!=Py_None) {
8301 Py_DECREF(rep);
8302 break;
8303 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008304 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008305 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008306 }
8307 /* cache callback name lookup
8308 * (if not done yet, i.e. it's the first error) */
8309 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008310 if ((errors==NULL) || (!strcmp(errors, "strict")))
8311 *known_errorHandler = 1;
8312 else if (!strcmp(errors, "replace"))
8313 *known_errorHandler = 2;
8314 else if (!strcmp(errors, "ignore"))
8315 *known_errorHandler = 3;
8316 else if (!strcmp(errors, "xmlcharrefreplace"))
8317 *known_errorHandler = 4;
8318 else
8319 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008320 }
8321 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008322 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008323 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008324 return -1;
8325 case 2: /* replace */
8326 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008327 x = charmapencode_output('?', mapping, res, respos);
8328 if (x==enc_EXCEPTION) {
8329 return -1;
8330 }
8331 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008332 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008333 return -1;
8334 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008335 }
8336 /* fall through */
8337 case 3: /* ignore */
8338 *inpos = collendpos;
8339 break;
8340 case 4: /* xmlcharrefreplace */
8341 /* generate replacement (temporarily (mis)uses p) */
8342 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008343 char buffer[2+29+1+1];
8344 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008345 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008346 for (cp = buffer; *cp; ++cp) {
8347 x = charmapencode_output(*cp, mapping, res, respos);
8348 if (x==enc_EXCEPTION)
8349 return -1;
8350 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008351 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008352 return -1;
8353 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008354 }
8355 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008356 *inpos = collendpos;
8357 break;
8358 default:
8359 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008360 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008361 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008362 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008363 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008364 if (PyBytes_Check(repunicode)) {
8365 /* Directly copy bytes result to output. */
8366 Py_ssize_t outsize = PyBytes_Size(*res);
8367 Py_ssize_t requiredsize;
8368 repsize = PyBytes_Size(repunicode);
8369 requiredsize = *respos + repsize;
8370 if (requiredsize > outsize)
8371 /* Make room for all additional bytes. */
8372 if (charmapencode_resize(res, respos, requiredsize)) {
8373 Py_DECREF(repunicode);
8374 return -1;
8375 }
8376 memcpy(PyBytes_AsString(*res) + *respos,
8377 PyBytes_AsString(repunicode), repsize);
8378 *respos += repsize;
8379 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008380 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008381 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008382 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008383 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008384 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008385 Py_DECREF(repunicode);
8386 return -1;
8387 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008388 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008389 data = PyUnicode_DATA(repunicode);
8390 kind = PyUnicode_KIND(repunicode);
8391 for (index = 0; index < repsize; index++) {
8392 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8393 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008394 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008395 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008396 return -1;
8397 }
8398 else if (x==enc_FAILED) {
8399 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008400 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008401 return -1;
8402 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008403 }
8404 *inpos = newpos;
8405 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008406 }
8407 return 0;
8408}
8409
Alexander Belopolsky40018472011-02-26 01:02:56 +00008410PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008411_PyUnicode_EncodeCharmap(PyObject *unicode,
8412 PyObject *mapping,
8413 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008414{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008415 /* output object */
8416 PyObject *res = NULL;
8417 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008418 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008419 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008420 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008421 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008422 PyObject *errorHandler = NULL;
8423 PyObject *exc = NULL;
8424 /* the following variable is used for caching string comparisons
8425 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8426 * 3=ignore, 4=xmlcharrefreplace */
8427 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008428
Benjamin Petersonbac79492012-01-14 13:34:47 -05008429 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008430 return NULL;
8431 size = PyUnicode_GET_LENGTH(unicode);
8432
Guido van Rossumd57fd912000-03-10 22:53:23 +00008433 /* Default to Latin-1 */
8434 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008435 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008436
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008437 /* allocate enough for a simple encoding without
8438 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008439 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008440 if (res == NULL)
8441 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008442 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008443 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008444
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008445 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008446 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008447 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008448 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008449 if (x==enc_EXCEPTION) /* error */
8450 goto onError;
8451 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008452 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008453 &exc,
8454 &known_errorHandler, &errorHandler, errors,
8455 &res, &respos)) {
8456 goto onError;
8457 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008458 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008459 else
8460 /* done with this character => adjust input position */
8461 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008462 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008463
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008464 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008465 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008466 if (_PyBytes_Resize(&res, respos) < 0)
8467 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008468
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008469 Py_XDECREF(exc);
8470 Py_XDECREF(errorHandler);
8471 return res;
8472
Benjamin Peterson29060642009-01-31 22:14:21 +00008473 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008474 Py_XDECREF(res);
8475 Py_XDECREF(exc);
8476 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008477 return NULL;
8478}
8479
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008480/* Deprecated */
8481PyObject *
8482PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8483 Py_ssize_t size,
8484 PyObject *mapping,
8485 const char *errors)
8486{
8487 PyObject *result;
8488 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8489 if (unicode == NULL)
8490 return NULL;
8491 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8492 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008493 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008494}
8495
Alexander Belopolsky40018472011-02-26 01:02:56 +00008496PyObject *
8497PyUnicode_AsCharmapString(PyObject *unicode,
8498 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008499{
8500 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008501 PyErr_BadArgument();
8502 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008503 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008504 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008505}
8506
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008507/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008508static void
8509make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008510 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008511 Py_ssize_t startpos, Py_ssize_t endpos,
8512 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008513{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008514 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008515 *exceptionObject = _PyUnicodeTranslateError_Create(
8516 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008517 }
8518 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008519 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8520 goto onError;
8521 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8522 goto onError;
8523 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8524 goto onError;
8525 return;
8526 onError:
8527 Py_DECREF(*exceptionObject);
8528 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008529 }
8530}
8531
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008532/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008533static void
8534raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008535 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008536 Py_ssize_t startpos, Py_ssize_t endpos,
8537 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008538{
8539 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008540 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008541 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008542 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008543}
8544
8545/* error handling callback helper:
8546 build arguments, call the callback and check the arguments,
8547 put the result into newpos and return the replacement string, which
8548 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008549static PyObject *
8550unicode_translate_call_errorhandler(const char *errors,
8551 PyObject **errorHandler,
8552 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008553 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008554 Py_ssize_t startpos, Py_ssize_t endpos,
8555 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008556{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008557 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008558
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008559 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008560 PyObject *restuple;
8561 PyObject *resunicode;
8562
8563 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008564 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008565 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008566 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008567 }
8568
8569 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008570 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008571 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008572 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008573
8574 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008575 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008576 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008577 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008578 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008579 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008580 Py_DECREF(restuple);
8581 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008582 }
8583 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008584 &resunicode, &i_newpos)) {
8585 Py_DECREF(restuple);
8586 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008587 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008588 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008589 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008590 else
8591 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008592 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008593 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8594 Py_DECREF(restuple);
8595 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008596 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008597 Py_INCREF(resunicode);
8598 Py_DECREF(restuple);
8599 return resunicode;
8600}
8601
8602/* Lookup the character ch in the mapping and put the result in result,
8603 which must be decrefed by the caller.
8604 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008605static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008606charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008607{
Christian Heimes217cfd12007-12-02 14:31:20 +00008608 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008609 PyObject *x;
8610
8611 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008612 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008613 x = PyObject_GetItem(mapping, w);
8614 Py_DECREF(w);
8615 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008616 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8617 /* No mapping found means: use 1:1 mapping. */
8618 PyErr_Clear();
8619 *result = NULL;
8620 return 0;
8621 } else
8622 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008623 }
8624 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008625 *result = x;
8626 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008627 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008628 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008629 long value = PyLong_AS_LONG(x);
8630 long max = PyUnicode_GetMax();
8631 if (value < 0 || value > max) {
8632 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008633 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008634 Py_DECREF(x);
8635 return -1;
8636 }
8637 *result = x;
8638 return 0;
8639 }
8640 else if (PyUnicode_Check(x)) {
8641 *result = x;
8642 return 0;
8643 }
8644 else {
8645 /* wrong return value */
8646 PyErr_SetString(PyExc_TypeError,
8647 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008648 Py_DECREF(x);
8649 return -1;
8650 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008651}
8652/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008653 if not reallocate and adjust various state variables.
8654 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008655static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008656charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008657 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008658{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008659 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008660 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008661 /* exponentially overallocate to minimize reallocations */
8662 if (requiredsize < 2 * oldsize)
8663 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008664 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8665 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008666 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008667 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008668 }
8669 return 0;
8670}
8671/* lookup the character, put the result in the output string and adjust
8672 various state variables. Return a new reference to the object that
8673 was put in the output buffer in *result, or Py_None, if the mapping was
8674 undefined (in which case no character was written).
8675 The called must decref result.
8676 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008677static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008678charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8679 PyObject *mapping, Py_UCS4 **output,
8680 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008681 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008682{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008683 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8684 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008685 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008686 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008687 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008688 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008689 }
8690 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008691 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008692 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008693 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008694 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008695 }
8696 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008697 Py_ssize_t repsize;
8698 if (PyUnicode_READY(*res) == -1)
8699 return -1;
8700 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008701 if (repsize==1) {
8702 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008703 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008704 }
8705 else if (repsize!=0) {
8706 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008707 Py_ssize_t requiredsize = *opos +
8708 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008709 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008710 Py_ssize_t i;
8711 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008712 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008713 for(i = 0; i < repsize; i++)
8714 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008715 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008716 }
8717 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008718 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008719 return 0;
8720}
8721
Alexander Belopolsky40018472011-02-26 01:02:56 +00008722PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008723_PyUnicode_TranslateCharmap(PyObject *input,
8724 PyObject *mapping,
8725 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008726{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008727 /* input object */
8728 char *idata;
8729 Py_ssize_t size, i;
8730 int kind;
8731 /* output buffer */
8732 Py_UCS4 *output = NULL;
8733 Py_ssize_t osize;
8734 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008735 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008736 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008737 char *reason = "character maps to <undefined>";
8738 PyObject *errorHandler = NULL;
8739 PyObject *exc = NULL;
8740 /* the following variable is used for caching string comparisons
8741 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8742 * 3=ignore, 4=xmlcharrefreplace */
8743 int known_errorHandler = -1;
8744
Guido van Rossumd57fd912000-03-10 22:53:23 +00008745 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008746 PyErr_BadArgument();
8747 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008748 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008749
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008750 if (PyUnicode_READY(input) == -1)
8751 return NULL;
8752 idata = (char*)PyUnicode_DATA(input);
8753 kind = PyUnicode_KIND(input);
8754 size = PyUnicode_GET_LENGTH(input);
8755 i = 0;
8756
8757 if (size == 0) {
8758 Py_INCREF(input);
8759 return input;
8760 }
8761
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008762 /* allocate enough for a simple 1:1 translation without
8763 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008764 osize = size;
8765 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8766 opos = 0;
8767 if (output == NULL) {
8768 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008769 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008770 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008771
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008772 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008773 /* try to encode it */
8774 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008775 if (charmaptranslate_output(input, i, mapping,
8776 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008777 Py_XDECREF(x);
8778 goto onError;
8779 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008780 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008781 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008782 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008783 else { /* untranslatable character */
8784 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8785 Py_ssize_t repsize;
8786 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008787 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008788 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008789 Py_ssize_t collstart = i;
8790 Py_ssize_t collend = i+1;
8791 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008792
Benjamin Peterson29060642009-01-31 22:14:21 +00008793 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008794 while (collend < size) {
8795 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008796 goto onError;
8797 Py_XDECREF(x);
8798 if (x!=Py_None)
8799 break;
8800 ++collend;
8801 }
8802 /* cache callback name lookup
8803 * (if not done yet, i.e. it's the first error) */
8804 if (known_errorHandler==-1) {
8805 if ((errors==NULL) || (!strcmp(errors, "strict")))
8806 known_errorHandler = 1;
8807 else if (!strcmp(errors, "replace"))
8808 known_errorHandler = 2;
8809 else if (!strcmp(errors, "ignore"))
8810 known_errorHandler = 3;
8811 else if (!strcmp(errors, "xmlcharrefreplace"))
8812 known_errorHandler = 4;
8813 else
8814 known_errorHandler = 0;
8815 }
8816 switch (known_errorHandler) {
8817 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008818 raise_translate_exception(&exc, input, collstart,
8819 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008820 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008821 case 2: /* replace */
8822 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008823 for (coll = collstart; coll<collend; coll++)
8824 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008825 /* fall through */
8826 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008827 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008828 break;
8829 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008830 /* generate replacement (temporarily (mis)uses i) */
8831 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008832 char buffer[2+29+1+1];
8833 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008834 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8835 if (charmaptranslate_makespace(&output, &osize,
8836 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008837 goto onError;
8838 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008839 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008840 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008841 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008842 break;
8843 default:
8844 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008845 reason, input, &exc,
8846 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008847 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008848 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008849 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008850 Py_DECREF(repunicode);
8851 goto onError;
8852 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008853 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008854 repsize = PyUnicode_GET_LENGTH(repunicode);
8855 if (charmaptranslate_makespace(&output, &osize,
8856 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008857 Py_DECREF(repunicode);
8858 goto onError;
8859 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008860 for (uni2 = 0; repsize-->0; ++uni2)
8861 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8862 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008863 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008864 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008865 }
8866 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008867 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8868 if (!res)
8869 goto onError;
8870 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008871 Py_XDECREF(exc);
8872 Py_XDECREF(errorHandler);
8873 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008874
Benjamin Peterson29060642009-01-31 22:14:21 +00008875 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008876 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008877 Py_XDECREF(exc);
8878 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008879 return NULL;
8880}
8881
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008882/* Deprecated. Use PyUnicode_Translate instead. */
8883PyObject *
8884PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8885 Py_ssize_t size,
8886 PyObject *mapping,
8887 const char *errors)
8888{
8889 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8890 if (!unicode)
8891 return NULL;
8892 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8893}
8894
Alexander Belopolsky40018472011-02-26 01:02:56 +00008895PyObject *
8896PyUnicode_Translate(PyObject *str,
8897 PyObject *mapping,
8898 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008899{
8900 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008901
Guido van Rossumd57fd912000-03-10 22:53:23 +00008902 str = PyUnicode_FromObject(str);
8903 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008904 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008905 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008906 Py_DECREF(str);
8907 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008908
Benjamin Peterson29060642009-01-31 22:14:21 +00008909 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008910 Py_XDECREF(str);
8911 return NULL;
8912}
Tim Petersced69f82003-09-16 20:30:58 +00008913
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008914static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008915fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008916{
8917 /* No need to call PyUnicode_READY(self) because this function is only
8918 called as a callback from fixup() which does it already. */
8919 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8920 const int kind = PyUnicode_KIND(self);
8921 void *data = PyUnicode_DATA(self);
8922 Py_UCS4 maxchar = 0, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008923 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008924 Py_ssize_t i;
8925
8926 for (i = 0; i < len; ++i) {
8927 ch = PyUnicode_READ(kind, data, i);
8928 fixed = 0;
8929 if (ch > 127) {
8930 if (Py_UNICODE_ISSPACE(ch))
8931 fixed = ' ';
8932 else {
8933 const int decimal = Py_UNICODE_TODECIMAL(ch);
8934 if (decimal >= 0)
8935 fixed = '0' + decimal;
8936 }
8937 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008938 modified = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008939 if (fixed > maxchar)
8940 maxchar = fixed;
8941 PyUnicode_WRITE(kind, data, i, fixed);
8942 }
8943 else if (ch > maxchar)
8944 maxchar = ch;
8945 }
8946 else if (ch > maxchar)
8947 maxchar = ch;
8948 }
8949
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008950 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008951}
8952
8953PyObject *
8954_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8955{
8956 if (!PyUnicode_Check(unicode)) {
8957 PyErr_BadInternalCall();
8958 return NULL;
8959 }
8960 if (PyUnicode_READY(unicode) == -1)
8961 return NULL;
8962 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8963 /* If the string is already ASCII, just return the same string */
8964 Py_INCREF(unicode);
8965 return unicode;
8966 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008967 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008968}
8969
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008970PyObject *
8971PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8972 Py_ssize_t length)
8973{
Victor Stinnerf0124502011-11-21 23:12:56 +01008974 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008975 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008976 Py_UCS4 maxchar;
8977 enum PyUnicode_Kind kind;
8978 void *data;
8979
Victor Stinner99d7ad02012-02-22 13:37:39 +01008980 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008981 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008982 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008983 if (ch > 127) {
8984 int decimal = Py_UNICODE_TODECIMAL(ch);
8985 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008986 ch = '0' + decimal;
Victor Stinner99d7ad02012-02-22 13:37:39 +01008987 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008988 }
8989 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008990
8991 /* Copy to a new string */
8992 decimal = PyUnicode_New(length, maxchar);
8993 if (decimal == NULL)
8994 return decimal;
8995 kind = PyUnicode_KIND(decimal);
8996 data = PyUnicode_DATA(decimal);
8997 /* Iterate over code points */
8998 for (i = 0; i < length; i++) {
8999 Py_UNICODE ch = s[i];
9000 if (ch > 127) {
9001 int decimal = Py_UNICODE_TODECIMAL(ch);
9002 if (decimal >= 0)
9003 ch = '0' + decimal;
9004 }
9005 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009006 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009007 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009008}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009009/* --- Decimal Encoder ---------------------------------------------------- */
9010
Alexander Belopolsky40018472011-02-26 01:02:56 +00009011int
9012PyUnicode_EncodeDecimal(Py_UNICODE *s,
9013 Py_ssize_t length,
9014 char *output,
9015 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009016{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009017 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009018 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009019 enum PyUnicode_Kind kind;
9020 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009021
9022 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009023 PyErr_BadArgument();
9024 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009025 }
9026
Victor Stinner42bf7752011-11-21 22:52:58 +01009027 unicode = PyUnicode_FromUnicode(s, length);
9028 if (unicode == NULL)
9029 return -1;
9030
Benjamin Petersonbac79492012-01-14 13:34:47 -05009031 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01009032 Py_DECREF(unicode);
9033 return -1;
9034 }
Victor Stinner42bf7752011-11-21 22:52:58 +01009035 kind = PyUnicode_KIND(unicode);
9036 data = PyUnicode_DATA(unicode);
9037
Victor Stinnerb84d7232011-11-22 01:50:07 +01009038 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009039 PyObject *exc;
9040 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009041 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009042 Py_ssize_t startpos;
9043
9044 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009045
Benjamin Peterson29060642009-01-31 22:14:21 +00009046 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009047 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009048 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009049 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009050 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009051 decimal = Py_UNICODE_TODECIMAL(ch);
9052 if (decimal >= 0) {
9053 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009054 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009055 continue;
9056 }
9057 if (0 < ch && ch < 256) {
9058 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009059 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009060 continue;
9061 }
Victor Stinner6345be92011-11-25 20:09:01 +01009062
Victor Stinner42bf7752011-11-21 22:52:58 +01009063 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009064 exc = NULL;
9065 raise_encode_exception(&exc, "decimal", unicode,
9066 startpos, startpos+1,
9067 "invalid decimal Unicode string");
9068 Py_XDECREF(exc);
9069 Py_DECREF(unicode);
9070 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009071 }
9072 /* 0-terminate the output string */
9073 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009074 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009075 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009076}
9077
Guido van Rossumd57fd912000-03-10 22:53:23 +00009078/* --- Helpers ------------------------------------------------------------ */
9079
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009080static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02009081any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009082 Py_ssize_t start,
9083 Py_ssize_t end)
9084{
9085 int kind1, kind2, kind;
9086 void *buf1, *buf2;
9087 Py_ssize_t len1, len2, result;
9088
9089 kind1 = PyUnicode_KIND(s1);
9090 kind2 = PyUnicode_KIND(s2);
9091 kind = kind1 > kind2 ? kind1 : kind2;
9092 buf1 = PyUnicode_DATA(s1);
9093 buf2 = PyUnicode_DATA(s2);
9094 if (kind1 != kind)
9095 buf1 = _PyUnicode_AsKind(s1, kind);
9096 if (!buf1)
9097 return -2;
9098 if (kind2 != kind)
9099 buf2 = _PyUnicode_AsKind(s2, kind);
9100 if (!buf2) {
9101 if (kind1 != kind) PyMem_Free(buf1);
9102 return -2;
9103 }
9104 len1 = PyUnicode_GET_LENGTH(s1);
9105 len2 = PyUnicode_GET_LENGTH(s2);
9106
Victor Stinner794d5672011-10-10 03:21:36 +02009107 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06009108 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02009109 case PyUnicode_1BYTE_KIND:
9110 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9111 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9112 else
9113 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9114 break;
9115 case PyUnicode_2BYTE_KIND:
9116 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9117 break;
9118 case PyUnicode_4BYTE_KIND:
9119 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9120 break;
9121 default:
9122 assert(0); result = -2;
9123 }
9124 }
9125 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06009126 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02009127 case PyUnicode_1BYTE_KIND:
9128 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9129 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9130 else
9131 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9132 break;
9133 case PyUnicode_2BYTE_KIND:
9134 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9135 break;
9136 case PyUnicode_4BYTE_KIND:
9137 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9138 break;
9139 default:
9140 assert(0); result = -2;
9141 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009142 }
9143
9144 if (kind1 != kind)
9145 PyMem_Free(buf1);
9146 if (kind2 != kind)
9147 PyMem_Free(buf2);
9148
9149 return result;
9150}
9151
9152Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009153_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009154 Py_ssize_t n_buffer,
9155 void *digits, Py_ssize_t n_digits,
9156 Py_ssize_t min_width,
9157 const char *grouping,
9158 const char *thousands_sep)
9159{
Benjamin Petersonead6b532011-12-20 17:23:42 -06009160 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009161 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009162 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
9163 return _PyUnicode_ascii_InsertThousandsGrouping(
9164 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9165 min_width, grouping, thousands_sep);
9166 else
9167 return _PyUnicode_ucs1_InsertThousandsGrouping(
9168 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9169 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009170 case PyUnicode_2BYTE_KIND:
9171 return _PyUnicode_ucs2_InsertThousandsGrouping(
9172 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
9173 min_width, grouping, thousands_sep);
9174 case PyUnicode_4BYTE_KIND:
9175 return _PyUnicode_ucs4_InsertThousandsGrouping(
9176 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
9177 min_width, grouping, thousands_sep);
9178 }
9179 assert(0);
9180 return -1;
9181}
9182
9183
/* helper macro to fixup start/end slice values

   Clamps end to len, and adds len to a negative start or end (clamping
   the sum at 0).  start and end must be modifiable lvalues; each argument
   may be evaluated more than once, so avoid arguments with side effects.

   The body is wrapped in do { } while (0) so that the macro behaves as a
   single statement (e.g. in an unbraced if/else); use it with a trailing
   semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009198
Alexander Belopolsky40018472011-02-26 01:02:56 +00009199Py_ssize_t
9200PyUnicode_Count(PyObject *str,
9201 PyObject *substr,
9202 Py_ssize_t start,
9203 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009204{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009205 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009206 PyObject* str_obj;
9207 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009208 int kind1, kind2, kind;
9209 void *buf1 = NULL, *buf2 = NULL;
9210 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009211
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009212 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009213 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00009214 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009215 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009216 if (!sub_obj) {
9217 Py_DECREF(str_obj);
9218 return -1;
9219 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06009220 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06009221 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00009222 Py_DECREF(str_obj);
9223 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009224 }
Tim Petersced69f82003-09-16 20:30:58 +00009225
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009226 kind1 = PyUnicode_KIND(str_obj);
9227 kind2 = PyUnicode_KIND(sub_obj);
9228 kind = kind1 > kind2 ? kind1 : kind2;
9229 buf1 = PyUnicode_DATA(str_obj);
9230 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009231 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009232 if (!buf1)
9233 goto onError;
9234 buf2 = PyUnicode_DATA(sub_obj);
9235 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009236 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009237 if (!buf2)
9238 goto onError;
9239 len1 = PyUnicode_GET_LENGTH(str_obj);
9240 len2 = PyUnicode_GET_LENGTH(sub_obj);
9241
9242 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06009243 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009244 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009245 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9246 result = asciilib_count(
9247 ((Py_UCS1*)buf1) + start, end - start,
9248 buf2, len2, PY_SSIZE_T_MAX
9249 );
9250 else
9251 result = ucs1lib_count(
9252 ((Py_UCS1*)buf1) + start, end - start,
9253 buf2, len2, PY_SSIZE_T_MAX
9254 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009255 break;
9256 case PyUnicode_2BYTE_KIND:
9257 result = ucs2lib_count(
9258 ((Py_UCS2*)buf1) + start, end - start,
9259 buf2, len2, PY_SSIZE_T_MAX
9260 );
9261 break;
9262 case PyUnicode_4BYTE_KIND:
9263 result = ucs4lib_count(
9264 ((Py_UCS4*)buf1) + start, end - start,
9265 buf2, len2, PY_SSIZE_T_MAX
9266 );
9267 break;
9268 default:
9269 assert(0); result = 0;
9270 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009271
9272 Py_DECREF(sub_obj);
9273 Py_DECREF(str_obj);
9274
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009275 if (kind1 != kind)
9276 PyMem_Free(buf1);
9277 if (kind2 != kind)
9278 PyMem_Free(buf2);
9279
Guido van Rossumd57fd912000-03-10 22:53:23 +00009280 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009281 onError:
9282 Py_DECREF(sub_obj);
9283 Py_DECREF(str_obj);
9284 if (kind1 != kind && buf1)
9285 PyMem_Free(buf1);
9286 if (kind2 != kind && buf2)
9287 PyMem_Free(buf2);
9288 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009289}
9290
Alexander Belopolsky40018472011-02-26 01:02:56 +00009291Py_ssize_t
9292PyUnicode_Find(PyObject *str,
9293 PyObject *sub,
9294 Py_ssize_t start,
9295 Py_ssize_t end,
9296 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009297{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009298 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009299
Guido van Rossumd57fd912000-03-10 22:53:23 +00009300 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009301 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009302 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009303 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009304 if (!sub) {
9305 Py_DECREF(str);
9306 return -2;
9307 }
9308 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9309 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009310 Py_DECREF(str);
9311 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009312 }
Tim Petersced69f82003-09-16 20:30:58 +00009313
Victor Stinner794d5672011-10-10 03:21:36 +02009314 result = any_find_slice(direction,
9315 str, sub, start, end
9316 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009317
Guido van Rossumd57fd912000-03-10 22:53:23 +00009318 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009319 Py_DECREF(sub);
9320
Guido van Rossumd57fd912000-03-10 22:53:23 +00009321 return result;
9322}
9323
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009324Py_ssize_t
9325PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9326 Py_ssize_t start, Py_ssize_t end,
9327 int direction)
9328{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009329 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009330 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009331 if (PyUnicode_READY(str) == -1)
9332 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009333 if (start < 0 || end < 0) {
9334 PyErr_SetString(PyExc_IndexError, "string index out of range");
9335 return -2;
9336 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009337 if (end > PyUnicode_GET_LENGTH(str))
9338 end = PyUnicode_GET_LENGTH(str);
9339 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009340 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9341 kind, end-start, ch, direction);
9342 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009343 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009344 else
9345 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009346}
9347
Alexander Belopolsky40018472011-02-26 01:02:56 +00009348static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009349tailmatch(PyObject *self,
9350 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009351 Py_ssize_t start,
9352 Py_ssize_t end,
9353 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009354{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009355 int kind_self;
9356 int kind_sub;
9357 void *data_self;
9358 void *data_sub;
9359 Py_ssize_t offset;
9360 Py_ssize_t i;
9361 Py_ssize_t end_sub;
9362
9363 if (PyUnicode_READY(self) == -1 ||
9364 PyUnicode_READY(substring) == -1)
9365 return 0;
9366
9367 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009368 return 1;
9369
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009370 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9371 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009372 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009373 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009374
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009375 kind_self = PyUnicode_KIND(self);
9376 data_self = PyUnicode_DATA(self);
9377 kind_sub = PyUnicode_KIND(substring);
9378 data_sub = PyUnicode_DATA(substring);
9379 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9380
9381 if (direction > 0)
9382 offset = end;
9383 else
9384 offset = start;
9385
9386 if (PyUnicode_READ(kind_self, data_self, offset) ==
9387 PyUnicode_READ(kind_sub, data_sub, 0) &&
9388 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9389 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9390 /* If both are of the same kind, memcmp is sufficient */
9391 if (kind_self == kind_sub) {
9392 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009393 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009394 data_sub,
9395 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009396 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009397 }
9398 /* otherwise we have to compare each character by first accesing it */
9399 else {
9400 /* We do not need to compare 0 and len(substring)-1 because
9401 the if statement above ensured already that they are equal
9402 when we end up here. */
9403 // TODO: honor direction and do a forward or backwards search
9404 for (i = 1; i < end_sub; ++i) {
9405 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9406 PyUnicode_READ(kind_sub, data_sub, i))
9407 return 0;
9408 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009409 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009410 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009411 }
9412
9413 return 0;
9414}
9415
Alexander Belopolsky40018472011-02-26 01:02:56 +00009416Py_ssize_t
9417PyUnicode_Tailmatch(PyObject *str,
9418 PyObject *substr,
9419 Py_ssize_t start,
9420 Py_ssize_t end,
9421 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009422{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009423 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009424
Guido van Rossumd57fd912000-03-10 22:53:23 +00009425 str = PyUnicode_FromObject(str);
9426 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009427 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009428 substr = PyUnicode_FromObject(substr);
9429 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009430 Py_DECREF(str);
9431 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009432 }
Tim Petersced69f82003-09-16 20:30:58 +00009433
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009434 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009435 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009436 Py_DECREF(str);
9437 Py_DECREF(substr);
9438 return result;
9439}
9440
Guido van Rossumd57fd912000-03-10 22:53:23 +00009441/* Apply fixfct filter to the Unicode object self and return a
9442 reference to the modified object */
9443
Alexander Belopolsky40018472011-02-26 01:02:56 +00009444static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009445fixup(PyObject *self,
9446 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009447{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009448 PyObject *u;
9449 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009450 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009451
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009452 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009453 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009454 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009455 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009456
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009457 /* fix functions return the new maximum character in a string,
9458 if the kind of the resulting unicode object does not change,
9459 everything is fine. Otherwise we need to change the string kind
9460 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009461 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009462
9463 if (maxchar_new == 0) {
9464 /* no changes */;
9465 if (PyUnicode_CheckExact(self)) {
9466 Py_DECREF(u);
9467 Py_INCREF(self);
9468 return self;
9469 }
9470 else
9471 return u;
9472 }
9473
9474 if (maxchar_new <= 127)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009475 maxchar_new = 127;
9476 else if (maxchar_new <= 255)
9477 maxchar_new = 255;
9478 else if (maxchar_new <= 65535)
9479 maxchar_new = 65535;
9480 else
Victor Stinner8faf8212011-12-08 22:14:11 +01009481 maxchar_new = MAX_UNICODE;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009482
Victor Stinnereaab6042011-12-11 22:22:39 +01009483 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009484 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009485
9486 /* In case the maximum character changed, we need to
9487 convert the string to the new category. */
9488 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9489 if (v == NULL) {
9490 Py_DECREF(u);
9491 return NULL;
9492 }
9493 if (maxchar_new > maxchar_old) {
9494 /* If the maxchar increased so that the kind changed, not all
9495 characters are representable anymore and we need to fix the
9496 string again. This only happens in very few cases. */
9497 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
9498 maxchar_old = fixfct(v);
9499 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009500 }
9501 else {
Victor Stinnereaab6042011-12-11 22:22:39 +01009502 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009503 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009504 Py_DECREF(u);
9505 assert(_PyUnicode_CheckConsistency(v, 1));
9506 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009507}
9508
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009509static PyObject *
9510ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009511{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009512 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9513 char *resdata, *data = PyUnicode_DATA(self);
9514 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009515
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009516 res = PyUnicode_New(len, 127);
9517 if (res == NULL)
9518 return NULL;
9519 resdata = PyUnicode_DATA(res);
9520 if (lower)
9521 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009522 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009523 _Py_bytes_upper(resdata, data, len);
9524 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009525}
9526
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009527static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009528handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009529{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009530 Py_ssize_t j;
9531 int final_sigma;
9532 Py_UCS4 c;
9533 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009534
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009535 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9536
9537 where ! is a negation and \p{xxx} is a character with property xxx.
9538 */
9539 for (j = i - 1; j >= 0; j--) {
9540 c = PyUnicode_READ(kind, data, j);
9541 if (!_PyUnicode_IsCaseIgnorable(c))
9542 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009543 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009544 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9545 if (final_sigma) {
9546 for (j = i + 1; j < length; j++) {
9547 c = PyUnicode_READ(kind, data, j);
9548 if (!_PyUnicode_IsCaseIgnorable(c))
9549 break;
9550 }
9551 final_sigma = j == length || !_PyUnicode_IsCased(c);
9552 }
9553 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009554}
9555
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009556static int
9557lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9558 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009559{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009560 /* Obscure special case. */
9561 if (c == 0x3A3) {
9562 mapped[0] = handle_capital_sigma(kind, data, length, i);
9563 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009564 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009565 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009566}
9567
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009568static Py_ssize_t
9569do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009570{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009571 Py_ssize_t i, k = 0;
9572 int n_res, j;
9573 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009574
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009575 c = PyUnicode_READ(kind, data, 0);
9576 n_res = _PyUnicode_ToUpperFull(c, mapped);
9577 for (j = 0; j < n_res; j++) {
9578 if (mapped[j] > *maxchar)
9579 *maxchar = mapped[j];
9580 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009581 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009582 for (i = 1; i < length; i++) {
9583 c = PyUnicode_READ(kind, data, i);
9584 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9585 for (j = 0; j < n_res; j++) {
9586 if (mapped[j] > *maxchar)
9587 *maxchar = mapped[j];
9588 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009589 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009590 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009591 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009592}
9593
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009594static Py_ssize_t
9595do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9596 Py_ssize_t i, k = 0;
9597
9598 for (i = 0; i < length; i++) {
9599 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9600 int n_res, j;
9601 if (Py_UNICODE_ISUPPER(c)) {
9602 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9603 }
9604 else if (Py_UNICODE_ISLOWER(c)) {
9605 n_res = _PyUnicode_ToUpperFull(c, mapped);
9606 }
9607 else {
9608 n_res = 1;
9609 mapped[0] = c;
9610 }
9611 for (j = 0; j < n_res; j++) {
9612 if (mapped[j] > *maxchar)
9613 *maxchar = mapped[j];
9614 res[k++] = mapped[j];
9615 }
9616 }
9617 return k;
9618}
9619
9620static Py_ssize_t
9621do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9622 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009623{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009624 Py_ssize_t i, k = 0;
9625
9626 for (i = 0; i < length; i++) {
9627 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9628 int n_res, j;
9629 if (lower)
9630 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9631 else
9632 n_res = _PyUnicode_ToUpperFull(c, mapped);
9633 for (j = 0; j < n_res; j++) {
9634 if (mapped[j] > *maxchar)
9635 *maxchar = mapped[j];
9636 res[k++] = mapped[j];
9637 }
9638 }
9639 return k;
9640}
9641
9642static Py_ssize_t
9643do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9644{
9645 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9646}
9647
9648static Py_ssize_t
9649do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9650{
9651 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9652}
9653
Benjamin Petersone51757f2012-01-12 21:10:29 -05009654static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009655do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9656{
9657 Py_ssize_t i, k = 0;
9658
9659 for (i = 0; i < length; i++) {
9660 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9661 Py_UCS4 mapped[3];
9662 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9663 for (j = 0; j < n_res; j++) {
9664 if (mapped[j] > *maxchar)
9665 *maxchar = mapped[j];
9666 res[k++] = mapped[j];
9667 }
9668 }
9669 return k;
9670}
9671
9672static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009673do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9674{
9675 Py_ssize_t i, k = 0;
9676 int previous_is_cased;
9677
9678 previous_is_cased = 0;
9679 for (i = 0; i < length; i++) {
9680 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9681 Py_UCS4 mapped[3];
9682 int n_res, j;
9683
9684 if (previous_is_cased)
9685 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9686 else
9687 n_res = _PyUnicode_ToTitleFull(c, mapped);
9688
9689 for (j = 0; j < n_res; j++) {
9690 if (mapped[j] > *maxchar)
9691 *maxchar = mapped[j];
9692 res[k++] = mapped[j];
9693 }
9694
9695 previous_is_cased = _PyUnicode_IsCased(c);
9696 }
9697 return k;
9698}
9699
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009700static PyObject *
9701case_operation(PyObject *self,
9702 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9703{
9704 PyObject *res = NULL;
9705 Py_ssize_t length, newlength = 0;
9706 int kind, outkind;
9707 void *data, *outdata;
9708 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9709
Benjamin Petersoneea48462012-01-16 14:28:50 -05009710 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009711
9712 kind = PyUnicode_KIND(self);
9713 data = PyUnicode_DATA(self);
9714 length = PyUnicode_GET_LENGTH(self);
9715 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9716 if (tmp == NULL)
9717 return PyErr_NoMemory();
9718 newlength = perform(kind, data, length, tmp, &maxchar);
9719 res = PyUnicode_New(newlength, maxchar);
9720 if (res == NULL)
9721 goto leave;
9722 tmpend = tmp + newlength;
9723 outdata = PyUnicode_DATA(res);
9724 outkind = PyUnicode_KIND(res);
9725 switch (outkind) {
9726 case PyUnicode_1BYTE_KIND:
9727 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9728 break;
9729 case PyUnicode_2BYTE_KIND:
9730 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9731 break;
9732 case PyUnicode_4BYTE_KIND:
9733 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9734 break;
9735 default:
9736 assert(0);
9737 break;
9738 }
9739 leave:
9740 PyMem_FREE(tmp);
9741 return res;
9742}
9743
Tim Peters8ce9f162004-08-27 01:49:32 +00009744PyObject *
9745PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009746{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009747 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009748 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009749 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009750 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009751 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9752 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009753 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009754 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009755 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009756 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009757 int use_memcpy;
9758 unsigned char *res_data = NULL, *sep_data = NULL;
9759 PyObject *last_obj;
9760 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009761
Tim Peters05eba1f2004-08-27 21:32:02 +00009762 fseq = PySequence_Fast(seq, "");
9763 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009764 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009765 }
9766
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009767 /* NOTE: the following code can't call back into Python code,
9768 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009769 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009770
Tim Peters05eba1f2004-08-27 21:32:02 +00009771 seqlen = PySequence_Fast_GET_SIZE(fseq);
9772 /* If empty sequence, return u"". */
9773 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009774 Py_DECREF(fseq);
9775 Py_INCREF(unicode_empty);
9776 res = unicode_empty;
9777 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009778 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009779
Tim Peters05eba1f2004-08-27 21:32:02 +00009780 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009781 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009782 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009783 if (seqlen == 1) {
9784 if (PyUnicode_CheckExact(items[0])) {
9785 res = items[0];
9786 Py_INCREF(res);
9787 Py_DECREF(fseq);
9788 return res;
9789 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009790 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009791 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009792 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009793 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009794 /* Set up sep and seplen */
9795 if (separator == NULL) {
9796 /* fall back to a blank space separator */
9797 sep = PyUnicode_FromOrdinal(' ');
9798 if (!sep)
9799 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009800 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009801 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009802 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009803 else {
9804 if (!PyUnicode_Check(separator)) {
9805 PyErr_Format(PyExc_TypeError,
9806 "separator: expected str instance,"
9807 " %.80s found",
9808 Py_TYPE(separator)->tp_name);
9809 goto onError;
9810 }
9811 if (PyUnicode_READY(separator))
9812 goto onError;
9813 sep = separator;
9814 seplen = PyUnicode_GET_LENGTH(separator);
9815 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9816 /* inc refcount to keep this code path symmetric with the
9817 above case of a blank separator */
9818 Py_INCREF(sep);
9819 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009820 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009821 }
9822
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009823 /* There are at least two things to join, or else we have a subclass
9824 * of str in the sequence.
9825 * Do a pre-pass to figure out the total amount of space we'll
9826 * need (sz), and see whether all argument are strings.
9827 */
9828 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009829#ifdef Py_DEBUG
9830 use_memcpy = 0;
9831#else
9832 use_memcpy = 1;
9833#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009834 for (i = 0; i < seqlen; i++) {
9835 const Py_ssize_t old_sz = sz;
9836 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009837 if (!PyUnicode_Check(item)) {
9838 PyErr_Format(PyExc_TypeError,
9839 "sequence item %zd: expected str instance,"
9840 " %.80s found",
9841 i, Py_TYPE(item)->tp_name);
9842 goto onError;
9843 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009844 if (PyUnicode_READY(item) == -1)
9845 goto onError;
9846 sz += PyUnicode_GET_LENGTH(item);
9847 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009848 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009849 if (i != 0)
9850 sz += seplen;
9851 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9852 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009853 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009854 goto onError;
9855 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009856 if (use_memcpy && last_obj != NULL) {
9857 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9858 use_memcpy = 0;
9859 }
9860 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009861 }
Tim Petersced69f82003-09-16 20:30:58 +00009862
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009863 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009864 if (res == NULL)
9865 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009866
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009867 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009868#ifdef Py_DEBUG
9869 use_memcpy = 0;
9870#else
9871 if (use_memcpy) {
9872 res_data = PyUnicode_1BYTE_DATA(res);
9873 kind = PyUnicode_KIND(res);
9874 if (seplen != 0)
9875 sep_data = PyUnicode_1BYTE_DATA(sep);
9876 }
9877#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009878 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009879 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009880 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009881 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009882 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009883 if (use_memcpy) {
9884 Py_MEMCPY(res_data,
9885 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009886 kind * seplen);
9887 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009888 }
9889 else {
9890 copy_characters(res, res_offset, sep, 0, seplen);
9891 res_offset += seplen;
9892 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009893 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009894 itemlen = PyUnicode_GET_LENGTH(item);
9895 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009896 if (use_memcpy) {
9897 Py_MEMCPY(res_data,
9898 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009899 kind * itemlen);
9900 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009901 }
9902 else {
9903 copy_characters(res, res_offset, item, 0, itemlen);
9904 res_offset += itemlen;
9905 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009906 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009907 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009908 if (use_memcpy)
9909 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009910 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009911 else
9912 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009913
Tim Peters05eba1f2004-08-27 21:32:02 +00009914 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009915 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009916 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009917 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009918
Benjamin Peterson29060642009-01-31 22:14:21 +00009919 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009920 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009921 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009922 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009923 return NULL;
9924}
9925
/* Store value into length consecutive characters of data, beginning at
   character index start; kind selects the element width of data.

   Every argument is parenthesized where it is used, so expressions may be
   passed safely.  Note that value and length can still be evaluated more
   than once, so they should be free of side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9949
Victor Stinner3fe55312012-01-04 00:33:50 +01009950Py_ssize_t
9951PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9952 Py_UCS4 fill_char)
9953{
9954 Py_ssize_t maxlen;
9955 enum PyUnicode_Kind kind;
9956 void *data;
9957
9958 if (!PyUnicode_Check(unicode)) {
9959 PyErr_BadInternalCall();
9960 return -1;
9961 }
9962 if (PyUnicode_READY(unicode) == -1)
9963 return -1;
9964 if (unicode_check_modifiable(unicode))
9965 return -1;
9966
9967 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9968 PyErr_SetString(PyExc_ValueError,
9969 "fill character is bigger than "
9970 "the string maximum character");
9971 return -1;
9972 }
9973
9974 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9975 length = Py_MIN(maxlen, length);
9976 if (length <= 0)
9977 return 0;
9978
9979 kind = PyUnicode_KIND(unicode);
9980 data = PyUnicode_DATA(unicode);
9981 FILL(kind, data, fill_char, start, length);
9982 return length;
9983}
9984
Victor Stinner9310abb2011-10-05 00:59:23 +02009985static PyObject *
9986pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009987 Py_ssize_t left,
9988 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009989 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009990{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009991 PyObject *u;
9992 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009993 int kind;
9994 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009995
9996 if (left < 0)
9997 left = 0;
9998 if (right < 0)
9999 right = 0;
10000
Victor Stinnerc4b49542011-12-11 22:44:26 +010010001 if (left == 0 && right == 0)
10002 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010003
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010004 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10005 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010006 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10007 return NULL;
10008 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010009 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10010 if (fill > maxchar)
10011 maxchar = fill;
10012 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010013 if (!u)
10014 return NULL;
10015
10016 kind = PyUnicode_KIND(u);
10017 data = PyUnicode_DATA(u);
10018 if (left)
10019 FILL(kind, data, fill, 0, left);
10020 if (right)
10021 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010022 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010023 assert(_PyUnicode_CheckConsistency(u, 1));
10024 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010025}
10026
Alexander Belopolsky40018472011-02-26 01:02:56 +000010027PyObject *
10028PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010029{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010030 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010031
10032 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010033 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010034 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060010035 if (PyUnicode_READY(string) == -1) {
10036 Py_DECREF(string);
10037 return NULL;
10038 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010039
Benjamin Petersonead6b532011-12-20 17:23:42 -060010040 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010041 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010042 if (PyUnicode_IS_ASCII(string))
10043 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010044 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010045 PyUnicode_GET_LENGTH(string), keepends);
10046 else
10047 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010048 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010049 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010050 break;
10051 case PyUnicode_2BYTE_KIND:
10052 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010053 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010054 PyUnicode_GET_LENGTH(string), keepends);
10055 break;
10056 case PyUnicode_4BYTE_KIND:
10057 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010058 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010059 PyUnicode_GET_LENGTH(string), keepends);
10060 break;
10061 default:
10062 assert(0);
10063 list = 0;
10064 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010065 Py_DECREF(string);
10066 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010067}
10068
Alexander Belopolsky40018472011-02-26 01:02:56 +000010069static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010070split(PyObject *self,
10071 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010072 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010073{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010074 int kind1, kind2, kind;
10075 void *buf1, *buf2;
10076 Py_ssize_t len1, len2;
10077 PyObject* out;
10078
Guido van Rossumd57fd912000-03-10 22:53:23 +000010079 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010080 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010081
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010082 if (PyUnicode_READY(self) == -1)
10083 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010084
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010085 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010086 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010087 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010088 if (PyUnicode_IS_ASCII(self))
10089 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010090 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010091 PyUnicode_GET_LENGTH(self), maxcount
10092 );
10093 else
10094 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010095 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010096 PyUnicode_GET_LENGTH(self), maxcount
10097 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010098 case PyUnicode_2BYTE_KIND:
10099 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010100 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010101 PyUnicode_GET_LENGTH(self), maxcount
10102 );
10103 case PyUnicode_4BYTE_KIND:
10104 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010105 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010106 PyUnicode_GET_LENGTH(self), maxcount
10107 );
10108 default:
10109 assert(0);
10110 return NULL;
10111 }
10112
10113 if (PyUnicode_READY(substring) == -1)
10114 return NULL;
10115
10116 kind1 = PyUnicode_KIND(self);
10117 kind2 = PyUnicode_KIND(substring);
10118 kind = kind1 > kind2 ? kind1 : kind2;
10119 buf1 = PyUnicode_DATA(self);
10120 buf2 = PyUnicode_DATA(substring);
10121 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010122 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010123 if (!buf1)
10124 return NULL;
10125 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010126 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010127 if (!buf2) {
10128 if (kind1 != kind) PyMem_Free(buf1);
10129 return NULL;
10130 }
10131 len1 = PyUnicode_GET_LENGTH(self);
10132 len2 = PyUnicode_GET_LENGTH(substring);
10133
Benjamin Petersonead6b532011-12-20 17:23:42 -060010134 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010135 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010136 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10137 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010138 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010139 else
10140 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010141 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010142 break;
10143 case PyUnicode_2BYTE_KIND:
10144 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010145 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010146 break;
10147 case PyUnicode_4BYTE_KIND:
10148 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010149 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010150 break;
10151 default:
10152 out = NULL;
10153 }
10154 if (kind1 != kind)
10155 PyMem_Free(buf1);
10156 if (kind2 != kind)
10157 PyMem_Free(buf2);
10158 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010159}
10160
Alexander Belopolsky40018472011-02-26 01:02:56 +000010161static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010162rsplit(PyObject *self,
10163 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010164 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010165{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010166 int kind1, kind2, kind;
10167 void *buf1, *buf2;
10168 Py_ssize_t len1, len2;
10169 PyObject* out;
10170
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010171 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010172 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010173
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010174 if (PyUnicode_READY(self) == -1)
10175 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010176
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010177 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010178 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010179 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010180 if (PyUnicode_IS_ASCII(self))
10181 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010182 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010183 PyUnicode_GET_LENGTH(self), maxcount
10184 );
10185 else
10186 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010187 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010188 PyUnicode_GET_LENGTH(self), maxcount
10189 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010190 case PyUnicode_2BYTE_KIND:
10191 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010192 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010193 PyUnicode_GET_LENGTH(self), maxcount
10194 );
10195 case PyUnicode_4BYTE_KIND:
10196 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010197 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010198 PyUnicode_GET_LENGTH(self), maxcount
10199 );
10200 default:
10201 assert(0);
10202 return NULL;
10203 }
10204
10205 if (PyUnicode_READY(substring) == -1)
10206 return NULL;
10207
10208 kind1 = PyUnicode_KIND(self);
10209 kind2 = PyUnicode_KIND(substring);
10210 kind = kind1 > kind2 ? kind1 : kind2;
10211 buf1 = PyUnicode_DATA(self);
10212 buf2 = PyUnicode_DATA(substring);
10213 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010214 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010215 if (!buf1)
10216 return NULL;
10217 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010218 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010219 if (!buf2) {
10220 if (kind1 != kind) PyMem_Free(buf1);
10221 return NULL;
10222 }
10223 len1 = PyUnicode_GET_LENGTH(self);
10224 len2 = PyUnicode_GET_LENGTH(substring);
10225
Benjamin Petersonead6b532011-12-20 17:23:42 -060010226 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010227 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010228 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10229 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010230 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010231 else
10232 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010233 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010234 break;
10235 case PyUnicode_2BYTE_KIND:
10236 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010237 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010238 break;
10239 case PyUnicode_4BYTE_KIND:
10240 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010241 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010242 break;
10243 default:
10244 out = NULL;
10245 }
10246 if (kind1 != kind)
10247 PyMem_Free(buf1);
10248 if (kind2 != kind)
10249 PyMem_Free(buf2);
10250 return out;
10251}
10252
10253static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010254anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10255 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010256{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010257 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010258 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010259 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10260 return asciilib_find(buf1, len1, buf2, len2, offset);
10261 else
10262 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010263 case PyUnicode_2BYTE_KIND:
10264 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10265 case PyUnicode_4BYTE_KIND:
10266 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10267 }
10268 assert(0);
10269 return -1;
10270}
10271
10272static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010273anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10274 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010275{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010276 switch (kind) {
10277 case PyUnicode_1BYTE_KIND:
10278 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10279 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10280 else
10281 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10282 case PyUnicode_2BYTE_KIND:
10283 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10284 case PyUnicode_4BYTE_KIND:
10285 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10286 }
10287 assert(0);
10288 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010289}
10290
Alexander Belopolsky40018472011-02-26 01:02:56 +000010291static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010292replace(PyObject *self, PyObject *str1,
10293 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010294{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010295 PyObject *u;
10296 char *sbuf = PyUnicode_DATA(self);
10297 char *buf1 = PyUnicode_DATA(str1);
10298 char *buf2 = PyUnicode_DATA(str2);
10299 int srelease = 0, release1 = 0, release2 = 0;
10300 int skind = PyUnicode_KIND(self);
10301 int kind1 = PyUnicode_KIND(str1);
10302 int kind2 = PyUnicode_KIND(str2);
10303 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10304 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10305 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010306 int mayshrink;
10307 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010308
10309 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010310 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010311 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010312 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010313
Victor Stinner59de0ee2011-10-07 10:01:28 +020010314 if (str1 == str2)
10315 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010316 if (skind < kind1)
10317 /* substring too wide to be present */
10318 goto nothing;
10319
Victor Stinner49a0a212011-10-12 23:46:10 +020010320 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10321 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10322 /* Replacing str1 with str2 may cause a maxchar reduction in the
10323 result string. */
10324 mayshrink = (maxchar_str2 < maxchar);
10325 maxchar = Py_MAX(maxchar, maxchar_str2);
10326
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010327 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010328 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010329 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010330 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010331 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010332 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010333 Py_UCS4 u1, u2;
10334 int rkind;
Victor Stinnerf6441102011-12-18 02:43:08 +010010335 Py_ssize_t index, pos;
10336 char *src;
10337
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010338 u1 = PyUnicode_READ_CHAR(str1, 0);
Victor Stinnerf6441102011-12-18 02:43:08 +010010339 pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
10340 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010341 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010342 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010343 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010344 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010345 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010346 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010347 rkind = PyUnicode_KIND(u);
Victor Stinnerf6441102011-12-18 02:43:08 +010010348
10349 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
10350 index = 0;
10351 src = sbuf;
10352 while (--maxcount)
10353 {
10354 pos++;
10355 src += pos * PyUnicode_KIND(self);
10356 slen -= pos;
10357 index += pos;
10358 pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
10359 if (pos < 0)
10360 break;
10361 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
10362 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010363 }
10364 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010365 int rkind = skind;
10366 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010367 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010368
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010369 if (kind1 < rkind) {
10370 /* widen substring */
10371 buf1 = _PyUnicode_AsKind(str1, rkind);
10372 if (!buf1) goto error;
10373 release1 = 1;
10374 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010375 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010376 if (i < 0)
10377 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010378 if (rkind > kind2) {
10379 /* widen replacement */
10380 buf2 = _PyUnicode_AsKind(str2, rkind);
10381 if (!buf2) goto error;
10382 release2 = 1;
10383 }
10384 else if (rkind < kind2) {
10385 /* widen self and buf1 */
10386 rkind = kind2;
10387 if (release1) PyMem_Free(buf1);
10388 sbuf = _PyUnicode_AsKind(self, rkind);
10389 if (!sbuf) goto error;
10390 srelease = 1;
10391 buf1 = _PyUnicode_AsKind(str1, rkind);
10392 if (!buf1) goto error;
10393 release1 = 1;
10394 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010395 u = PyUnicode_New(slen, maxchar);
10396 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010397 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010398 assert(PyUnicode_KIND(u) == rkind);
10399 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010400
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010401 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010402 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010403 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010404 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010405 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010406 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010407
10408 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010409 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010410 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010411 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010412 if (i == -1)
10413 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010414 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010415 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010416 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010417 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010418 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010419 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010420 }
10421 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010422 Py_ssize_t n, i, j, ires;
10423 Py_ssize_t product, new_size;
10424 int rkind = skind;
10425 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010426
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010427 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010428 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010429 buf1 = _PyUnicode_AsKind(str1, rkind);
10430 if (!buf1) goto error;
10431 release1 = 1;
10432 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010433 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010434 if (n == 0)
10435 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010436 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010437 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010438 buf2 = _PyUnicode_AsKind(str2, rkind);
10439 if (!buf2) goto error;
10440 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010441 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010442 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010443 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010444 rkind = kind2;
10445 sbuf = _PyUnicode_AsKind(self, rkind);
10446 if (!sbuf) goto error;
10447 srelease = 1;
10448 if (release1) PyMem_Free(buf1);
10449 buf1 = _PyUnicode_AsKind(str1, rkind);
10450 if (!buf1) goto error;
10451 release1 = 1;
10452 }
10453 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10454 PyUnicode_GET_LENGTH(str1))); */
10455 product = n * (len2-len1);
10456 if ((product / (len2-len1)) != n) {
10457 PyErr_SetString(PyExc_OverflowError,
10458 "replace string is too long");
10459 goto error;
10460 }
10461 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010462 if (new_size == 0) {
10463 Py_INCREF(unicode_empty);
10464 u = unicode_empty;
10465 goto done;
10466 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010467 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10468 PyErr_SetString(PyExc_OverflowError,
10469 "replace string is too long");
10470 goto error;
10471 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010472 u = PyUnicode_New(new_size, maxchar);
10473 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010474 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010475 assert(PyUnicode_KIND(u) == rkind);
10476 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010477 ires = i = 0;
10478 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010479 while (n-- > 0) {
10480 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010481 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010482 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010483 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010484 if (j == -1)
10485 break;
10486 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010487 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010488 memcpy(res + rkind * ires,
10489 sbuf + rkind * i,
10490 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010491 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010492 }
10493 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010494 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010495 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010496 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010497 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010498 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010499 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010500 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010501 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010502 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010503 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010504 memcpy(res + rkind * ires,
10505 sbuf + rkind * i,
10506 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010507 }
10508 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010509 /* interleave */
10510 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010511 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010512 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010513 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010514 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010515 if (--n <= 0)
10516 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010517 memcpy(res + rkind * ires,
10518 sbuf + rkind * i,
10519 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010520 ires++;
10521 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010522 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010523 memcpy(res + rkind * ires,
10524 sbuf + rkind * i,
10525 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010526 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010527 }
10528
10529 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010530 unicode_adjust_maxchar(&u);
10531 if (u == NULL)
10532 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010533 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010534
10535 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010536 if (srelease)
10537 PyMem_FREE(sbuf);
10538 if (release1)
10539 PyMem_FREE(buf1);
10540 if (release2)
10541 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010542 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010543 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010544
Benjamin Peterson29060642009-01-31 22:14:21 +000010545 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010546 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010547 if (srelease)
10548 PyMem_FREE(sbuf);
10549 if (release1)
10550 PyMem_FREE(buf1);
10551 if (release2)
10552 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010553 return unicode_result_unchanged(self);
10554
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010555 error:
10556 if (srelease && sbuf)
10557 PyMem_FREE(sbuf);
10558 if (release1 && buf1)
10559 PyMem_FREE(buf1);
10560 if (release2 && buf2)
10561 PyMem_FREE(buf2);
10562 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010563}
10564
10565/* --- Unicode Object Methods --------------------------------------------- */
10566
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010567PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010568 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010569\n\
10570Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010571characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010572
10573static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010574unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010575{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010576 if (PyUnicode_READY(self) == -1)
10577 return NULL;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010578 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010579}
10580
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010581PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010582 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010583\n\
10584Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010585have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010586
10587static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010588unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010589{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010590 if (PyUnicode_READY(self) == -1)
10591 return NULL;
10592 if (PyUnicode_GET_LENGTH(self) == 0)
10593 return unicode_result_unchanged(self);
10594 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010595}
10596
Benjamin Petersond5890c82012-01-14 13:23:30 -050010597PyDoc_STRVAR(casefold__doc__,
10598 "S.casefold() -> str\n\
10599\n\
10600Return a version of S suitable for caseless comparisons.");
10601
10602static PyObject *
10603unicode_casefold(PyObject *self)
10604{
10605 if (PyUnicode_READY(self) == -1)
10606 return NULL;
10607 if (PyUnicode_IS_ASCII(self))
10608 return ascii_upper_or_lower(self, 1);
10609 return case_operation(self, do_casefold);
10610}
10611
10612
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010613/* Argument converter. Coerces to a single unicode character */
10614
10615static int
10616convert_uc(PyObject *obj, void *addr)
10617{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010618 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010619 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010620
Benjamin Peterson14339b62009-01-31 16:36:08 +000010621 uniobj = PyUnicode_FromObject(obj);
10622 if (uniobj == NULL) {
10623 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010624 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010625 return 0;
10626 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010627 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010628 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010629 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010630 Py_DECREF(uniobj);
10631 return 0;
10632 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010633 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010634 Py_DECREF(uniobj);
10635 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010636}
10637
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010638PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010639 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010640\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010641Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010642done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010643
10644static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010645unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010646{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010647 Py_ssize_t marg, left;
10648 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010649 Py_UCS4 fillchar = ' ';
10650
Victor Stinnere9a29352011-10-01 02:14:59 +020010651 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010652 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010653
Benjamin Petersonbac79492012-01-14 13:34:47 -050010654 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010655 return NULL;
10656
Victor Stinnerc4b49542011-12-11 22:44:26 +010010657 if (PyUnicode_GET_LENGTH(self) >= width)
10658 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010659
Victor Stinnerc4b49542011-12-11 22:44:26 +010010660 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010661 left = marg / 2 + (marg & width & 1);
10662
Victor Stinner9310abb2011-10-05 00:59:23 +020010663 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010664}
10665
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010666/* This function assumes that str1 and str2 are readied by the caller. */
10667
Marc-André Lemburge5034372000-08-08 08:04:29 +000010668static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010669unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010670{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010671 int kind1, kind2;
10672 void *data1, *data2;
10673 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010674
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010675 kind1 = PyUnicode_KIND(str1);
10676 kind2 = PyUnicode_KIND(str2);
10677 data1 = PyUnicode_DATA(str1);
10678 data2 = PyUnicode_DATA(str2);
10679 len1 = PyUnicode_GET_LENGTH(str1);
10680 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010681
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010682 for (i = 0; i < len1 && i < len2; ++i) {
10683 Py_UCS4 c1, c2;
10684 c1 = PyUnicode_READ(kind1, data1, i);
10685 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010686
10687 if (c1 != c2)
10688 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010689 }
10690
10691 return (len1 < len2) ? -1 : (len1 != len2);
10692}
10693
Alexander Belopolsky40018472011-02-26 01:02:56 +000010694int
10695PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010696{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010697 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10698 if (PyUnicode_READY(left) == -1 ||
10699 PyUnicode_READY(right) == -1)
10700 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010701 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010702 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010703 PyErr_Format(PyExc_TypeError,
10704 "Can't compare %.100s and %.100s",
10705 left->ob_type->tp_name,
10706 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010707 return -1;
10708}
10709
Martin v. Löwis5b222132007-06-10 09:51:05 +000010710int
10711PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10712{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010713 Py_ssize_t i;
10714 int kind;
10715 void *data;
10716 Py_UCS4 chr;
10717
Victor Stinner910337b2011-10-03 03:20:16 +020010718 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010719 if (PyUnicode_READY(uni) == -1)
10720 return -1;
10721 kind = PyUnicode_KIND(uni);
10722 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010723 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010724 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10725 if (chr != str[i])
10726 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010727 /* This check keeps Python strings that end in '\0' from comparing equal
10728 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010729 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010730 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010731 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010732 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010733 return 0;
10734}
10735
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010736
Benjamin Peterson29060642009-01-31 22:14:21 +000010737#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010738 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010739
Alexander Belopolsky40018472011-02-26 01:02:56 +000010740PyObject *
10741PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010742{
10743 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010744
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010745 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10746 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010747 if (PyUnicode_READY(left) == -1 ||
10748 PyUnicode_READY(right) == -1)
10749 return NULL;
10750 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10751 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010752 if (op == Py_EQ) {
10753 Py_INCREF(Py_False);
10754 return Py_False;
10755 }
10756 if (op == Py_NE) {
10757 Py_INCREF(Py_True);
10758 return Py_True;
10759 }
10760 }
10761 if (left == right)
10762 result = 0;
10763 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010764 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010765
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010766 /* Convert the return value to a Boolean */
10767 switch (op) {
10768 case Py_EQ:
10769 v = TEST_COND(result == 0);
10770 break;
10771 case Py_NE:
10772 v = TEST_COND(result != 0);
10773 break;
10774 case Py_LE:
10775 v = TEST_COND(result <= 0);
10776 break;
10777 case Py_GE:
10778 v = TEST_COND(result >= 0);
10779 break;
10780 case Py_LT:
10781 v = TEST_COND(result == -1);
10782 break;
10783 case Py_GT:
10784 v = TEST_COND(result == 1);
10785 break;
10786 default:
10787 PyErr_BadArgument();
10788 return NULL;
10789 }
10790 Py_INCREF(v);
10791 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010792 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010793
Brian Curtindfc80e32011-08-10 20:28:54 -050010794 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010795}
10796
Alexander Belopolsky40018472011-02-26 01:02:56 +000010797int
10798PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010799{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010800 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010801 int kind1, kind2, kind;
10802 void *buf1, *buf2;
10803 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010804 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010805
10806 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010807 sub = PyUnicode_FromObject(element);
10808 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010809 PyErr_Format(PyExc_TypeError,
10810 "'in <string>' requires string as left operand, not %s",
10811 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010812 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010813 }
10814
Thomas Wouters477c8d52006-05-27 19:21:47 +000010815 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010816 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010817 Py_DECREF(sub);
10818 return -1;
10819 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060010820 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
10821 Py_DECREF(sub);
10822 Py_DECREF(str);
10823 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010824
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010825 kind1 = PyUnicode_KIND(str);
10826 kind2 = PyUnicode_KIND(sub);
10827 kind = kind1 > kind2 ? kind1 : kind2;
10828 buf1 = PyUnicode_DATA(str);
10829 buf2 = PyUnicode_DATA(sub);
10830 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010831 buf1 = _PyUnicode_AsKind(str, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010832 if (!buf1) {
10833 Py_DECREF(sub);
10834 return -1;
10835 }
10836 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010837 buf2 = _PyUnicode_AsKind(sub, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010838 if (!buf2) {
10839 Py_DECREF(sub);
10840 if (kind1 != kind) PyMem_Free(buf1);
10841 return -1;
10842 }
10843 len1 = PyUnicode_GET_LENGTH(str);
10844 len2 = PyUnicode_GET_LENGTH(sub);
10845
Benjamin Petersonead6b532011-12-20 17:23:42 -060010846 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010847 case PyUnicode_1BYTE_KIND:
10848 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10849 break;
10850 case PyUnicode_2BYTE_KIND:
10851 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10852 break;
10853 case PyUnicode_4BYTE_KIND:
10854 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10855 break;
10856 default:
10857 result = -1;
10858 assert(0);
10859 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010860
10861 Py_DECREF(str);
10862 Py_DECREF(sub);
10863
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010864 if (kind1 != kind)
10865 PyMem_Free(buf1);
10866 if (kind2 != kind)
10867 PyMem_Free(buf2);
10868
Guido van Rossum403d68b2000-03-13 15:55:09 +000010869 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010870}
10871
Guido van Rossumd57fd912000-03-10 22:53:23 +000010872/* Concat to string or Unicode object giving a new Unicode object. */
10873
Alexander Belopolsky40018472011-02-26 01:02:56 +000010874PyObject *
10875PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010876{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010877 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010878 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010879 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010880
10881 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010882 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010883 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010884 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010885 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010886 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010887 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010888
10889 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010890 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010891 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010892 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010893 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010894 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010895 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010896 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010897 }
10898
Victor Stinner488fa492011-12-12 00:01:39 +010010899 u_len = PyUnicode_GET_LENGTH(u);
10900 v_len = PyUnicode_GET_LENGTH(v);
10901 if (u_len > PY_SSIZE_T_MAX - v_len) {
10902 PyErr_SetString(PyExc_OverflowError,
10903 "strings are too large to concat");
10904 goto onError;
10905 }
10906 new_len = u_len + v_len;
10907
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010908 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010909 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
10910 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010911
Guido van Rossumd57fd912000-03-10 22:53:23 +000010912 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010010913 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010914 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010915 goto onError;
Victor Stinner488fa492011-12-12 00:01:39 +010010916 copy_characters(w, 0, u, 0, u_len);
10917 copy_characters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010918 Py_DECREF(u);
10919 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010920 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010921 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010922
Benjamin Peterson29060642009-01-31 22:14:21 +000010923 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010924 Py_XDECREF(u);
10925 Py_XDECREF(v);
10926 return NULL;
10927}
10928
Walter Dörwald1ab83302007-05-18 17:15:44 +000010929void
Victor Stinner23e56682011-10-03 03:54:37 +020010930PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010931{
Victor Stinner23e56682011-10-03 03:54:37 +020010932 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010010933 Py_UCS4 maxchar, maxchar2;
10934 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020010935
10936 if (p_left == NULL) {
10937 if (!PyErr_Occurred())
10938 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010939 return;
10940 }
Victor Stinner23e56682011-10-03 03:54:37 +020010941 left = *p_left;
10942 if (right == NULL || !PyUnicode_Check(left)) {
10943 if (!PyErr_Occurred())
10944 PyErr_BadInternalCall();
10945 goto error;
10946 }
10947
Benjamin Petersonbac79492012-01-14 13:34:47 -050010948 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010949 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050010950 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010951 goto error;
10952
Victor Stinner488fa492011-12-12 00:01:39 +010010953 /* Shortcuts */
10954 if (left == unicode_empty) {
10955 Py_DECREF(left);
10956 Py_INCREF(right);
10957 *p_left = right;
10958 return;
10959 }
10960 if (right == unicode_empty)
10961 return;
10962
10963 left_len = PyUnicode_GET_LENGTH(left);
10964 right_len = PyUnicode_GET_LENGTH(right);
10965 if (left_len > PY_SSIZE_T_MAX - right_len) {
10966 PyErr_SetString(PyExc_OverflowError,
10967 "strings are too large to concat");
10968 goto error;
10969 }
10970 new_len = left_len + right_len;
10971
10972 if (unicode_modifiable(left)
10973 && PyUnicode_CheckExact(right)
10974 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020010975 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10976 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010977 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010978 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010010979 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10980 {
10981 /* append inplace */
10982 if (unicode_resize(p_left, new_len) != 0) {
10983 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10984 * deallocated so it cannot be put back into
10985 * 'variable'. The MemoryError is raised when there
10986 * is no value in 'variable', which might (very
10987 * remotely) be a cause of incompatibilities.
10988 */
10989 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020010990 }
Victor Stinner488fa492011-12-12 00:01:39 +010010991 /* copy 'right' into the newly allocated area of 'left' */
10992 copy_characters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020010993 }
Victor Stinner488fa492011-12-12 00:01:39 +010010994 else {
10995 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10996 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
10997 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020010998
Victor Stinner488fa492011-12-12 00:01:39 +010010999 /* Concat the two Unicode strings */
11000 res = PyUnicode_New(new_len, maxchar);
11001 if (res == NULL)
11002 goto error;
11003 copy_characters(res, 0, left, 0, left_len);
11004 copy_characters(res, left_len, right, 0, right_len);
11005 Py_DECREF(left);
11006 *p_left = res;
11007 }
11008 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011009 return;
11010
11011error:
Victor Stinner488fa492011-12-12 00:01:39 +010011012 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011013}
11014
11015void
11016PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11017{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011018 PyUnicode_Append(pleft, right);
11019 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011020}
11021
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011022PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011023 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011024\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011025Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011026string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011027interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011028
11029static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011030unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011031{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011032 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011033 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011034 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011035 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011036 int kind1, kind2, kind;
11037 void *buf1, *buf2;
11038 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011039
Jesus Ceaac451502011-04-20 17:09:23 +020011040 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
11041 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011042 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011043
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011044 kind1 = PyUnicode_KIND(self);
11045 kind2 = PyUnicode_KIND(substring);
11046 kind = kind1 > kind2 ? kind1 : kind2;
11047 buf1 = PyUnicode_DATA(self);
11048 buf2 = PyUnicode_DATA(substring);
11049 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010011050 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011051 if (!buf1) {
11052 Py_DECREF(substring);
11053 return NULL;
11054 }
11055 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010011056 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011057 if (!buf2) {
11058 Py_DECREF(substring);
11059 if (kind1 != kind) PyMem_Free(buf1);
11060 return NULL;
11061 }
11062 len1 = PyUnicode_GET_LENGTH(self);
11063 len2 = PyUnicode_GET_LENGTH(substring);
11064
11065 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060011066 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011067 case PyUnicode_1BYTE_KIND:
11068 iresult = ucs1lib_count(
11069 ((Py_UCS1*)buf1) + start, end - start,
11070 buf2, len2, PY_SSIZE_T_MAX
11071 );
11072 break;
11073 case PyUnicode_2BYTE_KIND:
11074 iresult = ucs2lib_count(
11075 ((Py_UCS2*)buf1) + start, end - start,
11076 buf2, len2, PY_SSIZE_T_MAX
11077 );
11078 break;
11079 case PyUnicode_4BYTE_KIND:
11080 iresult = ucs4lib_count(
11081 ((Py_UCS4*)buf1) + start, end - start,
11082 buf2, len2, PY_SSIZE_T_MAX
11083 );
11084 break;
11085 default:
11086 assert(0); iresult = 0;
11087 }
11088
11089 result = PyLong_FromSsize_t(iresult);
11090
11091 if (kind1 != kind)
11092 PyMem_Free(buf1);
11093 if (kind2 != kind)
11094 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011095
11096 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011097
Guido van Rossumd57fd912000-03-10 22:53:23 +000011098 return result;
11099}
11100
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011101PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011102 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011103\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011104Encode S using the codec registered for encoding. Default encoding\n\
11105is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011106handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011107a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11108'xmlcharrefreplace' as well as any other name registered with\n\
11109codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011110
11111static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011112unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011113{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011114 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011115 char *encoding = NULL;
11116 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011117
Benjamin Peterson308d6372009-09-18 21:42:35 +000011118 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11119 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011120 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011121 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011122}
11123
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011124PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011125 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011126\n\
11127Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011128If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011129
11130static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011131unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011132{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011133 Py_ssize_t i, j, line_pos, src_len, incr;
11134 Py_UCS4 ch;
11135 PyObject *u;
11136 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011137 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011138 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011139 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011140
11141 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011142 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011143
Antoine Pitrou22425222011-10-04 19:10:51 +020011144 if (PyUnicode_READY(self) == -1)
11145 return NULL;
11146
Thomas Wouters7e474022000-07-16 12:04:32 +000011147 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011148 src_len = PyUnicode_GET_LENGTH(self);
11149 i = j = line_pos = 0;
11150 kind = PyUnicode_KIND(self);
11151 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011152 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011153 for (; i < src_len; i++) {
11154 ch = PyUnicode_READ(kind, src_data, i);
11155 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011156 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011157 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011158 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011159 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011160 goto overflow;
11161 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011162 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011163 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011164 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011165 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011166 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011167 goto overflow;
11168 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011169 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011170 if (ch == '\n' || ch == '\r')
11171 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011172 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011173 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011174 if (!found)
11175 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011176
Guido van Rossumd57fd912000-03-10 22:53:23 +000011177 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011178 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011179 if (!u)
11180 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011181 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011182
Antoine Pitroue71d5742011-10-04 15:55:09 +020011183 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011184
Antoine Pitroue71d5742011-10-04 15:55:09 +020011185 for (; i < src_len; i++) {
11186 ch = PyUnicode_READ(kind, src_data, i);
11187 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011188 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011189 incr = tabsize - (line_pos % tabsize);
11190 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011191 FILL(kind, dest_data, ' ', j, incr);
11192 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011193 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011194 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011195 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011196 line_pos++;
11197 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011198 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011199 if (ch == '\n' || ch == '\r')
11200 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011201 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011202 }
11203 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011204 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011205
Antoine Pitroue71d5742011-10-04 15:55:09 +020011206 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011207 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11208 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011209}
11210
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011211PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011212 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011213\n\
11214Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011215such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011216arguments start and end are interpreted as in slice notation.\n\
11217\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011218Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011219
11220static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011221unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011222{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011223 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011224 Py_ssize_t start;
11225 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011226 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011227
Jesus Ceaac451502011-04-20 17:09:23 +020011228 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11229 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011230 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011231
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011232 if (PyUnicode_READY(self) == -1)
11233 return NULL;
11234 if (PyUnicode_READY(substring) == -1)
11235 return NULL;
11236
Victor Stinner7931d9a2011-11-04 00:22:48 +010011237 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011238
11239 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011240
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011241 if (result == -2)
11242 return NULL;
11243
Christian Heimes217cfd12007-12-02 14:31:20 +000011244 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011245}
11246
11247static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011248unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011249{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011250 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
11251 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011252 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011253 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011254}
11255
Guido van Rossumc2504932007-09-18 19:42:40 +000011256/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011257 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011258static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011259unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011260{
Guido van Rossumc2504932007-09-18 19:42:40 +000011261 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010011262 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011263
Benjamin Peterson69e97272012-02-21 11:08:50 -050011264 assert(_Py_HashSecret_Initialized);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011265 if (_PyUnicode_HASH(self) != -1)
11266 return _PyUnicode_HASH(self);
11267 if (PyUnicode_READY(self) == -1)
11268 return -1;
11269 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011270 /*
11271 We make the hash of the empty string be 0, rather than using
11272 (prefix ^ suffix), since this slightly obfuscates the hash secret
11273 */
11274 if (len == 0) {
11275 _PyUnicode_HASH(self) = 0;
11276 return 0;
11277 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011278
11279 /* The hash function as a macro, gets expanded three times below. */
Georg Brandl2fb477c2012-02-21 00:33:36 +010011280#define HASH(P) \
11281 x ^= (Py_uhash_t) *P << 7; \
11282 while (--len >= 0) \
11283 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011284
Georg Brandl2fb477c2012-02-21 00:33:36 +010011285 x = (Py_uhash_t) _Py_HashSecret.prefix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011286 switch (PyUnicode_KIND(self)) {
11287 case PyUnicode_1BYTE_KIND: {
11288 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11289 HASH(c);
11290 break;
11291 }
11292 case PyUnicode_2BYTE_KIND: {
11293 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11294 HASH(s);
11295 break;
11296 }
11297 default: {
11298 Py_UCS4 *l;
11299 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11300 "Impossible switch case in unicode_hash");
11301 l = PyUnicode_4BYTE_DATA(self);
11302 HASH(l);
11303 break;
11304 }
11305 }
Georg Brandl2fb477c2012-02-21 00:33:36 +010011306 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
11307 x ^= (Py_uhash_t) _Py_HashSecret.suffix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011308
Guido van Rossumc2504932007-09-18 19:42:40 +000011309 if (x == -1)
11310 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011311 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011312 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011313}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011314#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011315
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011316PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011317 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011318\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011319Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011320
11321static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011322unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011323{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011324 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011325 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011326 Py_ssize_t start;
11327 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011328
Jesus Ceaac451502011-04-20 17:09:23 +020011329 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11330 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011331 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011332
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011333 if (PyUnicode_READY(self) == -1)
11334 return NULL;
11335 if (PyUnicode_READY(substring) == -1)
11336 return NULL;
11337
Victor Stinner7931d9a2011-11-04 00:22:48 +010011338 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011339
11340 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011341
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011342 if (result == -2)
11343 return NULL;
11344
Guido van Rossumd57fd912000-03-10 22:53:23 +000011345 if (result < 0) {
11346 PyErr_SetString(PyExc_ValueError, "substring not found");
11347 return NULL;
11348 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011349
Christian Heimes217cfd12007-12-02 14:31:20 +000011350 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011351}
11352
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011353PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011354 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011355\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011356Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011357at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011358
11359static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011360unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011361{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011362 Py_ssize_t i, length;
11363 int kind;
11364 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011365 int cased;
11366
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011367 if (PyUnicode_READY(self) == -1)
11368 return NULL;
11369 length = PyUnicode_GET_LENGTH(self);
11370 kind = PyUnicode_KIND(self);
11371 data = PyUnicode_DATA(self);
11372
Guido van Rossumd57fd912000-03-10 22:53:23 +000011373 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011374 if (length == 1)
11375 return PyBool_FromLong(
11376 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011377
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011378 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011379 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011380 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011381
Guido van Rossumd57fd912000-03-10 22:53:23 +000011382 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011383 for (i = 0; i < length; i++) {
11384 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011385
Benjamin Peterson29060642009-01-31 22:14:21 +000011386 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11387 return PyBool_FromLong(0);
11388 else if (!cased && Py_UNICODE_ISLOWER(ch))
11389 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011390 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011391 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011392}
11393
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011394PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011395 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011396\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011397Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011398at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011399
11400static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011401unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011402{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011403 Py_ssize_t i, length;
11404 int kind;
11405 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011406 int cased;
11407
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011408 if (PyUnicode_READY(self) == -1)
11409 return NULL;
11410 length = PyUnicode_GET_LENGTH(self);
11411 kind = PyUnicode_KIND(self);
11412 data = PyUnicode_DATA(self);
11413
Guido van Rossumd57fd912000-03-10 22:53:23 +000011414 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011415 if (length == 1)
11416 return PyBool_FromLong(
11417 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011418
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011419 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011420 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011421 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011422
Guido van Rossumd57fd912000-03-10 22:53:23 +000011423 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011424 for (i = 0; i < length; i++) {
11425 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011426
Benjamin Peterson29060642009-01-31 22:14:21 +000011427 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11428 return PyBool_FromLong(0);
11429 else if (!cased && Py_UNICODE_ISUPPER(ch))
11430 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011431 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011432 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011433}
11434
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011435PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011436 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011437\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011438Return True if S is a titlecased string and there is at least one\n\
11439character in S, i.e. upper- and titlecase characters may only\n\
11440follow uncased characters and lowercase characters only cased ones.\n\
11441Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011442
11443static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011444unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011445{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011446 Py_ssize_t i, length;
11447 int kind;
11448 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011449 int cased, previous_is_cased;
11450
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011451 if (PyUnicode_READY(self) == -1)
11452 return NULL;
11453 length = PyUnicode_GET_LENGTH(self);
11454 kind = PyUnicode_KIND(self);
11455 data = PyUnicode_DATA(self);
11456
Guido van Rossumd57fd912000-03-10 22:53:23 +000011457 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011458 if (length == 1) {
11459 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11460 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11461 (Py_UNICODE_ISUPPER(ch) != 0));
11462 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011463
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011464 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011465 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011466 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011467
Guido van Rossumd57fd912000-03-10 22:53:23 +000011468 cased = 0;
11469 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011470 for (i = 0; i < length; i++) {
11471 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011472
Benjamin Peterson29060642009-01-31 22:14:21 +000011473 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11474 if (previous_is_cased)
11475 return PyBool_FromLong(0);
11476 previous_is_cased = 1;
11477 cased = 1;
11478 }
11479 else if (Py_UNICODE_ISLOWER(ch)) {
11480 if (!previous_is_cased)
11481 return PyBool_FromLong(0);
11482 previous_is_cased = 1;
11483 cased = 1;
11484 }
11485 else
11486 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011487 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011488 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011489}
11490
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011491PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011492 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011493\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011494Return True if all characters in S are whitespace\n\
11495and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011496
11497static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011498unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011499{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011500 Py_ssize_t i, length;
11501 int kind;
11502 void *data;
11503
11504 if (PyUnicode_READY(self) == -1)
11505 return NULL;
11506 length = PyUnicode_GET_LENGTH(self);
11507 kind = PyUnicode_KIND(self);
11508 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011509
Guido van Rossumd57fd912000-03-10 22:53:23 +000011510 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011511 if (length == 1)
11512 return PyBool_FromLong(
11513 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011514
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011515 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011516 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011517 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011518
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011519 for (i = 0; i < length; i++) {
11520 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011521 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011522 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011523 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011524 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011525}
11526
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011527PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011528 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011529\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011530Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011531and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011532
11533static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011534unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011535{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011536 Py_ssize_t i, length;
11537 int kind;
11538 void *data;
11539
11540 if (PyUnicode_READY(self) == -1)
11541 return NULL;
11542 length = PyUnicode_GET_LENGTH(self);
11543 kind = PyUnicode_KIND(self);
11544 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011545
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011546 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011547 if (length == 1)
11548 return PyBool_FromLong(
11549 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011550
11551 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011552 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011553 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011554
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011555 for (i = 0; i < length; i++) {
11556 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011557 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011558 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011559 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011560}
11561
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011562PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011563 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011564\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011565Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011566and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011567
11568static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011569unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011570{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011571 int kind;
11572 void *data;
11573 Py_ssize_t len, i;
11574
11575 if (PyUnicode_READY(self) == -1)
11576 return NULL;
11577
11578 kind = PyUnicode_KIND(self);
11579 data = PyUnicode_DATA(self);
11580 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011581
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011582 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011583 if (len == 1) {
11584 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11585 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11586 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011587
11588 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011589 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011590 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011591
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011592 for (i = 0; i < len; i++) {
11593 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011594 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011595 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011596 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011597 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011598}
11599
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011600PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011601 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011602\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011603Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011604False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011605
11606static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011607unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011608{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011609 Py_ssize_t i, length;
11610 int kind;
11611 void *data;
11612
11613 if (PyUnicode_READY(self) == -1)
11614 return NULL;
11615 length = PyUnicode_GET_LENGTH(self);
11616 kind = PyUnicode_KIND(self);
11617 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011618
Guido van Rossumd57fd912000-03-10 22:53:23 +000011619 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011620 if (length == 1)
11621 return PyBool_FromLong(
11622 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011623
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011624 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011625 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011626 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011627
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011628 for (i = 0; i < length; i++) {
11629 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011630 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011631 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011632 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011633}
11634
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011635PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011636 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011637\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011638Return True if all characters in S are digits\n\
11639and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011640
11641static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011642unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011643{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011644 Py_ssize_t i, length;
11645 int kind;
11646 void *data;
11647
11648 if (PyUnicode_READY(self) == -1)
11649 return NULL;
11650 length = PyUnicode_GET_LENGTH(self);
11651 kind = PyUnicode_KIND(self);
11652 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011653
Guido van Rossumd57fd912000-03-10 22:53:23 +000011654 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011655 if (length == 1) {
11656 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11657 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11658 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011659
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011660 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011661 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011662 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011663
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011664 for (i = 0; i < length; i++) {
11665 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011666 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011667 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011668 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011669}
11670
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011671PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011672 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011673\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011674Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011675False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011676
11677static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011678unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011679{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011680 Py_ssize_t i, length;
11681 int kind;
11682 void *data;
11683
11684 if (PyUnicode_READY(self) == -1)
11685 return NULL;
11686 length = PyUnicode_GET_LENGTH(self);
11687 kind = PyUnicode_KIND(self);
11688 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011689
Guido van Rossumd57fd912000-03-10 22:53:23 +000011690 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011691 if (length == 1)
11692 return PyBool_FromLong(
11693 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011694
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011695 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011696 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011697 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011698
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011699 for (i = 0; i < length; i++) {
11700 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011701 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011702 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011703 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011704}
11705
Martin v. Löwis47383402007-08-15 07:32:56 +000011706int
11707PyUnicode_IsIdentifier(PyObject *self)
11708{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011709 int kind;
11710 void *data;
11711 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011712 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011713
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011714 if (PyUnicode_READY(self) == -1) {
11715 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011716 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011717 }
11718
11719 /* Special case for empty strings */
11720 if (PyUnicode_GET_LENGTH(self) == 0)
11721 return 0;
11722 kind = PyUnicode_KIND(self);
11723 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011724
11725 /* PEP 3131 says that the first character must be in
11726 XID_Start and subsequent characters in XID_Continue,
11727 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011728 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011729 letters, digits, underscore). However, given the current
11730 definition of XID_Start and XID_Continue, it is sufficient
11731 to check just for these, except that _ must be allowed
11732 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011733 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011734 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011735 return 0;
11736
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011737 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011738 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011739 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011740 return 1;
11741}
11742
11743PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011744 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011745\n\
11746Return True if S is a valid identifier according\n\
11747to the language definition.");
11748
11749static PyObject*
11750unicode_isidentifier(PyObject *self)
11751{
11752 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11753}
11754
Georg Brandl559e5d72008-06-11 18:37:52 +000011755PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011756 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011757\n\
11758Return True if all characters in S are considered\n\
11759printable in repr() or S is empty, False otherwise.");
11760
11761static PyObject*
11762unicode_isprintable(PyObject *self)
11763{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011764 Py_ssize_t i, length;
11765 int kind;
11766 void *data;
11767
11768 if (PyUnicode_READY(self) == -1)
11769 return NULL;
11770 length = PyUnicode_GET_LENGTH(self);
11771 kind = PyUnicode_KIND(self);
11772 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011773
11774 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011775 if (length == 1)
11776 return PyBool_FromLong(
11777 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011778
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011779 for (i = 0; i < length; i++) {
11780 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011781 Py_RETURN_FALSE;
11782 }
11783 }
11784 Py_RETURN_TRUE;
11785}
11786
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011787PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011788 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011789\n\
11790Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011791iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011792
11793static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011794unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011795{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011796 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011797}
11798
Martin v. Löwis18e16552006-02-15 17:27:45 +000011799static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011800unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011801{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011802 if (PyUnicode_READY(self) == -1)
11803 return -1;
11804 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011805}
11806
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011807PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011808 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011809\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011810Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011811done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011812
11813static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011814unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011815{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011816 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011817 Py_UCS4 fillchar = ' ';
11818
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011819 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011820 return NULL;
11821
Benjamin Petersonbac79492012-01-14 13:34:47 -050011822 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011823 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011824
Victor Stinnerc4b49542011-12-11 22:44:26 +010011825 if (PyUnicode_GET_LENGTH(self) >= width)
11826 return unicode_result_unchanged(self);
11827
11828 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011829}
11830
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011831PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011832 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011833\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011834Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011835
11836static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011837unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011838{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011839 if (PyUnicode_READY(self) == -1)
11840 return NULL;
11841 if (PyUnicode_IS_ASCII(self))
11842 return ascii_upper_or_lower(self, 1);
11843 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011844}
11845
/* Selector values for the strip helpers; they double as indexes into
   stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by the selectors above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for selector i: the format string without its three-character
   "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
11854
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011855/* externally visible for str.strip(unicode) */
11856PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011857_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011858{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011859 void *data;
11860 int kind;
11861 Py_ssize_t i, j, len;
11862 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011863
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011864 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11865 return NULL;
11866
11867 kind = PyUnicode_KIND(self);
11868 data = PyUnicode_DATA(self);
11869 len = PyUnicode_GET_LENGTH(self);
11870 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11871 PyUnicode_DATA(sepobj),
11872 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011873
Benjamin Peterson14339b62009-01-31 16:36:08 +000011874 i = 0;
11875 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011876 while (i < len &&
11877 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011878 i++;
11879 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011880 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011881
Benjamin Peterson14339b62009-01-31 16:36:08 +000011882 j = len;
11883 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011884 do {
11885 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011886 } while (j >= i &&
11887 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011888 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011889 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011890
Victor Stinner7931d9a2011-11-04 00:22:48 +010011891 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011892}
11893
11894PyObject*
11895PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11896{
11897 unsigned char *data;
11898 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011899 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011900
Victor Stinnerde636f32011-10-01 03:55:54 +020011901 if (PyUnicode_READY(self) == -1)
11902 return NULL;
11903
11904 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11905
Victor Stinner12bab6d2011-10-01 01:53:49 +020011906 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Victor Stinnerc4b49542011-12-11 22:44:26 +010011907 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011908
Victor Stinner12bab6d2011-10-01 01:53:49 +020011909 length = end - start;
11910 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011911 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011912
Victor Stinnerde636f32011-10-01 03:55:54 +020011913 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011914 PyErr_SetString(PyExc_IndexError, "string index out of range");
11915 return NULL;
11916 }
11917
Victor Stinnerb9275c12011-10-05 14:01:42 +020011918 if (PyUnicode_IS_ASCII(self)) {
11919 kind = PyUnicode_KIND(self);
11920 data = PyUnicode_1BYTE_DATA(self);
11921 return unicode_fromascii(data + start, length);
11922 }
11923 else {
11924 kind = PyUnicode_KIND(self);
11925 data = PyUnicode_1BYTE_DATA(self);
11926 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011927 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011928 length);
11929 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011930}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011931
11932static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011933do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011934{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011935 int kind;
11936 void *data;
11937 Py_ssize_t len, i, j;
11938
11939 if (PyUnicode_READY(self) == -1)
11940 return NULL;
11941
11942 kind = PyUnicode_KIND(self);
11943 data = PyUnicode_DATA(self);
11944 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011945
Benjamin Peterson14339b62009-01-31 16:36:08 +000011946 i = 0;
11947 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011948 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011949 i++;
11950 }
11951 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011952
Benjamin Peterson14339b62009-01-31 16:36:08 +000011953 j = len;
11954 if (striptype != LEFTSTRIP) {
11955 do {
11956 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011957 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011958 j++;
11959 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011960
Victor Stinner7931d9a2011-11-04 00:22:48 +010011961 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011962}
11963
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011964
11965static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011966do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011967{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011968 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011969
Benjamin Peterson14339b62009-01-31 16:36:08 +000011970 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11971 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011972
Benjamin Peterson14339b62009-01-31 16:36:08 +000011973 if (sep != NULL && sep != Py_None) {
11974 if (PyUnicode_Check(sep))
11975 return _PyUnicode_XStrip(self, striptype, sep);
11976 else {
11977 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011978 "%s arg must be None or str",
11979 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011980 return NULL;
11981 }
11982 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011983
Benjamin Peterson14339b62009-01-31 16:36:08 +000011984 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011985}
11986
11987
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011988PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011989 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011990\n\
11991Return a copy of the string S with leading and trailing\n\
11992whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011993If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011994
11995static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011996unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011997{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011998 if (PyTuple_GET_SIZE(args) == 0)
11999 return do_strip(self, BOTHSTRIP); /* Common case */
12000 else
12001 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012002}
12003
12004
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012005PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012006 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012007\n\
12008Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012009If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012010
12011static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012012unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012013{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012014 if (PyTuple_GET_SIZE(args) == 0)
12015 return do_strip(self, LEFTSTRIP); /* Common case */
12016 else
12017 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012018}
12019
12020
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012021PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012022 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012023\n\
12024Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012025If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012026
12027static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012028unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012029{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012030 if (PyTuple_GET_SIZE(args) == 0)
12031 return do_strip(self, RIGHTSTRIP); /* Common case */
12032 else
12033 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012034}
12035
12036
Guido van Rossumd57fd912000-03-10 22:53:23 +000012037static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012038unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012039{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012040 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012041 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012042
Georg Brandl222de0f2009-04-12 12:01:50 +000012043 if (len < 1) {
12044 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020012045 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000012046 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012047
Victor Stinnerc4b49542011-12-11 22:44:26 +010012048 /* no repeat, return original string */
12049 if (len == 1)
12050 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012051
Benjamin Petersonbac79492012-01-14 13:34:47 -050012052 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012053 return NULL;
12054
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012055 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012056 PyErr_SetString(PyExc_OverflowError,
12057 "repeated string is too long");
12058 return NULL;
12059 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012060 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012061
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012062 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012063 if (!u)
12064 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012065 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012067 if (PyUnicode_GET_LENGTH(str) == 1) {
12068 const int kind = PyUnicode_KIND(str);
12069 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012070 if (kind == PyUnicode_1BYTE_KIND) {
12071 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012072 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012073 }
12074 else if (kind == PyUnicode_2BYTE_KIND) {
12075 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012076 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012077 ucs2[n] = fill_char;
12078 } else {
12079 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12080 assert(kind == PyUnicode_4BYTE_KIND);
12081 for (n = 0; n < len; ++n)
12082 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012083 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012084 }
12085 else {
12086 /* number of characters copied this far */
12087 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012088 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012089 char *to = (char *) PyUnicode_DATA(u);
12090 Py_MEMCPY(to, PyUnicode_DATA(str),
12091 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012092 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012093 n = (done <= nchars-done) ? done : nchars-done;
12094 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012095 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012096 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012097 }
12098
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012099 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012100 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012101}
12102
Alexander Belopolsky40018472011-02-26 01:02:56 +000012103PyObject *
12104PyUnicode_Replace(PyObject *obj,
12105 PyObject *subobj,
12106 PyObject *replobj,
12107 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012108{
12109 PyObject *self;
12110 PyObject *str1;
12111 PyObject *str2;
12112 PyObject *result;
12113
12114 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012115 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012116 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012117 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012118 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012119 Py_DECREF(self);
12120 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012121 }
12122 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012123 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012124 Py_DECREF(self);
12125 Py_DECREF(str1);
12126 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012127 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012128 if (PyUnicode_READY(self) == -1 ||
12129 PyUnicode_READY(str1) == -1 ||
12130 PyUnicode_READY(str2) == -1)
12131 result = NULL;
12132 else
12133 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012134 Py_DECREF(self);
12135 Py_DECREF(str1);
12136 Py_DECREF(str2);
12137 return result;
12138}
12139
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012140PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012141 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012142\n\
12143Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012144old replaced by new. If the optional argument count is\n\
12145given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012146
12147static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012148unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012149{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012150 PyObject *str1;
12151 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012152 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012153 PyObject *result;
12154
Martin v. Löwis18e16552006-02-15 17:27:45 +000012155 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012156 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012157 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012158 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012159 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012160 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012161 return NULL;
12162 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012163 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012164 Py_DECREF(str1);
12165 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012166 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012167 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12168 result = NULL;
12169 else
12170 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012171
12172 Py_DECREF(str1);
12173 Py_DECREF(str2);
12174 return result;
12175}
12176
Alexander Belopolsky40018472011-02-26 01:02:56 +000012177static PyObject *
12178unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012179{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012180 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012181 Py_ssize_t isize;
12182 Py_ssize_t osize, squote, dquote, i, o;
12183 Py_UCS4 max, quote;
12184 int ikind, okind;
12185 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012186
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012187 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012188 return NULL;
12189
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012190 isize = PyUnicode_GET_LENGTH(unicode);
12191 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012192
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012193 /* Compute length of output, quote characters, and
12194 maximum character */
12195 osize = 2; /* quotes */
12196 max = 127;
12197 squote = dquote = 0;
12198 ikind = PyUnicode_KIND(unicode);
12199 for (i = 0; i < isize; i++) {
12200 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12201 switch (ch) {
12202 case '\'': squote++; osize++; break;
12203 case '"': dquote++; osize++; break;
12204 case '\\': case '\t': case '\r': case '\n':
12205 osize += 2; break;
12206 default:
12207 /* Fast-path ASCII */
12208 if (ch < ' ' || ch == 0x7f)
12209 osize += 4; /* \xHH */
12210 else if (ch < 0x7f)
12211 osize++;
12212 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12213 osize++;
12214 max = ch > max ? ch : max;
12215 }
12216 else if (ch < 0x100)
12217 osize += 4; /* \xHH */
12218 else if (ch < 0x10000)
12219 osize += 6; /* \uHHHH */
12220 else
12221 osize += 10; /* \uHHHHHHHH */
12222 }
12223 }
12224
12225 quote = '\'';
12226 if (squote) {
12227 if (dquote)
12228 /* Both squote and dquote present. Use squote,
12229 and escape them */
12230 osize += squote;
12231 else
12232 quote = '"';
12233 }
12234
12235 repr = PyUnicode_New(osize, max);
12236 if (repr == NULL)
12237 return NULL;
12238 okind = PyUnicode_KIND(repr);
12239 odata = PyUnicode_DATA(repr);
12240
12241 PyUnicode_WRITE(okind, odata, 0, quote);
12242 PyUnicode_WRITE(okind, odata, osize-1, quote);
12243
12244 for (i = 0, o = 1; i < isize; i++) {
12245 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012246
12247 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012248 if ((ch == quote) || (ch == '\\')) {
12249 PyUnicode_WRITE(okind, odata, o++, '\\');
12250 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012251 continue;
12252 }
12253
Benjamin Peterson29060642009-01-31 22:14:21 +000012254 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012255 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012256 PyUnicode_WRITE(okind, odata, o++, '\\');
12257 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012258 }
12259 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012260 PyUnicode_WRITE(okind, odata, o++, '\\');
12261 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012262 }
12263 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012264 PyUnicode_WRITE(okind, odata, o++, '\\');
12265 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012266 }
12267
12268 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012269 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012270 PyUnicode_WRITE(okind, odata, o++, '\\');
12271 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012272 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12273 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012274 }
12275
Georg Brandl559e5d72008-06-11 18:37:52 +000012276 /* Copy ASCII characters as-is */
12277 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012278 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012279 }
12280
Benjamin Peterson29060642009-01-31 22:14:21 +000012281 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012282 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012283 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012284 (categories Z* and C* except ASCII space)
12285 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012286 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012287 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012288 if (ch <= 0xff) {
12289 PyUnicode_WRITE(okind, odata, o++, '\\');
12290 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012291 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12292 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012293 }
12294 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012295 else if (ch >= 0x10000) {
12296 PyUnicode_WRITE(okind, odata, o++, '\\');
12297 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012298 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12299 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12300 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12301 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12302 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12303 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12304 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12305 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012306 }
12307 /* Map 16-bit characters to '\uxxxx' */
12308 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012309 PyUnicode_WRITE(okind, odata, o++, '\\');
12310 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012311 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12312 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12313 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12314 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012315 }
12316 }
12317 /* Copy characters as-is */
12318 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012319 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012320 }
12321 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012322 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012323 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012324 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012325 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012326}
12327
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012328PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012329 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012330\n\
12331Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012332such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012333arguments start and end are interpreted as in slice notation.\n\
12334\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012335Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012336
12337static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012338unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012339{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012340 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012341 Py_ssize_t start;
12342 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012343 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012344
Jesus Ceaac451502011-04-20 17:09:23 +020012345 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12346 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012347 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012348
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012349 if (PyUnicode_READY(self) == -1)
12350 return NULL;
12351 if (PyUnicode_READY(substring) == -1)
12352 return NULL;
12353
Victor Stinner7931d9a2011-11-04 00:22:48 +010012354 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012355
12356 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012357
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012358 if (result == -2)
12359 return NULL;
12360
Christian Heimes217cfd12007-12-02 14:31:20 +000012361 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012362}
12363
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012364PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012365 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012366\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012367Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012368
12369static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012370unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012371{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012372 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012373 Py_ssize_t start;
12374 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012375 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012376
Jesus Ceaac451502011-04-20 17:09:23 +020012377 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12378 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012379 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012380
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012381 if (PyUnicode_READY(self) == -1)
12382 return NULL;
12383 if (PyUnicode_READY(substring) == -1)
12384 return NULL;
12385
Victor Stinner7931d9a2011-11-04 00:22:48 +010012386 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012387
12388 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012389
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012390 if (result == -2)
12391 return NULL;
12392
Guido van Rossumd57fd912000-03-10 22:53:23 +000012393 if (result < 0) {
12394 PyErr_SetString(PyExc_ValueError, "substring not found");
12395 return NULL;
12396 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012397
Christian Heimes217cfd12007-12-02 14:31:20 +000012398 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012399}
12400
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012401PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012402 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012403\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012404Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012405done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012406
12407static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012408unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012409{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012410 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012411 Py_UCS4 fillchar = ' ';
12412
Victor Stinnere9a29352011-10-01 02:14:59 +020012413 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012414 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012415
Benjamin Petersonbac79492012-01-14 13:34:47 -050012416 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012417 return NULL;
12418
Victor Stinnerc4b49542011-12-11 22:44:26 +010012419 if (PyUnicode_GET_LENGTH(self) >= width)
12420 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012421
Victor Stinnerc4b49542011-12-11 22:44:26 +010012422 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012423}
12424
Alexander Belopolsky40018472011-02-26 01:02:56 +000012425PyObject *
12426PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012427{
12428 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012429
Guido van Rossumd57fd912000-03-10 22:53:23 +000012430 s = PyUnicode_FromObject(s);
12431 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012432 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012433 if (sep != NULL) {
12434 sep = PyUnicode_FromObject(sep);
12435 if (sep == NULL) {
12436 Py_DECREF(s);
12437 return NULL;
12438 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012439 }
12440
Victor Stinner9310abb2011-10-05 00:59:23 +020012441 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012442
12443 Py_DECREF(s);
12444 Py_XDECREF(sep);
12445 return result;
12446}
12447
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012448PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012449 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012450\n\
12451Return a list of the words in S, using sep as the\n\
12452delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012453splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012454whitespace string is a separator and empty strings are\n\
12455removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012456
12457static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012458unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012459{
12460 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012461 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012462
Martin v. Löwis18e16552006-02-15 17:27:45 +000012463 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012464 return NULL;
12465
12466 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012467 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012468 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012469 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012470 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012471 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012472}
12473
Thomas Wouters477c8d52006-05-27 19:21:47 +000012474PyObject *
12475PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12476{
12477 PyObject* str_obj;
12478 PyObject* sep_obj;
12479 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012480 int kind1, kind2, kind;
12481 void *buf1 = NULL, *buf2 = NULL;
12482 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012483
12484 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012485 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012486 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012487 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012488 if (!sep_obj) {
12489 Py_DECREF(str_obj);
12490 return NULL;
12491 }
12492 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12493 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012494 Py_DECREF(str_obj);
12495 return NULL;
12496 }
12497
Victor Stinner14f8f022011-10-05 20:58:25 +020012498 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012499 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012500 kind = Py_MAX(kind1, kind2);
12501 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012502 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012503 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012504 if (!buf1)
12505 goto onError;
12506 buf2 = PyUnicode_DATA(sep_obj);
12507 if (kind2 != kind)
12508 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12509 if (!buf2)
12510 goto onError;
12511 len1 = PyUnicode_GET_LENGTH(str_obj);
12512 len2 = PyUnicode_GET_LENGTH(sep_obj);
12513
Benjamin Petersonead6b532011-12-20 17:23:42 -060012514 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012515 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012516 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12517 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12518 else
12519 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012520 break;
12521 case PyUnicode_2BYTE_KIND:
12522 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12523 break;
12524 case PyUnicode_4BYTE_KIND:
12525 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12526 break;
12527 default:
12528 assert(0);
12529 out = 0;
12530 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012531
12532 Py_DECREF(sep_obj);
12533 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012534 if (kind1 != kind)
12535 PyMem_Free(buf1);
12536 if (kind2 != kind)
12537 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012538
12539 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012540 onError:
12541 Py_DECREF(sep_obj);
12542 Py_DECREF(str_obj);
12543 if (kind1 != kind && buf1)
12544 PyMem_Free(buf1);
12545 if (kind2 != kind && buf2)
12546 PyMem_Free(buf2);
12547 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012548}
12549
12550
12551PyObject *
12552PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12553{
12554 PyObject* str_obj;
12555 PyObject* sep_obj;
12556 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012557 int kind1, kind2, kind;
12558 void *buf1 = NULL, *buf2 = NULL;
12559 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012560
12561 str_obj = PyUnicode_FromObject(str_in);
12562 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012563 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012564 sep_obj = PyUnicode_FromObject(sep_in);
12565 if (!sep_obj) {
12566 Py_DECREF(str_obj);
12567 return NULL;
12568 }
12569
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012570 kind1 = PyUnicode_KIND(str_in);
12571 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012572 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012573 buf1 = PyUnicode_DATA(str_in);
12574 if (kind1 != kind)
12575 buf1 = _PyUnicode_AsKind(str_in, kind);
12576 if (!buf1)
12577 goto onError;
12578 buf2 = PyUnicode_DATA(sep_obj);
12579 if (kind2 != kind)
12580 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12581 if (!buf2)
12582 goto onError;
12583 len1 = PyUnicode_GET_LENGTH(str_obj);
12584 len2 = PyUnicode_GET_LENGTH(sep_obj);
12585
Benjamin Petersonead6b532011-12-20 17:23:42 -060012586 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012587 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012588 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12589 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12590 else
12591 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012592 break;
12593 case PyUnicode_2BYTE_KIND:
12594 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12595 break;
12596 case PyUnicode_4BYTE_KIND:
12597 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12598 break;
12599 default:
12600 assert(0);
12601 out = 0;
12602 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012603
12604 Py_DECREF(sep_obj);
12605 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012606 if (kind1 != kind)
12607 PyMem_Free(buf1);
12608 if (kind2 != kind)
12609 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012610
12611 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012612 onError:
12613 Py_DECREF(sep_obj);
12614 Py_DECREF(str_obj);
12615 if (kind1 != kind && buf1)
12616 PyMem_Free(buf1);
12617 if (kind2 != kind && buf2)
12618 PyMem_Free(buf2);
12619 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012620}
12621
12622PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012623 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012624\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012625Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012626the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012627found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012628
12629static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012630unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012631{
Victor Stinner9310abb2011-10-05 00:59:23 +020012632 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012633}
12634
12635PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012636 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012637\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012638Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012639the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012640separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012641
12642static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012643unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012644{
Victor Stinner9310abb2011-10-05 00:59:23 +020012645 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012646}
12647
Alexander Belopolsky40018472011-02-26 01:02:56 +000012648PyObject *
12649PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012650{
12651 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012652
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012653 s = PyUnicode_FromObject(s);
12654 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012655 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012656 if (sep != NULL) {
12657 sep = PyUnicode_FromObject(sep);
12658 if (sep == NULL) {
12659 Py_DECREF(s);
12660 return NULL;
12661 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012662 }
12663
Victor Stinner9310abb2011-10-05 00:59:23 +020012664 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012665
12666 Py_DECREF(s);
12667 Py_XDECREF(sep);
12668 return result;
12669}
12670
12671PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012672 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012673\n\
12674Return a list of the words in S, using sep as the\n\
12675delimiter string, starting at the end of the string and\n\
12676working to the front. If maxsplit is given, at most maxsplit\n\
12677splits are done. If sep is not specified, any whitespace string\n\
12678is a separator.");
12679
12680static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012681unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012682{
12683 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012684 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012685
Martin v. Löwis18e16552006-02-15 17:27:45 +000012686 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012687 return NULL;
12688
12689 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012690 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012691 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012692 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012693 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012694 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012695}
12696
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012697PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012698 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012699\n\
12700Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012701Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012702is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012703
12704static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012705unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012706{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012707 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012708 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012709
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012710 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12711 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012712 return NULL;
12713
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012714 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012715}
12716
12717static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012718PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012719{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012720 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012721}
12722
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012723PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012724 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012725\n\
12726Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012727and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012728
12729static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012730unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012731{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012732 if (PyUnicode_READY(self) == -1)
12733 return NULL;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012734 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012735}
12736
Georg Brandlceee0772007-11-27 23:48:05 +000012737PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012738 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012739\n\
12740Return a translation table usable for str.translate().\n\
12741If there is only one argument, it must be a dictionary mapping Unicode\n\
12742ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012743Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012744If there are two arguments, they must be strings of equal length, and\n\
12745in the resulting dictionary, each character in x will be mapped to the\n\
12746character at the same position in y. If there is a third argument, it\n\
12747must be a string, whose characters will be mapped to None in the result.");
12748
12749static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012750unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012751{
12752 PyObject *x, *y = NULL, *z = NULL;
12753 PyObject *new = NULL, *key, *value;
12754 Py_ssize_t i = 0;
12755 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012756
Georg Brandlceee0772007-11-27 23:48:05 +000012757 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12758 return NULL;
12759 new = PyDict_New();
12760 if (!new)
12761 return NULL;
12762 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012763 int x_kind, y_kind, z_kind;
12764 void *x_data, *y_data, *z_data;
12765
Georg Brandlceee0772007-11-27 23:48:05 +000012766 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012767 if (!PyUnicode_Check(x)) {
12768 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12769 "be a string if there is a second argument");
12770 goto err;
12771 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012772 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012773 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12774 "arguments must have equal length");
12775 goto err;
12776 }
12777 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012778 x_kind = PyUnicode_KIND(x);
12779 y_kind = PyUnicode_KIND(y);
12780 x_data = PyUnicode_DATA(x);
12781 y_data = PyUnicode_DATA(y);
12782 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12783 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012784 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012785 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012786 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012787 if (!value) {
12788 Py_DECREF(key);
12789 goto err;
12790 }
Georg Brandlceee0772007-11-27 23:48:05 +000012791 res = PyDict_SetItem(new, key, value);
12792 Py_DECREF(key);
12793 Py_DECREF(value);
12794 if (res < 0)
12795 goto err;
12796 }
12797 /* create entries for deleting chars in z */
12798 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012799 z_kind = PyUnicode_KIND(z);
12800 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012801 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012802 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012803 if (!key)
12804 goto err;
12805 res = PyDict_SetItem(new, key, Py_None);
12806 Py_DECREF(key);
12807 if (res < 0)
12808 goto err;
12809 }
12810 }
12811 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012812 int kind;
12813 void *data;
12814
Georg Brandlceee0772007-11-27 23:48:05 +000012815 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012816 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012817 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12818 "to maketrans it must be a dict");
12819 goto err;
12820 }
12821 /* copy entries into the new dict, converting string keys to int keys */
12822 while (PyDict_Next(x, &i, &key, &value)) {
12823 if (PyUnicode_Check(key)) {
12824 /* convert string keys to integer keys */
12825 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012826 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012827 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12828 "table must be of length 1");
12829 goto err;
12830 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012831 kind = PyUnicode_KIND(key);
12832 data = PyUnicode_DATA(key);
12833 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012834 if (!newkey)
12835 goto err;
12836 res = PyDict_SetItem(new, newkey, value);
12837 Py_DECREF(newkey);
12838 if (res < 0)
12839 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012840 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012841 /* just keep integer keys */
12842 if (PyDict_SetItem(new, key, value) < 0)
12843 goto err;
12844 } else {
12845 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12846 "be strings or integers");
12847 goto err;
12848 }
12849 }
12850 }
12851 return new;
12852 err:
12853 Py_DECREF(new);
12854 return NULL;
12855}
12856
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012857PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012858 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012859\n\
12860Return a copy of the string S, where all characters have been mapped\n\
12861through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012862Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012863Unmapped characters are left untouched. Characters mapped to None\n\
12864are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012865
12866static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012867unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012868{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012869 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012870}
12871
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012872PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012873 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012874\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012875Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012876
12877static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012878unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012879{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012880 if (PyUnicode_READY(self) == -1)
12881 return NULL;
12882 if (PyUnicode_IS_ASCII(self))
12883 return ascii_upper_or_lower(self, 0);
12884 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012885}
12886
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012887PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012888 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012889\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012890Pad a numeric string S with zeros on the left, to fill a field\n\
12891of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012892
12893static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012894unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012895{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012896 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012897 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012898 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012899 int kind;
12900 void *data;
12901 Py_UCS4 chr;
12902
Martin v. Löwis18e16552006-02-15 17:27:45 +000012903 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012904 return NULL;
12905
Benjamin Petersonbac79492012-01-14 13:34:47 -050012906 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012907 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012908
Victor Stinnerc4b49542011-12-11 22:44:26 +010012909 if (PyUnicode_GET_LENGTH(self) >= width)
12910 return unicode_result_unchanged(self);
12911
12912 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012913
12914 u = pad(self, fill, 0, '0');
12915
Walter Dörwald068325e2002-04-15 13:36:47 +000012916 if (u == NULL)
12917 return NULL;
12918
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012919 kind = PyUnicode_KIND(u);
12920 data = PyUnicode_DATA(u);
12921 chr = PyUnicode_READ(kind, data, fill);
12922
12923 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012924 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012925 PyUnicode_WRITE(kind, data, 0, chr);
12926 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012927 }
12928
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012929 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012930 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012931}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012932
#if 0
/* Compiled out by the surrounding #if 0.  Forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
12940
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012941PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012942 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012943\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012944Return True if S starts with the specified prefix, False otherwise.\n\
12945With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012946With optional end, stop comparing S at that position.\n\
12947prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012948
12949static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012950unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012951 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012952{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012953 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012954 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012955 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012956 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012957 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012958
Jesus Ceaac451502011-04-20 17:09:23 +020012959 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012960 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012961 if (PyTuple_Check(subobj)) {
12962 Py_ssize_t i;
12963 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012964 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012965 if (substring == NULL)
12966 return NULL;
12967 result = tailmatch(self, substring, start, end, -1);
12968 Py_DECREF(substring);
12969 if (result) {
12970 Py_RETURN_TRUE;
12971 }
12972 }
12973 /* nothing matched */
12974 Py_RETURN_FALSE;
12975 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012976 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012977 if (substring == NULL) {
12978 if (PyErr_ExceptionMatches(PyExc_TypeError))
12979 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12980 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012981 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012982 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012983 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012984 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012985 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012986}
12987
12988
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012989PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012990 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012991\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012992Return True if S ends with the specified suffix, False otherwise.\n\
12993With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012994With optional end, stop comparing S at that position.\n\
12995suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012996
12997static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012998unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012999 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013000{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013001 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013002 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013003 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013004 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013005 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013006
Jesus Ceaac451502011-04-20 17:09:23 +020013007 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013008 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013009 if (PyTuple_Check(subobj)) {
13010 Py_ssize_t i;
13011 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013012 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000013013 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013014 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013015 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013016 result = tailmatch(self, substring, start, end, +1);
13017 Py_DECREF(substring);
13018 if (result) {
13019 Py_RETURN_TRUE;
13020 }
13021 }
13022 Py_RETURN_FALSE;
13023 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013024 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013025 if (substring == NULL) {
13026 if (PyErr_ExceptionMatches(PyExc_TypeError))
13027 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
13028 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013029 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013030 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013031 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013032 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013033 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013034}
13035
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013036#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013037
13038PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013039 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013040\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013041Return a formatted version of S, using substitutions from args and kwargs.\n\
13042The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013043
Eric Smith27bbca62010-11-04 17:06:58 +000013044PyDoc_STRVAR(format_map__doc__,
13045 "S.format_map(mapping) -> str\n\
13046\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013047Return a formatted version of S, using substitutions from mapping.\n\
13048The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013049
Eric Smith4a7d76d2008-05-30 18:10:19 +000013050static PyObject *
13051unicode__format__(PyObject* self, PyObject* args)
13052{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013053 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013054
13055 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13056 return NULL;
13057
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013058 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013059 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013060 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013061}
13062
Eric Smith8c663262007-08-25 02:26:07 +000013063PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013064 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013065\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013066Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013067
13068static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013069unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013070{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013071 Py_ssize_t size;
13072
13073 /* If it's a compact object, account for base structure +
13074 character data. */
13075 if (PyUnicode_IS_COMPACT_ASCII(v))
13076 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13077 else if (PyUnicode_IS_COMPACT(v))
13078 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013079 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013080 else {
13081 /* If it is a two-block object, account for base object, and
13082 for character block if present. */
13083 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013084 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013085 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013086 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013087 }
13088 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013089 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013090 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013091 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013092 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013093 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013094
13095 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013096}
13097
13098PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013099 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013100
13101static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013102unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013103{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013104 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013105 if (!copy)
13106 return NULL;
13107 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013108}
13109
Guido van Rossumd57fd912000-03-10 22:53:23 +000013110static PyMethodDef unicode_methods[] = {
13111
13112 /* Order is according to common usage: often used methods should
13113 appear first, since lookup is done sequentially. */
13114
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013115 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013116 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
13117 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013118 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013119 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13120 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013121 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013122 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13123 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13124 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
13125 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
13126 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013127 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013128 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13129 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13130 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013131 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013132 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13133 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13134 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013135 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013136 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010013137 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013138 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013139 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13140 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13141 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13142 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13143 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13144 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13145 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13146 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13147 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13148 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13149 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13150 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13151 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13152 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013153 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013154 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013155 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013156 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013157 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013158 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000013159 {"maketrans", (PyCFunction) unicode_maketrans,
13160 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013161 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013162#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013163 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013164 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013165#endif
13166
Benjamin Peterson14339b62009-01-31 16:36:08 +000013167 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013168 {NULL, NULL}
13169};
13170
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013171static PyObject *
13172unicode_mod(PyObject *v, PyObject *w)
13173{
Brian Curtindfc80e32011-08-10 20:28:54 -050013174 if (!PyUnicode_Check(v))
13175 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013176 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013177}
13178
13179static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013180 0, /*nb_add*/
13181 0, /*nb_subtract*/
13182 0, /*nb_multiply*/
13183 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013184};
13185
Guido van Rossumd57fd912000-03-10 22:53:23 +000013186static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013187 (lenfunc) unicode_length, /* sq_length */
13188 PyUnicode_Concat, /* sq_concat */
13189 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13190 (ssizeargfunc) unicode_getitem, /* sq_item */
13191 0, /* sq_slice */
13192 0, /* sq_ass_item */
13193 0, /* sq_ass_slice */
13194 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013195};
13196
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013197static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013198unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013199{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013200 if (PyUnicode_READY(self) == -1)
13201 return NULL;
13202
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013203 if (PyIndex_Check(item)) {
13204 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013205 if (i == -1 && PyErr_Occurred())
13206 return NULL;
13207 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013208 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013209 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013210 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013211 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013212 PyObject *result;
13213 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013214 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013215 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013216
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013217 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013218 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013219 return NULL;
13220 }
13221
13222 if (slicelength <= 0) {
Victor Stinner382955f2011-12-11 21:44:00 +010013223 Py_INCREF(unicode_empty);
13224 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013225 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013226 slicelength == PyUnicode_GET_LENGTH(self)) {
13227 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013228 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013229 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013230 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013231 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013232 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013233 src_kind = PyUnicode_KIND(self);
13234 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013235 if (!PyUnicode_IS_ASCII(self)) {
13236 kind_limit = kind_maxchar_limit(src_kind);
13237 max_char = 0;
13238 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13239 ch = PyUnicode_READ(src_kind, src_data, cur);
13240 if (ch > max_char) {
13241 max_char = ch;
13242 if (max_char >= kind_limit)
13243 break;
13244 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013245 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013246 }
Victor Stinner55c99112011-10-13 01:17:06 +020013247 else
13248 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013249 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013250 if (result == NULL)
13251 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013252 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013253 dest_data = PyUnicode_DATA(result);
13254
13255 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013256 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13257 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013258 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013259 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013260 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013261 } else {
13262 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13263 return NULL;
13264 }
13265}
13266
13267static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013268 (lenfunc)unicode_length, /* mp_length */
13269 (binaryfunc)unicode_subscript, /* mp_subscript */
13270 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013271};
13272
Guido van Rossumd57fd912000-03-10 22:53:23 +000013273
Guido van Rossumd57fd912000-03-10 22:53:23 +000013274/* Helpers for PyUnicode_Format() */
13275
13276static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000013277getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013278{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013279 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013280 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013281 (*p_argidx)++;
13282 if (arglen < 0)
13283 return args;
13284 else
13285 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013286 }
13287 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013288 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013289 return NULL;
13290}
13291
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013292/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013293
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013294static PyObject *
13295formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013296{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013297 char *p;
13298 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013299 double x;
Tim Petersced69f82003-09-16 20:30:58 +000013300
Guido van Rossumd57fd912000-03-10 22:53:23 +000013301 x = PyFloat_AsDouble(v);
13302 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013303 return NULL;
13304
Guido van Rossumd57fd912000-03-10 22:53:23 +000013305 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013306 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013307
Eric Smith0923d1d2009-04-16 20:16:10 +000013308 p = PyOS_double_to_string(x, type, prec,
13309 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013310 if (p == NULL)
13311 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013312 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000013313 PyMem_Free(p);
13314 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013315}
13316
Tim Peters38fd5b62000-09-21 05:43:11 +000013317static PyObject*
13318formatlong(PyObject *val, int flags, int prec, int type)
13319{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013320 char *buf;
13321 int len;
13322 PyObject *str; /* temporary string object. */
13323 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013324
Benjamin Peterson14339b62009-01-31 16:36:08 +000013325 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
13326 if (!str)
13327 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013328 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013329 Py_DECREF(str);
13330 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013331}
13332
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013333static Py_UCS4
13334formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013335{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013336 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013337 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013338 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013339 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013340 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013341 goto onError;
13342 }
13343 else {
13344 /* Integer input truncated to a character */
13345 long x;
13346 x = PyLong_AsLong(v);
13347 if (x == -1 && PyErr_Occurred())
13348 goto onError;
13349
Victor Stinner8faf8212011-12-08 22:14:11 +010013350 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013351 PyErr_SetString(PyExc_OverflowError,
13352 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013353 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013354 }
13355
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013356 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013357 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013358
Benjamin Peterson29060642009-01-31 22:14:21 +000013359 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013360 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013361 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013362 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013363}
13364
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013365static int
13366repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
13367{
13368 int r;
13369 assert(count > 0);
13370 assert(PyUnicode_Check(obj));
13371 if (count > 5) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013372 PyObject *repeated = unicode_repeat(obj, count);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013373 if (repeated == NULL)
13374 return -1;
13375 r = _PyAccu_Accumulate(acc, repeated);
13376 Py_DECREF(repeated);
13377 return r;
13378 }
13379 else {
13380 do {
13381 if (_PyAccu_Accumulate(acc, obj))
13382 return -1;
13383 } while (--count);
13384 return 0;
13385 }
13386}
13387
Alexander Belopolsky40018472011-02-26 01:02:56 +000013388PyObject *
13389PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013390{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013391 void *fmt;
13392 int fmtkind;
13393 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013394 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013395 int r;
13396 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013397 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013398 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013399 PyObject *temp = NULL;
13400 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013401 PyObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013402 _PyAccu acc;
13403 static PyObject *plus, *minus, *blank, *zero, *percent;
13404
13405 if (!plus && !(plus = get_latin1_char('+')))
13406 return NULL;
13407 if (!minus && !(minus = get_latin1_char('-')))
13408 return NULL;
13409 if (!blank && !(blank = get_latin1_char(' ')))
13410 return NULL;
13411 if (!zero && !(zero = get_latin1_char('0')))
13412 return NULL;
13413 if (!percent && !(percent = get_latin1_char('%')))
13414 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000013415
Guido van Rossumd57fd912000-03-10 22:53:23 +000013416 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013417 PyErr_BadInternalCall();
13418 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013419 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013420 uformat = PyUnicode_FromObject(format);
Benjamin Peterson22a29702012-01-02 09:00:30 -060013421 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013422 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060013423 if (PyUnicode_READY(uformat) == -1)
13424 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013425 if (_PyAccu_Init(&acc))
13426 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013427 fmt = PyUnicode_DATA(uformat);
13428 fmtkind = PyUnicode_KIND(uformat);
13429 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13430 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013431
Guido van Rossumd57fd912000-03-10 22:53:23 +000013432 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013433 arglen = PyTuple_Size(args);
13434 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013435 }
13436 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013437 arglen = -1;
13438 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013439 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013440 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013441 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013442 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013443
13444 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013445 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013446 PyObject *nonfmt;
13447 Py_ssize_t nonfmtpos;
13448 nonfmtpos = fmtpos++;
13449 while (fmtcnt >= 0 &&
13450 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13451 fmtpos++;
13452 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013453 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013454 nonfmt = PyUnicode_Substring(uformat, nonfmtpos, fmtpos);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013455 if (nonfmt == NULL)
13456 goto onError;
13457 r = _PyAccu_Accumulate(&acc, nonfmt);
13458 Py_DECREF(nonfmt);
13459 if (r)
13460 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013461 }
13462 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013463 /* Got a format specifier */
13464 int flags = 0;
13465 Py_ssize_t width = -1;
13466 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013467 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013468 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000013469 int isnumok;
13470 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013471 void *pbuf = NULL;
13472 Py_ssize_t pindex, len;
13473 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013474
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013475 fmtpos++;
13476 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
13477 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013478 Py_ssize_t keylen;
13479 PyObject *key;
13480 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013481
Benjamin Peterson29060642009-01-31 22:14:21 +000013482 if (dict == NULL) {
13483 PyErr_SetString(PyExc_TypeError,
13484 "format requires a mapping");
13485 goto onError;
13486 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013487 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013488 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013489 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013490 /* Skip over balanced parentheses */
13491 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013492 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013493 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013494 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013495 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013496 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013497 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013498 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013499 if (fmtcnt < 0 || pcount > 0) {
13500 PyErr_SetString(PyExc_ValueError,
13501 "incomplete format key");
13502 goto onError;
13503 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013504 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013505 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013506 if (key == NULL)
13507 goto onError;
13508 if (args_owned) {
13509 Py_DECREF(args);
13510 args_owned = 0;
13511 }
13512 args = PyObject_GetItem(dict, key);
13513 Py_DECREF(key);
13514 if (args == NULL) {
13515 goto onError;
13516 }
13517 args_owned = 1;
13518 arglen = -1;
13519 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013520 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013521 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013522 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013523 case '-': flags |= F_LJUST; continue;
13524 case '+': flags |= F_SIGN; continue;
13525 case ' ': flags |= F_BLANK; continue;
13526 case '#': flags |= F_ALT; continue;
13527 case '0': flags |= F_ZERO; continue;
13528 }
13529 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013530 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013531 if (c == '*') {
13532 v = getnextarg(args, arglen, &argidx);
13533 if (v == NULL)
13534 goto onError;
13535 if (!PyLong_Check(v)) {
13536 PyErr_SetString(PyExc_TypeError,
13537 "* wants int");
13538 goto onError;
13539 }
13540 width = PyLong_AsLong(v);
13541 if (width == -1 && PyErr_Occurred())
13542 goto onError;
13543 if (width < 0) {
13544 flags |= F_LJUST;
13545 width = -width;
13546 }
13547 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013548 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013549 }
13550 else if (c >= '0' && c <= '9') {
13551 width = c - '0';
13552 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013553 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013554 if (c < '0' || c > '9')
13555 break;
13556 if ((width*10) / 10 != width) {
13557 PyErr_SetString(PyExc_ValueError,
13558 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013559 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013560 }
13561 width = width*10 + (c - '0');
13562 }
13563 }
13564 if (c == '.') {
13565 prec = 0;
13566 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013567 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013568 if (c == '*') {
13569 v = getnextarg(args, arglen, &argidx);
13570 if (v == NULL)
13571 goto onError;
13572 if (!PyLong_Check(v)) {
13573 PyErr_SetString(PyExc_TypeError,
13574 "* wants int");
13575 goto onError;
13576 }
13577 prec = PyLong_AsLong(v);
13578 if (prec == -1 && PyErr_Occurred())
13579 goto onError;
13580 if (prec < 0)
13581 prec = 0;
13582 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013583 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013584 }
13585 else if (c >= '0' && c <= '9') {
13586 prec = c - '0';
13587 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013588 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013589 if (c < '0' || c > '9')
13590 break;
13591 if ((prec*10) / 10 != prec) {
13592 PyErr_SetString(PyExc_ValueError,
13593 "prec too big");
13594 goto onError;
13595 }
13596 prec = prec*10 + (c - '0');
13597 }
13598 }
13599 } /* prec */
13600 if (fmtcnt >= 0) {
13601 if (c == 'h' || c == 'l' || c == 'L') {
13602 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013603 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013604 }
13605 }
13606 if (fmtcnt < 0) {
13607 PyErr_SetString(PyExc_ValueError,
13608 "incomplete format");
13609 goto onError;
13610 }
13611 if (c != '%') {
13612 v = getnextarg(args, arglen, &argidx);
13613 if (v == NULL)
13614 goto onError;
13615 }
13616 sign = 0;
13617 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013618 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013619 switch (c) {
13620
13621 case '%':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013622 _PyAccu_Accumulate(&acc, percent);
13623 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013624
13625 case 's':
13626 case 'r':
13627 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013628 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013629 temp = v;
13630 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013631 }
13632 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013633 if (c == 's')
13634 temp = PyObject_Str(v);
13635 else if (c == 'r')
13636 temp = PyObject_Repr(v);
13637 else
13638 temp = PyObject_ASCII(v);
13639 if (temp == NULL)
13640 goto onError;
13641 if (PyUnicode_Check(temp))
13642 /* nothing to do */;
13643 else {
13644 Py_DECREF(temp);
13645 PyErr_SetString(PyExc_TypeError,
13646 "%s argument has non-string str()");
13647 goto onError;
13648 }
13649 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013650 if (PyUnicode_READY(temp) == -1) {
13651 Py_CLEAR(temp);
13652 goto onError;
13653 }
13654 pbuf = PyUnicode_DATA(temp);
13655 kind = PyUnicode_KIND(temp);
13656 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013657 if (prec >= 0 && len > prec)
13658 len = prec;
13659 break;
13660
13661 case 'i':
13662 case 'd':
13663 case 'u':
13664 case 'o':
13665 case 'x':
13666 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013667 isnumok = 0;
13668 if (PyNumber_Check(v)) {
13669 PyObject *iobj=NULL;
13670
13671 if (PyLong_Check(v)) {
13672 iobj = v;
13673 Py_INCREF(iobj);
13674 }
13675 else {
13676 iobj = PyNumber_Long(v);
13677 }
13678 if (iobj!=NULL) {
13679 if (PyLong_Check(iobj)) {
13680 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013681 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013682 Py_DECREF(iobj);
13683 if (!temp)
13684 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013685 if (PyUnicode_READY(temp) == -1) {
13686 Py_CLEAR(temp);
13687 goto onError;
13688 }
13689 pbuf = PyUnicode_DATA(temp);
13690 kind = PyUnicode_KIND(temp);
13691 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013692 sign = 1;
13693 }
13694 else {
13695 Py_DECREF(iobj);
13696 }
13697 }
13698 }
13699 if (!isnumok) {
13700 PyErr_Format(PyExc_TypeError,
13701 "%%%c format: a number is required, "
13702 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13703 goto onError;
13704 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013705 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013706 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013707 fillobj = zero;
13708 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013709 break;
13710
13711 case 'e':
13712 case 'E':
13713 case 'f':
13714 case 'F':
13715 case 'g':
13716 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013717 temp = formatfloat(v, flags, prec, c);
13718 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013719 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013720 if (PyUnicode_READY(temp) == -1) {
13721 Py_CLEAR(temp);
13722 goto onError;
13723 }
13724 pbuf = PyUnicode_DATA(temp);
13725 kind = PyUnicode_KIND(temp);
13726 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013727 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013728 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013729 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013730 fillobj = zero;
13731 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013732 break;
13733
13734 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013735 {
13736 Py_UCS4 ch = formatchar(v);
13737 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013738 goto onError;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013739 temp = _PyUnicode_FromUCS4(&ch, 1);
13740 if (temp == NULL)
13741 goto onError;
13742 pbuf = PyUnicode_DATA(temp);
13743 kind = PyUnicode_KIND(temp);
13744 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013745 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013746 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013747
13748 default:
13749 PyErr_Format(PyExc_ValueError,
13750 "unsupported format character '%c' (0x%x) "
13751 "at index %zd",
13752 (31<=c && c<=126) ? (char)c : '?',
13753 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013754 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013755 goto onError;
13756 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013757 /* pbuf is initialized here. */
13758 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013759 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013760 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13761 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013762 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013763 pindex++;
13764 }
13765 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13766 signobj = plus;
13767 len--;
13768 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013769 }
13770 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013771 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013772 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013773 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013774 else
13775 sign = 0;
13776 }
13777 if (width < len)
13778 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013779 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013780 if (fill != ' ') {
13781 assert(signobj != NULL);
13782 if (_PyAccu_Accumulate(&acc, signobj))
13783 goto onError;
13784 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013785 if (width > len)
13786 width--;
13787 }
13788 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013789 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013790 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013791 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013792 second = get_latin1_char(
13793 PyUnicode_READ(kind, pbuf, pindex + 1));
13794 pindex += 2;
13795 if (second == NULL ||
13796 _PyAccu_Accumulate(&acc, zero) ||
13797 _PyAccu_Accumulate(&acc, second))
13798 goto onError;
13799 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000013800 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013801 width -= 2;
13802 if (width < 0)
13803 width = 0;
13804 len -= 2;
13805 }
13806 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013807 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013808 if (repeat_accumulate(&acc, fillobj, width - len))
13809 goto onError;
13810 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013811 }
13812 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013813 if (sign) {
13814 assert(signobj != NULL);
13815 if (_PyAccu_Accumulate(&acc, signobj))
13816 goto onError;
13817 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013818 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013819 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13820 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013821 second = get_latin1_char(
13822 PyUnicode_READ(kind, pbuf, pindex + 1));
13823 pindex += 2;
13824 if (second == NULL ||
13825 _PyAccu_Accumulate(&acc, zero) ||
13826 _PyAccu_Accumulate(&acc, second))
13827 goto onError;
13828 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013829 }
13830 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013831 /* Copy all characters, preserving len */
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013832 if (temp != NULL) {
13833 assert(pbuf == PyUnicode_DATA(temp));
13834 v = PyUnicode_Substring(temp, pindex, pindex + len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013835 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013836 else {
13837 const char *p = (const char *) pbuf;
13838 assert(pbuf != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013839 p += kind * pindex;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013840 v = PyUnicode_FromKindAndData(kind, p, len);
13841 }
13842 if (v == NULL)
13843 goto onError;
13844 r = _PyAccu_Accumulate(&acc, v);
13845 Py_DECREF(v);
13846 if (r)
13847 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013848 if (width > len && repeat_accumulate(&acc, blank, width - len))
13849 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013850 if (dict && (argidx < arglen) && c != '%') {
13851 PyErr_SetString(PyExc_TypeError,
13852 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013853 goto onError;
13854 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013855 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013856 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013857 } /* until end */
13858 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013859 PyErr_SetString(PyExc_TypeError,
13860 "not all arguments converted during string formatting");
13861 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013862 }
13863
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013864 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013865 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013866 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013867 }
13868 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013869 Py_XDECREF(temp);
13870 Py_XDECREF(second);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013871 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013872
Benjamin Peterson29060642009-01-31 22:14:21 +000013873 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013874 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013875 Py_XDECREF(temp);
13876 Py_XDECREF(second);
13877 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013878 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013879 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013880 }
13881 return NULL;
13882}
13883
Jeremy Hylton938ace62002-07-17 16:30:39 +000013884static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013885unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13886
Tim Peters6d6c1a32001-08-02 04:15:00 +000013887static PyObject *
13888unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13889{
Benjamin Peterson29060642009-01-31 22:14:21 +000013890 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013891 static char *kwlist[] = {"object", "encoding", "errors", 0};
13892 char *encoding = NULL;
13893 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013894
Benjamin Peterson14339b62009-01-31 16:36:08 +000013895 if (type != &PyUnicode_Type)
13896 return unicode_subtype_new(type, args, kwds);
13897 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013898 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013899 return NULL;
Victor Stinner382955f2011-12-11 21:44:00 +010013900 if (x == NULL) {
13901 Py_INCREF(unicode_empty);
13902 return unicode_empty;
13903 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013904 if (encoding == NULL && errors == NULL)
13905 return PyObject_Str(x);
13906 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013907 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013908}
13909
Guido van Rossume023fe02001-08-30 03:12:59 +000013910static PyObject *
13911unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13912{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013913 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013914 Py_ssize_t length, char_size;
13915 int share_wstr, share_utf8;
13916 unsigned int kind;
13917 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013918
Benjamin Peterson14339b62009-01-31 16:36:08 +000013919 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013920
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013921 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013922 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013923 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013924 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050013925 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060013926 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013927 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060013928 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013929
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013930 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013931 if (self == NULL) {
13932 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013933 return NULL;
13934 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013935 kind = PyUnicode_KIND(unicode);
13936 length = PyUnicode_GET_LENGTH(unicode);
13937
13938 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013939#ifdef Py_DEBUG
13940 _PyUnicode_HASH(self) = -1;
13941#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013942 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013943#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013944 _PyUnicode_STATE(self).interned = 0;
13945 _PyUnicode_STATE(self).kind = kind;
13946 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013947 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013948 _PyUnicode_STATE(self).ready = 1;
13949 _PyUnicode_WSTR(self) = NULL;
13950 _PyUnicode_UTF8_LENGTH(self) = 0;
13951 _PyUnicode_UTF8(self) = NULL;
13952 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013953 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013954
13955 share_utf8 = 0;
13956 share_wstr = 0;
13957 if (kind == PyUnicode_1BYTE_KIND) {
13958 char_size = 1;
13959 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13960 share_utf8 = 1;
13961 }
13962 else if (kind == PyUnicode_2BYTE_KIND) {
13963 char_size = 2;
13964 if (sizeof(wchar_t) == 2)
13965 share_wstr = 1;
13966 }
13967 else {
13968 assert(kind == PyUnicode_4BYTE_KIND);
13969 char_size = 4;
13970 if (sizeof(wchar_t) == 4)
13971 share_wstr = 1;
13972 }
13973
13974 /* Ensure we won't overflow the length. */
13975 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13976 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013977 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013978 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013979 data = PyObject_MALLOC((length + 1) * char_size);
13980 if (data == NULL) {
13981 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013982 goto onError;
13983 }
13984
Victor Stinnerc3c74152011-10-02 20:39:55 +020013985 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013986 if (share_utf8) {
13987 _PyUnicode_UTF8_LENGTH(self) = length;
13988 _PyUnicode_UTF8(self) = data;
13989 }
13990 if (share_wstr) {
13991 _PyUnicode_WSTR_LENGTH(self) = length;
13992 _PyUnicode_WSTR(self) = (wchar_t *)data;
13993 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013994
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013995 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013996 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013997 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013998#ifdef Py_DEBUG
13999 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14000#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014001 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014002 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014003
14004onError:
14005 Py_DECREF(unicode);
14006 Py_DECREF(self);
14007 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014008}
14009
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014010PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000014011 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014012\n\
Collin Winterd474ce82007-08-07 19:42:11 +000014013Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000014014encoding defaults to the current default string encoding.\n\
14015errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014016
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014017static PyObject *unicode_iter(PyObject *seq);
14018
Guido van Rossumd57fd912000-03-10 22:53:23 +000014019PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014020 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014021 "str", /* tp_name */
14022 sizeof(PyUnicodeObject), /* tp_size */
14023 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014024 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014025 (destructor)unicode_dealloc, /* tp_dealloc */
14026 0, /* tp_print */
14027 0, /* tp_getattr */
14028 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014029 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014030 unicode_repr, /* tp_repr */
14031 &unicode_as_number, /* tp_as_number */
14032 &unicode_as_sequence, /* tp_as_sequence */
14033 &unicode_as_mapping, /* tp_as_mapping */
14034 (hashfunc) unicode_hash, /* tp_hash*/
14035 0, /* tp_call*/
14036 (reprfunc) unicode_str, /* tp_str */
14037 PyObject_GenericGetAttr, /* tp_getattro */
14038 0, /* tp_setattro */
14039 0, /* tp_as_buffer */
14040 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014041 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014042 unicode_doc, /* tp_doc */
14043 0, /* tp_traverse */
14044 0, /* tp_clear */
14045 PyUnicode_RichCompare, /* tp_richcompare */
14046 0, /* tp_weaklistoffset */
14047 unicode_iter, /* tp_iter */
14048 0, /* tp_iternext */
14049 unicode_methods, /* tp_methods */
14050 0, /* tp_members */
14051 0, /* tp_getset */
14052 &PyBaseObject_Type, /* tp_base */
14053 0, /* tp_dict */
14054 0, /* tp_descr_get */
14055 0, /* tp_descr_set */
14056 0, /* tp_dictoffset */
14057 0, /* tp_init */
14058 0, /* tp_alloc */
14059 unicode_new, /* tp_new */
14060 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014061};
14062
14063/* Initialize the Unicode implementation */
14064
Victor Stinner3a50e702011-10-18 21:21:00 +020014065int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014066{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014067 int i;
14068
Thomas Wouters477c8d52006-05-27 19:21:47 +000014069 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014070 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014071 0x000A, /* LINE FEED */
14072 0x000D, /* CARRIAGE RETURN */
14073 0x001C, /* FILE SEPARATOR */
14074 0x001D, /* GROUP SEPARATOR */
14075 0x001E, /* RECORD SEPARATOR */
14076 0x0085, /* NEXT LINE */
14077 0x2028, /* LINE SEPARATOR */
14078 0x2029, /* PARAGRAPH SEPARATOR */
14079 };
14080
Fred Drakee4315f52000-05-09 19:53:39 +000014081 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020014082 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014083 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014084 Py_FatalError("Can't create empty string");
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010014085 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014086
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014087 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000014088 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000014089 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014090 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014091
14092 /* initialize the linebreak bloom filter */
14093 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014094 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014095 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014096
14097 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020014098
14099#ifdef HAVE_MBCS
14100 winver.dwOSVersionInfoSize = sizeof(winver);
14101 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14102 PyErr_SetFromWindowsErr(0);
14103 return -1;
14104 }
14105#endif
14106 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014107}
14108
14109/* Finalize the Unicode implementation */
14110
/* Performs no work and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int result = 0;

    return result;
}
14116
Guido van Rossumd57fd912000-03-10 22:53:23 +000014117void
Thomas Wouters78890102000-07-22 19:25:51 +000014118_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014119{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014120 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014121
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000014122 Py_XDECREF(unicode_empty);
14123 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014124
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014125 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014126 if (unicode_latin1[i]) {
14127 Py_DECREF(unicode_latin1[i]);
14128 unicode_latin1[i] = NULL;
14129 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014130 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014131 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014132 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014133}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014134
Walter Dörwald16807132007-05-25 13:52:07 +000014135void
14136PyUnicode_InternInPlace(PyObject **p)
14137{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014138 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014139 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014140#ifdef Py_DEBUG
14141 assert(s != NULL);
14142 assert(_PyUnicode_CHECK(s));
14143#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014144 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014145 return;
14146#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014147 /* If it's a subclass, we don't really know what putting
14148 it in the interned dict might do. */
14149 if (!PyUnicode_CheckExact(s))
14150 return;
14151 if (PyUnicode_CHECK_INTERNED(s))
14152 return;
14153 if (interned == NULL) {
14154 interned = PyDict_New();
14155 if (interned == NULL) {
14156 PyErr_Clear(); /* Don't leave an exception */
14157 return;
14158 }
14159 }
14160 /* It might be that the GetItem call fails even
14161 though the key is present in the dictionary,
14162 namely when this happens during a stack overflow. */
14163 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014164 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014165 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014166
Benjamin Peterson29060642009-01-31 22:14:21 +000014167 if (t) {
14168 Py_INCREF(t);
14169 Py_DECREF(*p);
14170 *p = t;
14171 return;
14172 }
Walter Dörwald16807132007-05-25 13:52:07 +000014173
Benjamin Peterson14339b62009-01-31 16:36:08 +000014174 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014175 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014176 PyErr_Clear();
14177 PyThreadState_GET()->recursion_critical = 0;
14178 return;
14179 }
14180 PyThreadState_GET()->recursion_critical = 0;
14181 /* The two references in interned are not counted by refcnt.
14182 The deallocator will take care of this */
14183 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014184 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014185}
14186
14187void
14188PyUnicode_InternImmortal(PyObject **p)
14189{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014190 PyUnicode_InternInPlace(p);
14191 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014192 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014193 Py_INCREF(*p);
14194 }
Walter Dörwald16807132007-05-25 13:52:07 +000014195}
14196
14197PyObject *
14198PyUnicode_InternFromString(const char *cp)
14199{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014200 PyObject *s = PyUnicode_FromString(cp);
14201 if (s == NULL)
14202 return NULL;
14203 PyUnicode_InternInPlace(&s);
14204 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014205}
14206
Alexander Belopolsky40018472011-02-26 01:02:56 +000014207void
14208_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014209{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014210 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014211 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014212 Py_ssize_t i, n;
14213 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014214
Benjamin Peterson14339b62009-01-31 16:36:08 +000014215 if (interned == NULL || !PyDict_Check(interned))
14216 return;
14217 keys = PyDict_Keys(interned);
14218 if (keys == NULL || !PyList_Check(keys)) {
14219 PyErr_Clear();
14220 return;
14221 }
Walter Dörwald16807132007-05-25 13:52:07 +000014222
Benjamin Peterson14339b62009-01-31 16:36:08 +000014223 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14224 detector, interned unicode strings are not forcibly deallocated;
14225 rather, we give them their stolen references back, and then clear
14226 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014227
Benjamin Peterson14339b62009-01-31 16:36:08 +000014228 n = PyList_GET_SIZE(keys);
14229 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014230 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014231 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014232 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014233 if (PyUnicode_READY(s) == -1) {
14234 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014235 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014236 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014237 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014238 case SSTATE_NOT_INTERNED:
14239 /* XXX Shouldn't happen */
14240 break;
14241 case SSTATE_INTERNED_IMMORTAL:
14242 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014243 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014244 break;
14245 case SSTATE_INTERNED_MORTAL:
14246 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014247 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014248 break;
14249 default:
14250 Py_FatalError("Inconsistent interned string state.");
14251 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014252 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014253 }
14254 fprintf(stderr, "total size of all interned strings: "
14255 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14256 "mortal/immortal\n", mortal_size, immortal_size);
14257 Py_DECREF(keys);
14258 PyDict_Clear(interned);
14259 Py_DECREF(interned);
14260 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000014261}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014262
14263
14264/********************* Unicode Iterator **************************/
14265
14266typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014267 PyObject_HEAD
14268 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014269 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014270} unicodeiterobject;
14271
14272static void
14273unicodeiter_dealloc(unicodeiterobject *it)
14274{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014275 _PyObject_GC_UNTRACK(it);
14276 Py_XDECREF(it->it_seq);
14277 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014278}
14279
14280static int
14281unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14282{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014283 Py_VISIT(it->it_seq);
14284 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014285}
14286
14287static PyObject *
14288unicodeiter_next(unicodeiterobject *it)
14289{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014290 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014291
Benjamin Peterson14339b62009-01-31 16:36:08 +000014292 assert(it != NULL);
14293 seq = it->it_seq;
14294 if (seq == NULL)
14295 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014296 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014297
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014298 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14299 int kind = PyUnicode_KIND(seq);
14300 void *data = PyUnicode_DATA(seq);
14301 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14302 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014303 if (item != NULL)
14304 ++it->it_index;
14305 return item;
14306 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014307
Benjamin Peterson14339b62009-01-31 16:36:08 +000014308 Py_DECREF(seq);
14309 it->it_seq = NULL;
14310 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014311}
14312
14313static PyObject *
14314unicodeiter_len(unicodeiterobject *it)
14315{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014316 Py_ssize_t len = 0;
14317 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014318 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014319 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014320}
14321
14322PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14323
14324static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014325 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014326 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014327 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014328};
14329
14330PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014331 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14332 "str_iterator", /* tp_name */
14333 sizeof(unicodeiterobject), /* tp_basicsize */
14334 0, /* tp_itemsize */
14335 /* methods */
14336 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14337 0, /* tp_print */
14338 0, /* tp_getattr */
14339 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014340 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014341 0, /* tp_repr */
14342 0, /* tp_as_number */
14343 0, /* tp_as_sequence */
14344 0, /* tp_as_mapping */
14345 0, /* tp_hash */
14346 0, /* tp_call */
14347 0, /* tp_str */
14348 PyObject_GenericGetAttr, /* tp_getattro */
14349 0, /* tp_setattro */
14350 0, /* tp_as_buffer */
14351 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14352 0, /* tp_doc */
14353 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14354 0, /* tp_clear */
14355 0, /* tp_richcompare */
14356 0, /* tp_weaklistoffset */
14357 PyObject_SelfIter, /* tp_iter */
14358 (iternextfunc)unicodeiter_next, /* tp_iternext */
14359 unicodeiter_methods, /* tp_methods */
14360 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014361};
14362
14363static PyObject *
14364unicode_iter(PyObject *seq)
14365{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014366 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014367
Benjamin Peterson14339b62009-01-31 16:36:08 +000014368 if (!PyUnicode_Check(seq)) {
14369 PyErr_BadInternalCall();
14370 return NULL;
14371 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014372 if (PyUnicode_READY(seq) == -1)
14373 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014374 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14375 if (it == NULL)
14376 return NULL;
14377 it->it_index = 0;
14378 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014379 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014380 _PyObject_GC_TRACK(it);
14381 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014382}
14383
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014384
14385size_t
14386Py_UNICODE_strlen(const Py_UNICODE *u)
14387{
14388 int res = 0;
14389 while(*u++)
14390 res++;
14391 return res;
14392}
14393
14394Py_UNICODE*
14395Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14396{
14397 Py_UNICODE *u = s1;
14398 while ((*u++ = *s2++));
14399 return s1;
14400}
14401
14402Py_UNICODE*
14403Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14404{
14405 Py_UNICODE *u = s1;
14406 while ((*u++ = *s2++))
14407 if (n-- == 0)
14408 break;
14409 return s1;
14410}
14411
14412Py_UNICODE*
14413Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14414{
14415 Py_UNICODE *u1 = s1;
14416 u1 += Py_UNICODE_strlen(u1);
14417 Py_UNICODE_strcpy(u1, s2);
14418 return s1;
14419}
14420
14421int
14422Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14423{
14424 while (*s1 && *s2 && *s1 == *s2)
14425 s1++, s2++;
14426 if (*s1 && *s2)
14427 return (*s1 < *s2) ? -1 : +1;
14428 if (*s1)
14429 return 1;
14430 if (*s2)
14431 return -1;
14432 return 0;
14433}
14434
14435int
14436Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14437{
14438 register Py_UNICODE u1, u2;
14439 for (; n != 0; n--) {
14440 u1 = *s1;
14441 u2 = *s2;
14442 if (u1 != u2)
14443 return (u1 < u2) ? -1 : +1;
14444 if (u1 == '\0')
14445 return 0;
14446 s1++;
14447 s2++;
14448 }
14449 return 0;
14450}
14451
14452Py_UNICODE*
14453Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14454{
14455 const Py_UNICODE *p;
14456 for (p = s; *p; p++)
14457 if (*p == c)
14458 return (Py_UNICODE*)p;
14459 return NULL;
14460}
14461
14462Py_UNICODE*
14463Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14464{
14465 const Py_UNICODE *p;
14466 p = s + Py_UNICODE_strlen(s);
14467 while (p != s) {
14468 p--;
14469 if (*p == c)
14470 return (Py_UNICODE*)p;
14471 }
14472 return NULL;
14473}
Victor Stinner331ea922010-08-10 16:37:20 +000014474
Victor Stinner71133ff2010-09-01 23:43:53 +000014475Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014476PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014477{
Victor Stinner577db2c2011-10-11 22:12:48 +020014478 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014479 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014480
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014481 if (!PyUnicode_Check(unicode)) {
14482 PyErr_BadArgument();
14483 return NULL;
14484 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014485 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014486 if (u == NULL)
14487 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014488 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014489 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014490 PyErr_NoMemory();
14491 return NULL;
14492 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014493 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014494 size *= sizeof(Py_UNICODE);
14495 copy = PyMem_Malloc(size);
14496 if (copy == NULL) {
14497 PyErr_NoMemory();
14498 return NULL;
14499 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014500 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014501 return copy;
14502}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014503
Georg Brandl66c221e2010-10-14 07:04:07 +000014504/* A _string module, to export formatter_parser and formatter_field_name_split
14505 to the string.Formatter class implemented in Python. */
14506
14507static PyMethodDef _string_methods[] = {
14508 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14509 METH_O, PyDoc_STR("split the argument as a field name")},
14510 {"formatter_parser", (PyCFunction) formatter_parser,
14511 METH_O, PyDoc_STR("parse the argument as a format string")},
14512 {NULL, NULL}
14513};
14514
14515static struct PyModuleDef _string_module = {
14516 PyModuleDef_HEAD_INIT,
14517 "_string",
14518 PyDoc_STR("string helper module"),
14519 0,
14520 _string_methods,
14521 NULL,
14522 NULL,
14523 NULL,
14524 NULL
14525};
14526
14527PyMODINIT_FUNC
14528PyInit__string(void)
14529{
14530 return PyModule_Create(&_string_module);
14531}
14532
14533
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014534#ifdef __cplusplus
14535}
14536#endif