blob: 60b0a1fbbd9faeeca003c63fe4e4400a8dad91c2 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
/* Byte-order selection: big endian only when WORDS_BIGENDIAN is defined,
   little endian in every other case. */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
57
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000058/* --- Globals ------------------------------------------------------------
59
60 The globals are initialized by the _PyUnicode_Init() API and should
61 not be used before calling that API.
62
63*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000064
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000065
66#ifdef __cplusplus
67extern "C" {
68#endif
69
/* Highest code point defined by Unicode 6.0: U+10FFFF (1,114,111). */
#define MAX_UNICODE 0x10ffff

/* Sanity check used by the accessor macros below: a full structural
   consistency check in debug builds, a plain type check otherwise. */
#if defined(Py_DEBUG)
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020078
/* Raw field accessors.  They perform no checking and expand to the struct
   member itself, so they can also be used as assignment targets. */
#define _PyUnicode_UTF8(op)         (((PyCompactUnicodeObject*)(op))->utf8)
#define _PyUnicode_UTF8_LENGTH(op)  (((PyCompactUnicodeObject*)(op))->utf8_length)
#define _PyUnicode_WSTR(op)         (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)  (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)       (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)        (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)         (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_DATA_ANY(op)     (((PyUnicodeObject*)(op))->data.any)

/* Checked accessors: assert that op passes _PyUnicode_CHECK before
   reading the field. */
#define _PyUnicode_KIND(op)                                 \
    (assert(_PyUnicode_CHECK(op)),                          \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                           \
    (assert(_PyUnicode_CHECK(op)),                          \
     ((PyASCIIObject *)(op))->length)

/* UTF-8 pointer of a ready string.  For a compact ASCII string this is
   the memory immediately following the PyASCIIObject header; otherwise
   it is the utf8 member. */
#define PyUnicode_UTF8(op)                                  \
    (assert(_PyUnicode_CHECK(op)),                          \
     assert(PyUnicode_IS_READY(op)),                        \
     PyUnicode_IS_COMPACT_ASCII(op)                         \
         ? ((char*)((PyASCIIObject*)(op) + 1))              \
         : _PyUnicode_UTF8(op))

/* UTF-8 length of a ready string.  For a compact ASCII string this is
   the length member; otherwise it is the utf8_length member. */
#define PyUnicode_UTF8_LENGTH(op)                           \
    (assert(_PyUnicode_CHECK(op)),                          \
     assert(PyUnicode_IS_READY(op)),                        \
     PyUnicode_IS_COMPACT_ASCII(op)                         \
         ? ((PyASCIIObject*)(op))->length                   \
         : _PyUnicode_UTF8_LENGTH(op))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200113
/* File-local replacement for PyUnicode_READY: evaluates to 0 when op is
   already ready, otherwise to the result of _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                                 \
    (assert(_PyUnicode_CHECK(op)),                          \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200120
/* True if the UTF-8 pointer of op is the same pointer as its character
   data.  Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of op is the same pointer as its character
   data.  The expansion uses the macro argument throughout; it previously
   referred to a variable literally named "unicode" and therefore only
   worked when the caller's variable happened to have that name. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
128
/* Non-zero when op owns a UTF-8 buffer of its own: op is not a compact
   ASCII string, its utf8 pointer is set, and that pointer is not the
   character data itself. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                      \
    (assert(_PyUnicode_CHECK(op)),                          \
     (!PyUnicode_IS_COMPACT_ASCII(op)                       \
      && _PyUnicode_UTF8(op) != NULL                        \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* Non-zero when op owns a wstr buffer of its own: its wstr pointer is
   set and it is not the case that op is ready with wstr pointing at the
   character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                      \
    (assert(_PyUnicode_CHECK(op)),                          \
     (_PyUnicode_WSTR(op) != NULL                           \
      && !(PyUnicode_IS_READY(op)                           \
           && _PyUnicode_WSTR(op) == PyUnicode_DATA(op))))
144
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Every argument is evaluated exactly once.  The "to" argument is
   parenthesized before it is cast, so an expression such as "buf + offset"
   is computed in the caller's pointer type first and only then converted
   to "to_type *".

   The bulk of the input is copied four characters per iteration; the
   remaining 0-3 characters are copied one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t n = (_end) - (_iter);                \
        /* end of the part whose length is a multiple of 4 */ \
        const from_type *_unrolled_end =                \
            _iter + (n & ~ (Py_ssize_t) 3);             \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200168
Walter Dörwald16807132007-05-25 13:52:07 +0000169/* This dictionary holds all interned unicode strings. Note that references
170 to strings in this dictionary are *not* counted in the string's ob_refcnt.
171 When the interned string reaches a refcnt of 0 the string deallocation
172 function will delete the reference from this dictionary.
173
174 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000175 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000176*/
177static PyObject *interned;
178
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000179/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200180static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000181
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200182/* List of static strings. */
183static _Py_Identifier *static_strings;
184
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000185/* Single character Unicode strings in the Latin-1 range are being
186 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200187static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000188
/* Fast detection of the most frequent whitespace characters: one entry
   per 7-bit character value, 1 for whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
       0x09 CHARACTER TABULATION, 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
       0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
219
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200220/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200221static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200222static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200223static void copy_characters(
224 PyObject *to, Py_ssize_t to_start,
225 PyObject *from, Py_ssize_t from_start,
226 Py_ssize_t how_many);
Victor Stinner488fa492011-12-12 00:01:39 +0100227static int unicode_modifiable(PyObject *unicode);
228
Victor Stinnerfe226c02011-10-03 03:52:20 +0200229
Alexander Belopolsky40018472011-02-26 01:02:56 +0000230static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200231unicode_fromascii(const unsigned char *s, Py_ssize_t size);
232static PyObject *
233_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
234static PyObject *
235_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
236static PyObject *
237_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
238
239static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000240unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000241 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100242 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000243 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
244
Alexander Belopolsky40018472011-02-26 01:02:56 +0000245static void
246raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300247 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100248 PyObject *unicode,
249 Py_ssize_t startpos, Py_ssize_t endpos,
250 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000251
/* Same idea as the whitespace table, for line breaks: one entry per 7-bit
   character value, 1 for a line-break character and 0 otherwise.  The
   table is only ever read, so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
       0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
       0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
279
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300280/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
281 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000282Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000283PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000284{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000285#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000286 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000287#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000288 /* This is actually an illegal character, so it should
289 not be passed to unichr. */
290 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000291#endif
292}
293
#ifdef Py_DEBUG
/* Debug-build invariant checker for a unicode object.

   Asserts that the state flags, the data pointer and the utf8/wstr
   members of op agree with each other for the layout op claims to have
   (compact ASCII, compact, not-ready legacy, ready legacy).  When
   check_content is non-zero and the string has a character kind, every
   character is additionally scanned to assert that the maximum character
   fits the declared kind's range and that the buffer is null-terminated.

   Always returns 1, so the call can itself be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *hdr;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    hdr = (PyASCIIObject *)op;
    kind = hdr->state.kind;

    if (hdr->state.ascii == 1 && hdr->state.compact == 1) {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(hdr->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *ext = (PyCompactUnicodeObject *)op;
        void *payload;

        if (hdr->state.compact == 1) {
            /* compact non-ASCII string: characters follow the struct */
            payload = ext + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(hdr->state.ascii == 0);
            assert(hdr->state.ready == 1);
            assert(ext->utf8 != payload);
        }
        else {
            /* legacy string: characters are reached through data.any */
            payload = ((PyUnicodeObject *)op)->data.any;

            if (kind != PyUnicode_WCHAR_KIND) {
                /* ready legacy string */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(hdr->state.compact == 0);
                assert(hdr->state.ready == 1);
                assert(payload != NULL);
                if (hdr->state.ascii) {
                    assert(ext->utf8 == payload);
                    assert(ext->utf8_length == hdr->length);
                }
                else {
                    assert(ext->utf8 != payload);
                }
            }
            else {
                /* not-ready legacy string: only wstr is populated */
                assert(hdr->length == 0);
                assert(hdr->hash == -1);
                assert(hdr->state.compact == 0);
                assert(hdr->state.ascii == 0);
                assert(hdr->state.ready == 0);
                assert(hdr->state.interned == SSTATE_NOT_INTERNED);
                assert(hdr->wstr != NULL);
                assert(payload == NULL);
                assert(ext->utf8 == NULL);
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr must alias the data exactly when the kind has the
               same width as wchar_t */
#if SIZEOF_WCHAR_T == 2
            const int wchar_sized = (kind == PyUnicode_2BYTE_KIND);
#else
            const int wchar_sized = (kind == PyUnicode_4BYTE_KIND);
#endif
            if (wchar_sized) {
                assert(hdr->wstr == payload);
                assert(ext->wstr_length == hdr->length);
            }
            else {
                assert(hdr->wstr != payload);
            }
        }

        if (ext->utf8 == NULL) {
            assert(ext->utf8_length == 0);
        }
        if (hdr->wstr == NULL) {
            assert(ext->wstr_length == 0);
        }
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        void *chars = PyUnicode_DATA(hdr);
        Py_UCS4 highest = 0;
        Py_ssize_t pos;

        for (pos = 0; pos < hdr->length; pos++) {
            Py_UCS4 cp = PyUnicode_READ(kind, chars, pos);
            if (cp > highest)
                highest = cp;
        }

        if (kind == PyUnicode_1BYTE_KIND) {
            if (hdr->state.ascii == 0) {
                assert(highest >= 128);
                assert(highest <= 255);
            }
            else {
                assert(highest < 128);
            }
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(highest >= 0x100);
            assert(highest <= 0xFFFF);
        }
        else {
            assert(highest >= 0x10000);
            assert(highest <= MAX_UNICODE);
        }
        /* the character after the last one must be the terminator */
        assert(PyUnicode_READ(kind, chars, hdr->length) == 0);
    }
    return 1;
}
#endif
409
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100410static PyObject*
411unicode_result_wchar(PyObject *unicode)
412{
413#ifndef Py_DEBUG
414 Py_ssize_t len;
415
416 assert(Py_REFCNT(unicode) == 1);
417
418 len = _PyUnicode_WSTR_LENGTH(unicode);
419 if (len == 0) {
420 Py_INCREF(unicode_empty);
421 Py_DECREF(unicode);
422 return unicode_empty;
423 }
424
425 if (len == 1) {
426 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
427 if (ch < 256) {
428 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
429 Py_DECREF(unicode);
430 return latin1_char;
431 }
432 }
433
434 if (_PyUnicode_Ready(unicode) < 0) {
435 Py_XDECREF(unicode);
436 return NULL;
437 }
438#else
439 /* don't make the result ready in debug mode to ensure that the caller
440 makes the string ready before using it */
441 assert(_PyUnicode_CheckConsistency(unicode, 1));
442#endif
443 return unicode;
444}
445
446static PyObject*
447unicode_result_ready(PyObject *unicode)
448{
449 Py_ssize_t length;
450
451 length = PyUnicode_GET_LENGTH(unicode);
452 if (length == 0) {
453 if (unicode != unicode_empty) {
454 Py_INCREF(unicode_empty);
455 Py_DECREF(unicode);
456 }
457 return unicode_empty;
458 }
459
460 if (length == 1) {
461 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
462 if (ch < 256) {
463 PyObject *latin1_char = unicode_latin1[ch];
464 if (latin1_char != NULL) {
465 if (unicode != latin1_char) {
466 Py_INCREF(latin1_char);
467 Py_DECREF(unicode);
468 }
469 return latin1_char;
470 }
471 else {
472 assert(_PyUnicode_CheckConsistency(unicode, 1));
473 Py_INCREF(unicode);
474 unicode_latin1[ch] = unicode;
475 return unicode;
476 }
477 }
478 }
479
480 assert(_PyUnicode_CheckConsistency(unicode, 1));
481 return unicode;
482}
483
484static PyObject*
485unicode_result(PyObject *unicode)
486{
487 assert(_PyUnicode_CHECK(unicode));
488 if (PyUnicode_IS_READY(unicode))
489 return unicode_result_ready(unicode);
490 else
491 return unicode_result_wchar(unicode);
492}
493
Victor Stinnerc4b49542011-12-11 22:44:26 +0100494static PyObject*
495unicode_result_unchanged(PyObject *unicode)
496{
497 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500498 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100499 return NULL;
500 Py_INCREF(unicode);
501 return unicode;
502 }
503 else
504 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100505 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100506}
507
/* Windows version information; only present when HAVE_MBCS is defined. */
#if defined(HAVE_MBCS)
static OSVERSIONINFOEX winver;
#endif
511
/* --- Bloom Filters ----------------------------------------------------- */

/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low bits of
   each unicode character (its value modulo BLOOM_WIDTH) as the bit
   index. */

/* the linebreak mask is set up by Unicode_Init below */

/* BLOOM_WIDTH is the number of bits used in a mask: the largest of
   128, 64 and 32 that fits in a long. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD sets the bit for ch in mask (mask must be an lvalue).
   BLOOM is non-zero if the bit for ch is set in mask; since several
   characters share a bit, a non-zero result only means "possibly present".
   Both arguments are parenthesized so that compound expressions such as
   "a | b" can be passed as the mask. */
#define BLOOM_ADD(mask, ch) (((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#define BLOOM(mask, ch)     (((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Line-break test: table lookup for characters below 128, otherwise the
   bloom filter is consulted first and Py_UNICODE_ISLINEBREAK() is only
   called when the filter does not rule the character out. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000540
Alexander Belopolsky40018472011-02-26 01:02:56 +0000541Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200542make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000543{
544 /* calculate simple bloom-style bitmask for a given unicode string */
545
Antoine Pitrouf068f942010-01-13 14:19:12 +0000546 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000547 Py_ssize_t i;
548
549 mask = 0;
550 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200551 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000552
553 return mask;
554}
555
/* Membership test of chr in str: the bloom mask is checked first, and
   PyUnicode_FindChar() over the whole of str is only called when the mask
   does not rule the character out. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr)                                                   \
     && (PyUnicode_FindChar(str, chr,                                   \
                            0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000559
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200560/* Compilation of templated routines */
561
562#include "stringlib/asciilib.h"
563#include "stringlib/fastsearch.h"
564#include "stringlib/partition.h"
565#include "stringlib/split.h"
566#include "stringlib/count.h"
567#include "stringlib/find.h"
568#include "stringlib/find_max_char.h"
569#include "stringlib/localeutil.h"
570#include "stringlib/undef.h"
571
572#include "stringlib/ucs1lib.h"
573#include "stringlib/fastsearch.h"
574#include "stringlib/partition.h"
575#include "stringlib/split.h"
576#include "stringlib/count.h"
577#include "stringlib/find.h"
578#include "stringlib/find_max_char.h"
579#include "stringlib/localeutil.h"
580#include "stringlib/undef.h"
581
582#include "stringlib/ucs2lib.h"
583#include "stringlib/fastsearch.h"
584#include "stringlib/partition.h"
585#include "stringlib/split.h"
586#include "stringlib/count.h"
587#include "stringlib/find.h"
588#include "stringlib/find_max_char.h"
589#include "stringlib/localeutil.h"
590#include "stringlib/undef.h"
591
592#include "stringlib/ucs4lib.h"
593#include "stringlib/fastsearch.h"
594#include "stringlib/partition.h"
595#include "stringlib/split.h"
596#include "stringlib/count.h"
597#include "stringlib/find.h"
598#include "stringlib/find_max_char.h"
599#include "stringlib/localeutil.h"
600#include "stringlib/undef.h"
601
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200602#include "stringlib/unicodedefs.h"
603#include "stringlib/fastsearch.h"
604#include "stringlib/count.h"
605#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100606#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200607
Guido van Rossumd57fd912000-03-10 22:53:23 +0000608/* --- Unicode Object ----------------------------------------------------- */
609
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200610static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200611fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200612
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200613Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
614 Py_ssize_t size, Py_UCS4 ch,
615 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200616{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200617 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
618
619 switch (kind) {
620 case PyUnicode_1BYTE_KIND:
621 {
622 Py_UCS1 ch1 = (Py_UCS1) ch;
623 if (ch1 == ch)
624 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
625 else
626 return -1;
627 }
628 case PyUnicode_2BYTE_KIND:
629 {
630 Py_UCS2 ch2 = (Py_UCS2) ch;
631 if (ch2 == ch)
632 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
633 else
634 return -1;
635 }
636 case PyUnicode_4BYTE_KIND:
637 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
638 default:
639 assert(0);
640 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200641 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200642}
643
Victor Stinnerfe226c02011-10-03 03:52:20 +0200644static PyObject*
645resize_compact(PyObject *unicode, Py_ssize_t length)
646{
647 Py_ssize_t char_size;
648 Py_ssize_t struct_size;
649 Py_ssize_t new_size;
650 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100651 PyObject *new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200652 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100653 assert(PyUnicode_IS_COMPACT(unicode));
654
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200655 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100656 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200657 struct_size = sizeof(PyASCIIObject);
658 else
659 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200660 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200661
Victor Stinnerfe226c02011-10-03 03:52:20 +0200662 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
663 PyErr_NoMemory();
664 return NULL;
665 }
666 new_size = (struct_size + (length + 1) * char_size);
667
Victor Stinner84def372011-12-11 20:04:56 +0100668 _Py_DEC_REFTOTAL;
669 _Py_ForgetReference(unicode);
670
671 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
672 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100673 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200674 PyErr_NoMemory();
675 return NULL;
676 }
Victor Stinner84def372011-12-11 20:04:56 +0100677 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200678 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100679
Victor Stinnerfe226c02011-10-03 03:52:20 +0200680 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200681 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200682 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100683 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200684 _PyUnicode_WSTR_LENGTH(unicode) = length;
685 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200686 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
687 length, 0);
688 return unicode;
689}
690
Alexander Belopolsky40018472011-02-26 01:02:56 +0000691static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200692resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000693{
Victor Stinner95663112011-10-04 01:03:50 +0200694 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100695 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200696 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200697 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000698
Victor Stinnerfe226c02011-10-03 03:52:20 +0200699 if (PyUnicode_IS_READY(unicode)) {
700 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200701 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200702 void *data;
703
704 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200705 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200706 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
707 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200708
709 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
710 PyErr_NoMemory();
711 return -1;
712 }
713 new_size = (length + 1) * char_size;
714
Victor Stinner7a9105a2011-12-12 00:13:42 +0100715 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
716 {
717 PyObject_DEL(_PyUnicode_UTF8(unicode));
718 _PyUnicode_UTF8(unicode) = NULL;
719 _PyUnicode_UTF8_LENGTH(unicode) = 0;
720 }
721
Victor Stinnerfe226c02011-10-03 03:52:20 +0200722 data = (PyObject *)PyObject_REALLOC(data, new_size);
723 if (data == NULL) {
724 PyErr_NoMemory();
725 return -1;
726 }
727 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200728 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200729 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200730 _PyUnicode_WSTR_LENGTH(unicode) = length;
731 }
732 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200733 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200734 _PyUnicode_UTF8_LENGTH(unicode) = length;
735 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200736 _PyUnicode_LENGTH(unicode) = length;
737 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200738 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200739 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200740 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200741 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200742 }
Victor Stinner95663112011-10-04 01:03:50 +0200743 assert(_PyUnicode_WSTR(unicode) != NULL);
744
745 /* check for integer overflow */
746 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
747 PyErr_NoMemory();
748 return -1;
749 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100750 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200751 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100752 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200753 if (!wstr) {
754 PyErr_NoMemory();
755 return -1;
756 }
757 _PyUnicode_WSTR(unicode) = wstr;
758 _PyUnicode_WSTR(unicode)[length] = 0;
759 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200760 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000761 return 0;
762}
763
Victor Stinnerfe226c02011-10-03 03:52:20 +0200764static PyObject*
765resize_copy(PyObject *unicode, Py_ssize_t length)
766{
767 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100768 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200769 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100770
Benjamin Petersonbac79492012-01-14 13:34:47 -0500771 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100772 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200773
774 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
775 if (copy == NULL)
776 return NULL;
777
778 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200779 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200780 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200781 }
782 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200783 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100784
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200785 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200786 if (w == NULL)
787 return NULL;
788 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
789 copy_length = Py_MIN(copy_length, length);
790 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
791 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200792 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200793 }
794}
795
Guido van Rossumd57fd912000-03-10 22:53:23 +0000796/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000797 Ux0000 terminated; some code (e.g. new_identifier)
798 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000799
800 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000801 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000802
803*/
804
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200805#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200806static int unicode_old_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200807#endif
808
Alexander Belopolsky40018472011-02-26 01:02:56 +0000809static PyUnicodeObject *
810_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000811{
812 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200813 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000814
Thomas Wouters477c8d52006-05-27 19:21:47 +0000815 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000816 if (length == 0 && unicode_empty != NULL) {
817 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200818 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000819 }
820
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000821 /* Ensure we won't overflow the size. */
822 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
823 return (PyUnicodeObject *)PyErr_NoMemory();
824 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200825 if (length < 0) {
826 PyErr_SetString(PyExc_SystemError,
827 "Negative size passed to _PyUnicode_New");
828 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000829 }
830
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200831#ifdef Py_DEBUG
832 ++unicode_old_new_calls;
833#endif
834
835 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
836 if (unicode == NULL)
837 return NULL;
838 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
839 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
840 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100841 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000842 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100843 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000844 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200845
Jeremy Hyltond8082792003-09-16 19:41:39 +0000846 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000847 * the caller fails before initializing str -- unicode_resize()
848 * reads str[0], and the Keep-Alive optimization can keep memory
849 * allocated for str alive across a call to unicode_dealloc(unicode).
850 * We don't want unicode_resize to read uninitialized memory in
851 * that case.
852 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200853 _PyUnicode_WSTR(unicode)[0] = 0;
854 _PyUnicode_WSTR(unicode)[length] = 0;
855 _PyUnicode_WSTR_LENGTH(unicode) = length;
856 _PyUnicode_HASH(unicode) = -1;
857 _PyUnicode_STATE(unicode).interned = 0;
858 _PyUnicode_STATE(unicode).kind = 0;
859 _PyUnicode_STATE(unicode).compact = 0;
860 _PyUnicode_STATE(unicode).ready = 0;
861 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200862 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200863 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200864 _PyUnicode_UTF8(unicode) = NULL;
865 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100866 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000867 return unicode;
868}
869
Victor Stinnerf42dc442011-10-02 23:33:16 +0200870static const char*
871unicode_kind_name(PyObject *unicode)
872{
Victor Stinner42dfd712011-10-03 14:41:45 +0200873 /* don't check consistency: unicode_kind_name() is called from
874 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200875 if (!PyUnicode_IS_COMPACT(unicode))
876 {
877 if (!PyUnicode_IS_READY(unicode))
878 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600879 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200880 {
881 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200882 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200883 return "legacy ascii";
884 else
885 return "legacy latin1";
886 case PyUnicode_2BYTE_KIND:
887 return "legacy UCS2";
888 case PyUnicode_4BYTE_KIND:
889 return "legacy UCS4";
890 default:
891 return "<legacy invalid kind>";
892 }
893 }
894 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600895 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200896 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200897 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200898 return "ascii";
899 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200900 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200901 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200902 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200903 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200904 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200905 default:
906 return "<invalid compact kind>";
907 }
908}
909
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200910#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200911static int unicode_new_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200912
913/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
917
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
921void *_PyUnicode_data(void *unicode){
922 printf("obj %p\n", unicode);
923 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
924 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
925 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
926 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
927 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
928 return PyUnicode_DATA(unicode);
929}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200930
931void
932_PyUnicode_Dump(PyObject *op)
933{
934 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200935 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
936 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
937 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200938
Victor Stinnera849a4b2011-10-03 12:12:11 +0200939 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200940 {
941 if (ascii->state.ascii)
942 data = (ascii + 1);
943 else
944 data = (compact + 1);
945 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200946 else
947 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +0200948 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
949
Victor Stinnera849a4b2011-10-03 12:12:11 +0200950 if (ascii->wstr == data)
951 printf("shared ");
952 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +0200953
Victor Stinnera3b334d2011-10-03 13:53:37 +0200954 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200955 printf(" (%zu), ", compact->wstr_length);
956 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
957 printf("shared ");
958 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200959 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200960 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200961}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200962#endif
963
964PyObject *
965PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
966{
967 PyObject *obj;
968 PyCompactUnicodeObject *unicode;
969 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +0200970 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200971 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200972 Py_ssize_t char_size;
973 Py_ssize_t struct_size;
974
975 /* Optimization for empty strings */
976 if (size == 0 && unicode_empty != NULL) {
977 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200978 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200979 }
980
981#ifdef Py_DEBUG
982 ++unicode_new_new_calls;
983#endif
984
Victor Stinner9e9d6892011-10-04 01:02:02 +0200985 is_ascii = 0;
986 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200987 struct_size = sizeof(PyCompactUnicodeObject);
988 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +0200989 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200990 char_size = 1;
991 is_ascii = 1;
992 struct_size = sizeof(PyASCIIObject);
993 }
994 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +0200995 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200996 char_size = 1;
997 }
998 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +0200999 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001000 char_size = 2;
1001 if (sizeof(wchar_t) == 2)
1002 is_sharing = 1;
1003 }
1004 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001005 if (maxchar > MAX_UNICODE) {
1006 PyErr_SetString(PyExc_SystemError,
1007 "invalid maximum character passed to PyUnicode_New");
1008 return NULL;
1009 }
Victor Stinner8f825062012-04-27 13:55:39 +02001010 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001011 char_size = 4;
1012 if (sizeof(wchar_t) == 4)
1013 is_sharing = 1;
1014 }
1015
1016 /* Ensure we won't overflow the size. */
1017 if (size < 0) {
1018 PyErr_SetString(PyExc_SystemError,
1019 "Negative size passed to PyUnicode_New");
1020 return NULL;
1021 }
1022 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1023 return PyErr_NoMemory();
1024
1025 /* Duplicated allocation code from _PyObject_New() instead of a call to
1026 * PyObject_New() so we are able to allocate space for the object and
1027 * it's data buffer.
1028 */
1029 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1030 if (obj == NULL)
1031 return PyErr_NoMemory();
1032 obj = PyObject_INIT(obj, &PyUnicode_Type);
1033 if (obj == NULL)
1034 return NULL;
1035
1036 unicode = (PyCompactUnicodeObject *)obj;
1037 if (is_ascii)
1038 data = ((PyASCIIObject*)obj) + 1;
1039 else
1040 data = unicode + 1;
1041 _PyUnicode_LENGTH(unicode) = size;
1042 _PyUnicode_HASH(unicode) = -1;
1043 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001044 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001045 _PyUnicode_STATE(unicode).compact = 1;
1046 _PyUnicode_STATE(unicode).ready = 1;
1047 _PyUnicode_STATE(unicode).ascii = is_ascii;
1048 if (is_ascii) {
1049 ((char*)data)[size] = 0;
1050 _PyUnicode_WSTR(unicode) = NULL;
1051 }
Victor Stinner8f825062012-04-27 13:55:39 +02001052 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001053 ((char*)data)[size] = 0;
1054 _PyUnicode_WSTR(unicode) = NULL;
1055 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001056 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001057 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001058 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001059 else {
1060 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001061 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001062 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001063 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001064 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001065 ((Py_UCS4*)data)[size] = 0;
1066 if (is_sharing) {
1067 _PyUnicode_WSTR_LENGTH(unicode) = size;
1068 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1069 }
1070 else {
1071 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1072 _PyUnicode_WSTR(unicode) = NULL;
1073 }
1074 }
Victor Stinner8f825062012-04-27 13:55:39 +02001075#ifdef Py_DEBUG
1076 /* Fill the data with invalid characters to detect bugs earlier.
1077 _PyUnicode_CheckConsistency(str, 1) detects invalid characters,
1078 at least for ASCII and UCS-4 strings. U+00FF is invalid in ASCII
1079 and U+FFFFFFFF is an invalid character in Unicode 6.0. */
1080 memset(data, 0xff, size * kind);
1081#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001082 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001083 return obj;
1084}
1085
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bit wchar_t representation to UCS4; this
   decodes surrogate pairs.  The other conversions are implemented as macros
   for efficiency.

   A high surrogate that is not followed by a low surrogate inside
   [begin, end) is copied unchanged.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    src = begin;
    while (src < end) {
        int is_pair;

        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        /* src[1] is only read when it lies inside the input range */
        is_pair = (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
                   && (src+1) < end
                   && Py_UNICODE_IS_LOW_SURROGATE(src[1]));
        if (is_pair) {
            *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            *dst++ = *src;
            src++;
        }
    }
    /* the output must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1125
Victor Stinnercd9950f2011-10-02 00:34:53 +02001126static int
Victor Stinner488fa492011-12-12 00:01:39 +01001127unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001128{
Victor Stinner488fa492011-12-12 00:01:39 +01001129 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001130 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001131 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001132 return -1;
1133 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001134 return 0;
1135}
1136
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001137static int
1138_copy_characters(PyObject *to, Py_ssize_t to_start,
1139 PyObject *from, Py_ssize_t from_start,
1140 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001141{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001142 unsigned int from_kind, to_kind;
1143 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001144 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001145
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001146 assert(PyUnicode_Check(from));
1147 assert(PyUnicode_Check(to));
1148 assert(PyUnicode_IS_READY(from));
1149 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001150
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001151 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1152 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1153 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001154
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001155 if (how_many == 0)
1156 return 0;
1157
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001158 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001159 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001160 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001161 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001162
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001163#ifdef Py_DEBUG
1164 if (!check_maxchar
1165 && (from_kind > to_kind
1166 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001167 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001168 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1169 Py_UCS4 ch;
1170 Py_ssize_t i;
1171 for (i=0; i < how_many; i++) {
1172 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1173 assert(ch <= to_maxchar);
1174 }
1175 }
1176#endif
1177 fast = (from_kind == to_kind);
1178 if (check_maxchar
1179 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1180 {
1181 /* deny latin1 => ascii */
1182 fast = 0;
1183 }
1184
1185 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001186 Py_MEMCPY((char*)to_data + to_kind * to_start,
1187 (char*)from_data + from_kind * from_start,
1188 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001189 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001190 else if (from_kind == PyUnicode_1BYTE_KIND
1191 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001192 {
1193 _PyUnicode_CONVERT_BYTES(
1194 Py_UCS1, Py_UCS2,
1195 PyUnicode_1BYTE_DATA(from) + from_start,
1196 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1197 PyUnicode_2BYTE_DATA(to) + to_start
1198 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001199 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001200 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001201 && to_kind == PyUnicode_4BYTE_KIND)
1202 {
1203 _PyUnicode_CONVERT_BYTES(
1204 Py_UCS1, Py_UCS4,
1205 PyUnicode_1BYTE_DATA(from) + from_start,
1206 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1207 PyUnicode_4BYTE_DATA(to) + to_start
1208 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001209 }
1210 else if (from_kind == PyUnicode_2BYTE_KIND
1211 && to_kind == PyUnicode_4BYTE_KIND)
1212 {
1213 _PyUnicode_CONVERT_BYTES(
1214 Py_UCS2, Py_UCS4,
1215 PyUnicode_2BYTE_DATA(from) + from_start,
1216 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1217 PyUnicode_4BYTE_DATA(to) + to_start
1218 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001219 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001220 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001221 /* check if max_char(from substring) <= max_char(to) */
1222 if (from_kind > to_kind
1223 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001224 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001225 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001226 /* slow path to check for character overflow */
1227 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001228 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001229 Py_ssize_t i;
1230
Victor Stinner56c161a2011-10-06 02:47:11 +02001231#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001232 for (i=0; i < how_many; i++) {
1233 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001234 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001235 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1236 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001237#else
1238 if (!check_maxchar) {
1239 for (i=0; i < how_many; i++) {
1240 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1241 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1242 }
1243 }
1244 else {
1245 for (i=0; i < how_many; i++) {
1246 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1247 if (ch > to_maxchar)
1248 return 1;
1249 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1250 }
1251 }
1252#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001253 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001254 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001255 assert(0 && "inconsistent state");
1256 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001257 }
1258 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001259 return 0;
1260}
1261
1262static void
1263copy_characters(PyObject *to, Py_ssize_t to_start,
1264 PyObject *from, Py_ssize_t from_start,
1265 Py_ssize_t how_many)
1266{
1267 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1268}
1269
1270Py_ssize_t
1271PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1272 PyObject *from, Py_ssize_t from_start,
1273 Py_ssize_t how_many)
1274{
1275 int err;
1276
1277 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1278 PyErr_BadInternalCall();
1279 return -1;
1280 }
1281
Benjamin Petersonbac79492012-01-14 13:34:47 -05001282 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001283 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001284 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001285 return -1;
1286
1287 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1288 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1289 PyErr_Format(PyExc_SystemError,
1290 "Cannot write %zi characters at %zi "
1291 "in a string of %zi characters",
1292 how_many, to_start, PyUnicode_GET_LENGTH(to));
1293 return -1;
1294 }
1295
1296 if (how_many == 0)
1297 return 0;
1298
Victor Stinner488fa492011-12-12 00:01:39 +01001299 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001300 return -1;
1301
1302 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1303 if (err) {
1304 PyErr_Format(PyExc_SystemError,
1305 "Cannot copy %s characters "
1306 "into a string of %s characters",
1307 unicode_kind_name(from),
1308 unicode_kind_name(to));
1309 return -1;
1310 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001311 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001312}
1313
Victor Stinner17222162011-09-28 22:15:37 +02001314/* Find the maximum code point and count the number of surrogate pairs so a
1315 correct string length can be computed before converting a string to UCS4.
1316 This function counts single surrogates as a character and not as a pair.
1317
1318 Return 0 on success, or -1 on error. */
1319static int
1320find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1321 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001322{
1323 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001324 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001325
Victor Stinnerc53be962011-10-02 21:33:54 +02001326 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001327 *num_surrogates = 0;
1328 *maxchar = 0;
1329
1330 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001331#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001332 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1333 && (iter+1) < end
1334 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001335 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001336 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001337 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001338 iter += 2;
1339 }
1340 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001341#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001342 {
1343 ch = *iter;
1344 iter++;
1345 }
1346 if (ch > *maxchar) {
1347 *maxchar = ch;
1348 if (*maxchar > MAX_UNICODE) {
1349 PyErr_Format(PyExc_ValueError,
1350 "character U+%x is not in range [U+0000; U+10ffff]",
1351 ch);
1352 return -1;
1353 }
1354 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001355 }
1356 return 0;
1357}
1358
1359#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001360static int unicode_ready_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001361#endif
1362
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001363int
1364_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001365{
1366 wchar_t *end;
1367 Py_UCS4 maxchar = 0;
1368 Py_ssize_t num_surrogates;
1369#if SIZEOF_WCHAR_T == 2
1370 Py_ssize_t length_wo_surrogates;
1371#endif
1372
Georg Brandl7597add2011-10-05 16:36:47 +02001373 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001374 strings were created using _PyObject_New() and where no canonical
1375 representation (the str field) has been set yet aka strings
1376 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001377 assert(_PyUnicode_CHECK(unicode));
1378 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001379 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001380 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001381 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001382 /* Actually, it should neither be interned nor be anything else: */
1383 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001384
1385#ifdef Py_DEBUG
1386 ++unicode_ready_calls;
1387#endif
1388
1389 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001390 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001391 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001392 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001393
1394 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001395 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1396 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001397 PyErr_NoMemory();
1398 return -1;
1399 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001400 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001401 _PyUnicode_WSTR(unicode), end,
1402 PyUnicode_1BYTE_DATA(unicode));
1403 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1404 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1405 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1406 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001407 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001408 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001409 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001410 }
1411 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001412 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001413 _PyUnicode_UTF8(unicode) = NULL;
1414 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001415 }
1416 PyObject_FREE(_PyUnicode_WSTR(unicode));
1417 _PyUnicode_WSTR(unicode) = NULL;
1418 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1419 }
1420 /* In this case we might have to convert down from 4-byte native
1421 wchar_t to 2-byte unicode. */
1422 else if (maxchar < 65536) {
1423 assert(num_surrogates == 0 &&
1424 "FindMaxCharAndNumSurrogatePairs() messed up");
1425
Victor Stinner506f5922011-09-28 22:34:18 +02001426#if SIZEOF_WCHAR_T == 2
1427 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001428 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001429 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1430 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1431 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001432 _PyUnicode_UTF8(unicode) = NULL;
1433 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001434#else
1435 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001436 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001437 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001438 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001439 PyErr_NoMemory();
1440 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001441 }
Victor Stinner506f5922011-09-28 22:34:18 +02001442 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1443 _PyUnicode_WSTR(unicode), end,
1444 PyUnicode_2BYTE_DATA(unicode));
1445 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1446 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1447 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001448 _PyUnicode_UTF8(unicode) = NULL;
1449 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001450 PyObject_FREE(_PyUnicode_WSTR(unicode));
1451 _PyUnicode_WSTR(unicode) = NULL;
1452 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1453#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001454 }
1455 /* maxchar exceeds 16 bits, we need 4 bytes for unicode characters */
1456 else {
1457#if SIZEOF_WCHAR_T == 2
1458 /* in case the native representation is 2-bytes, we need to allocate a
1459 new normalized 4-byte version. */
1460 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001461 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1462 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001463 PyErr_NoMemory();
1464 return -1;
1465 }
1466 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1467 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001468 _PyUnicode_UTF8(unicode) = NULL;
1469 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001470 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1471 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001472 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001473 PyObject_FREE(_PyUnicode_WSTR(unicode));
1474 _PyUnicode_WSTR(unicode) = NULL;
1475 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1476#else
1477 assert(num_surrogates == 0);
1478
Victor Stinnerc3c74152011-10-02 20:39:55 +02001479 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001480 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001481 _PyUnicode_UTF8(unicode) = NULL;
1482 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001483 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1484#endif
1485 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1486 }
1487 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001488 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001489 return 0;
1490}
1491
Alexander Belopolsky40018472011-02-26 01:02:56 +00001492static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001493unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001494{
Walter Dörwald16807132007-05-25 13:52:07 +00001495 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001496 case SSTATE_NOT_INTERNED:
1497 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001498
Benjamin Peterson29060642009-01-31 22:14:21 +00001499 case SSTATE_INTERNED_MORTAL:
1500 /* revive dead object temporarily for DelItem */
1501 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001502 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001503 Py_FatalError(
1504 "deletion of interned string failed");
1505 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001506
Benjamin Peterson29060642009-01-31 22:14:21 +00001507 case SSTATE_INTERNED_IMMORTAL:
1508 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001509
Benjamin Peterson29060642009-01-31 22:14:21 +00001510 default:
1511 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001512 }
1513
Victor Stinner03490912011-10-03 23:45:12 +02001514 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001515 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001516 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001517 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001518 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1519 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001520
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001521 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001522}
1523
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if `unicode` is the shared empty string or the
   entry cached in unicode_latin1[] for its single character, 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *header = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only one-character strings whose kind is not PyUnicode_WCHAR_KIND
       are looked up in the latin1 cache. */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode) ? 1 : 0;
}
#endif
1540
Alexander Belopolsky40018472011-02-26 01:02:56 +00001541static int
Victor Stinner488fa492011-12-12 00:01:39 +01001542unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001543{
Victor Stinner488fa492011-12-12 00:01:39 +01001544 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001545 if (Py_REFCNT(unicode) != 1)
1546 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001547 if (_PyUnicode_HASH(unicode) != -1)
1548 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001549 if (PyUnicode_CHECK_INTERNED(unicode))
1550 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001551 if (!PyUnicode_CheckExact(unicode))
1552 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001553#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001554 /* singleton refcount is greater than 1 */
1555 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001556#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001557 return 1;
1558}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001559
Victor Stinnerfe226c02011-10-03 03:52:20 +02001560static int
1561unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1562{
1563 PyObject *unicode;
1564 Py_ssize_t old_length;
1565
1566 assert(p_unicode != NULL);
1567 unicode = *p_unicode;
1568
1569 assert(unicode != NULL);
1570 assert(PyUnicode_Check(unicode));
1571 assert(0 <= length);
1572
Victor Stinner910337b2011-10-03 03:20:16 +02001573 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001574 old_length = PyUnicode_WSTR_LENGTH(unicode);
1575 else
1576 old_length = PyUnicode_GET_LENGTH(unicode);
1577 if (old_length == length)
1578 return 0;
1579
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001580 if (length == 0) {
1581 Py_DECREF(*p_unicode);
1582 *p_unicode = unicode_empty;
1583 Py_INCREF(*p_unicode);
1584 return 0;
1585 }
1586
Victor Stinner488fa492011-12-12 00:01:39 +01001587 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001588 PyObject *copy = resize_copy(unicode, length);
1589 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001590 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001591 Py_DECREF(*p_unicode);
1592 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001593 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001594 }
1595
Victor Stinnerfe226c02011-10-03 03:52:20 +02001596 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001597 PyObject *new_unicode = resize_compact(unicode, length);
1598 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001599 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001600 *p_unicode = new_unicode;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001601 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001602 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001603 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001604 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001605}
1606
Alexander Belopolsky40018472011-02-26 01:02:56 +00001607int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001608PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001609{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001610 PyObject *unicode;
1611 if (p_unicode == NULL) {
1612 PyErr_BadInternalCall();
1613 return -1;
1614 }
1615 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001616 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001617 {
1618 PyErr_BadInternalCall();
1619 return -1;
1620 }
1621 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001622}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001623
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001624static int
Victor Stinner0a045ef2011-11-09 00:02:42 +01001625unicode_widen(PyObject **p_unicode, unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001626{
1627 PyObject *result;
1628 assert(PyUnicode_IS_READY(*p_unicode));
1629 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1630 return 0;
1631 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1632 maxchar);
1633 if (result == NULL)
1634 return -1;
1635 PyUnicode_CopyCharacters(result, 0, *p_unicode, 0,
1636 PyUnicode_GET_LENGTH(*p_unicode));
1637 Py_DECREF(*p_unicode);
1638 *p_unicode = result;
1639 return 0;
1640}
1641
1642static int
1643unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1644 Py_UCS4 ch)
1645{
Victor Stinner15e9ed22012-02-22 13:36:20 +01001646 assert(ch <= MAX_UNICODE);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001647 if (unicode_widen(p_unicode, ch) < 0)
1648 return -1;
1649 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1650 PyUnicode_DATA(*p_unicode),
1651 (*pos)++, ch);
1652 return 0;
1653}
1654
Victor Stinnerc5166102012-02-22 13:55:02 +01001655/* Copy a ASCII or latin1 char* string into a Python Unicode string.
1656 Return the length of the input string.
1657
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001658 WARNING: The function doesn't copy the terminating null character and
1659 doesn't check the maximum character (may write a latin1 character in an
1660 ASCII string). */
Victor Stinnerc5166102012-02-22 13:55:02 +01001661static Py_ssize_t
1662unicode_write_cstr(PyObject *unicode, Py_ssize_t index, const char *str)
1663{
1664 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1665 void *data = PyUnicode_DATA(unicode);
1666
1667 switch (kind) {
1668 case PyUnicode_1BYTE_KIND: {
1669 Py_ssize_t len = strlen(str);
1670 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001671 memcpy((char *) data + index, str, len);
Victor Stinnerc5166102012-02-22 13:55:02 +01001672 return len;
1673 }
1674 case PyUnicode_2BYTE_KIND: {
1675 Py_UCS2 *start = (Py_UCS2 *)data + index;
1676 Py_UCS2 *ucs2 = start;
1677 assert(index <= PyUnicode_GET_LENGTH(unicode));
1678
1679 for (; *str; ++ucs2, ++str)
1680 *ucs2 = (Py_UCS2)*str;
1681
1682 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
1683 return ucs2 - start;
1684 }
1685 default: {
1686 Py_UCS4 *start = (Py_UCS4 *)data + index;
1687 Py_UCS4 *ucs4 = start;
1688 assert(kind == PyUnicode_4BYTE_KIND);
1689 assert(index <= PyUnicode_GET_LENGTH(unicode));
1690
1691 for (; *str; ++ucs4, ++str)
1692 *ucs4 = (Py_UCS4)*str;
1693
1694 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
1695 return ucs4 - start;
1696 }
1697 }
1698}
1699
1700
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001701static PyObject*
1702get_latin1_char(unsigned char ch)
1703{
Victor Stinnera464fc12011-10-02 20:39:30 +02001704 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001705 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001706 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001707 if (!unicode)
1708 return NULL;
1709 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001710 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001711 unicode_latin1[ch] = unicode;
1712 }
1713 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001714 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001715}
1716
Alexander Belopolsky40018472011-02-26 01:02:56 +00001717PyObject *
1718PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001719{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001720 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001721 Py_UCS4 maxchar = 0;
1722 Py_ssize_t num_surrogates;
1723
1724 if (u == NULL)
1725 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001726
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001727 /* If the Unicode data is known at construction time, we can apply
1728 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001729
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001730 /* Optimization for empty strings */
1731 if (size == 0 && unicode_empty != NULL) {
1732 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001733 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001734 }
Tim Petersced69f82003-09-16 20:30:58 +00001735
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001736 /* Single character Unicode objects in the Latin-1 range are
1737 shared when using this constructor */
1738 if (size == 1 && *u < 256)
1739 return get_latin1_char((unsigned char)*u);
1740
1741 /* If not empty and not single character, copy the Unicode data
1742 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001743 if (find_maxchar_surrogates(u, u + size,
1744 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001745 return NULL;
1746
Victor Stinner8faf8212011-12-08 22:14:11 +01001747 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001748 if (!unicode)
1749 return NULL;
1750
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001751 switch (PyUnicode_KIND(unicode)) {
1752 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001753 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001754 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1755 break;
1756 case PyUnicode_2BYTE_KIND:
1757#if Py_UNICODE_SIZE == 2
1758 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1759#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001760 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001761 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1762#endif
1763 break;
1764 case PyUnicode_4BYTE_KIND:
1765#if SIZEOF_WCHAR_T == 2
1766 /* This is the only case which has to process surrogates, thus
1767 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001768 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001769#else
1770 assert(num_surrogates == 0);
1771 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1772#endif
1773 break;
1774 default:
1775 assert(0 && "Impossible state");
1776 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001777
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001778 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001779}
1780
Alexander Belopolsky40018472011-02-26 01:02:56 +00001781PyObject *
1782PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001783{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001784 if (size < 0) {
1785 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001786 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001787 return NULL;
1788 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001789 if (u != NULL)
1790 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1791 else
1792 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001793}
1794
Alexander Belopolsky40018472011-02-26 01:02:56 +00001795PyObject *
1796PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001797{
1798 size_t size = strlen(u);
1799 if (size > PY_SSIZE_T_MAX) {
1800 PyErr_SetString(PyExc_OverflowError, "input too long");
1801 return NULL;
1802 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001803 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001804}
1805
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001806PyObject *
1807_PyUnicode_FromId(_Py_Identifier *id)
1808{
1809 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001810 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1811 strlen(id->string),
1812 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001813 if (!id->object)
1814 return NULL;
1815 PyUnicode_InternInPlace(&id->object);
1816 assert(!id->next);
1817 id->next = static_strings;
1818 static_strings = id;
1819 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001820 return id->object;
1821}
1822
1823void
1824_PyUnicode_ClearStaticStrings()
1825{
1826 _Py_Identifier *i;
1827 for (i = static_strings; i; i = i->next) {
1828 Py_DECREF(i->object);
1829 i->object = NULL;
1830 i->next = NULL;
1831 }
1832}
1833
Benjamin Peterson0df54292012-03-26 14:50:32 -04001834/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001835
Victor Stinnere57b1c02011-09-28 22:20:48 +02001836static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001837unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001838{
Victor Stinner785938e2011-12-11 20:09:03 +01001839 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001840 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001841#ifdef Py_DEBUG
Victor Stinnere6b2d442011-12-11 21:54:30 +01001842 assert(s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001843#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001844 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001845 }
Victor Stinner785938e2011-12-11 20:09:03 +01001846 unicode = PyUnicode_New(size, 127);
1847 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001848 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001849 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1850 assert(_PyUnicode_CheckConsistency(unicode, 1));
1851 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001852}
1853
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001854static Py_UCS4
1855kind_maxchar_limit(unsigned int kind)
1856{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001857 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001858 case PyUnicode_1BYTE_KIND:
1859 return 0x80;
1860 case PyUnicode_2BYTE_KIND:
1861 return 0x100;
1862 case PyUnicode_4BYTE_KIND:
1863 return 0x10000;
1864 default:
1865 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001866 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001867 }
1868}
1869
Victor Stinner702c7342011-10-05 13:50:52 +02001870static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001871_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001872{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001873 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001874 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001875
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001876 if (size == 0) {
1877 Py_INCREF(unicode_empty);
1878 return unicode_empty;
1879 }
1880 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001881 if (size == 1)
1882 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001883
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001884 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001885 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001886 if (!res)
1887 return NULL;
1888 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001889 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001890 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001891}
1892
Victor Stinnere57b1c02011-09-28 22:20:48 +02001893static PyObject*
1894_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001895{
1896 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001897 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001898
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001899 if (size == 0) {
1900 Py_INCREF(unicode_empty);
1901 return unicode_empty;
1902 }
1903 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001904 if (size == 1 && u[0] < 256)
Victor Stinner4e101002011-10-11 23:27:52 +02001905 return get_latin1_char((unsigned char)u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001906
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001907 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001908 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001909 if (!res)
1910 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001911 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001912 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001913 else {
1914 _PyUnicode_CONVERT_BYTES(
1915 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1916 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001917 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001918 return res;
1919}
1920
Victor Stinnere57b1c02011-09-28 22:20:48 +02001921static PyObject*
1922_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001923{
1924 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001925 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001926
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001927 if (size == 0) {
1928 Py_INCREF(unicode_empty);
1929 return unicode_empty;
1930 }
1931 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001932 if (size == 1 && u[0] < 256)
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001933 return get_latin1_char((unsigned char)u[0]);
1934
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001935 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001936 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001937 if (!res)
1938 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001939 if (max_char < 256)
1940 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1941 PyUnicode_1BYTE_DATA(res));
1942 else if (max_char < 0x10000)
1943 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1944 PyUnicode_2BYTE_DATA(res));
1945 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001946 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001947 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001948 return res;
1949}
1950
1951PyObject*
1952PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1953{
Victor Stinnercfed46e2011-11-22 01:29:14 +01001954 if (size < 0) {
1955 PyErr_SetString(PyExc_ValueError, "size must be positive");
1956 return NULL;
1957 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06001958 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001959 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001960 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001961 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001962 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001963 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001964 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001965 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02001966 PyErr_SetString(PyExc_SystemError, "invalid kind");
1967 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001968 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001969}
1970
Victor Stinnerece58de2012-04-23 23:36:38 +02001971Py_UCS4
1972_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
1973{
1974 enum PyUnicode_Kind kind;
1975 void *startptr, *endptr;
1976
1977 assert(PyUnicode_IS_READY(unicode));
1978 assert(0 <= start);
1979 assert(end <= PyUnicode_GET_LENGTH(unicode));
1980 assert(start <= end);
1981
1982 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
1983 return PyUnicode_MAX_CHAR_VALUE(unicode);
1984
1985 if (start == end)
1986 return 127;
1987
1988 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04001989 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04001990 endptr = (char *)startptr + end * kind;
1991 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04001992 switch(kind) {
1993 case PyUnicode_1BYTE_KIND:
1994 return ucs1lib_find_max_char(startptr, endptr);
1995 case PyUnicode_2BYTE_KIND:
1996 return ucs2lib_find_max_char(startptr, endptr);
1997 case PyUnicode_4BYTE_KIND:
1998 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02001999 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002000 assert(0);
2001 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002002 }
2003}
2004
Victor Stinner25a4b292011-10-06 12:31:55 +02002005/* Ensure that a string uses the most efficient storage, if it is not the
2006 case: create a new string with of the right kind. Write NULL into *p_unicode
2007 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002008static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002009unicode_adjust_maxchar(PyObject **p_unicode)
2010{
2011 PyObject *unicode, *copy;
2012 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002013 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002014 unsigned int kind;
2015
2016 assert(p_unicode != NULL);
2017 unicode = *p_unicode;
2018 assert(PyUnicode_IS_READY(unicode));
2019 if (PyUnicode_IS_ASCII(unicode))
2020 return;
2021
2022 len = PyUnicode_GET_LENGTH(unicode);
2023 kind = PyUnicode_KIND(unicode);
2024 if (kind == PyUnicode_1BYTE_KIND) {
2025 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002026 max_char = ucs1lib_find_max_char(u, u + len);
2027 if (max_char >= 128)
2028 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002029 }
2030 else if (kind == PyUnicode_2BYTE_KIND) {
2031 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002032 max_char = ucs2lib_find_max_char(u, u + len);
2033 if (max_char >= 256)
2034 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002035 }
2036 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002037 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002038 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002039 max_char = ucs4lib_find_max_char(u, u + len);
2040 if (max_char >= 0x10000)
2041 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002042 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002043 copy = PyUnicode_New(len, max_char);
2044 copy_characters(copy, 0, unicode, 0, len);
2045 Py_DECREF(unicode);
2046 *p_unicode = copy;
2047}
2048
Victor Stinner034f6cf2011-09-30 02:26:44 +02002049PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002050_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002051{
Victor Stinner87af4f22011-11-21 23:03:47 +01002052 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002053 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002054
Victor Stinner034f6cf2011-09-30 02:26:44 +02002055 if (!PyUnicode_Check(unicode)) {
2056 PyErr_BadInternalCall();
2057 return NULL;
2058 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002059 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002060 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002061
Victor Stinner87af4f22011-11-21 23:03:47 +01002062 length = PyUnicode_GET_LENGTH(unicode);
2063 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002064 if (!copy)
2065 return NULL;
2066 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2067
Victor Stinner87af4f22011-11-21 23:03:47 +01002068 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2069 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002070 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002071 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002072}
2073
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002074
Victor Stinnerbc603d12011-10-02 01:00:40 +02002075/* Widen Unicode objects to larger buffers. Don't write terminating null
2076 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002077
2078void*
2079_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2080{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002081 Py_ssize_t len;
2082 void *result;
2083 unsigned int skind;
2084
Benjamin Petersonbac79492012-01-14 13:34:47 -05002085 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002086 return NULL;
2087
2088 len = PyUnicode_GET_LENGTH(s);
2089 skind = PyUnicode_KIND(s);
2090 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002091 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002092 return NULL;
2093 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002094 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002095 case PyUnicode_2BYTE_KIND:
2096 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2097 if (!result)
2098 return PyErr_NoMemory();
2099 assert(skind == PyUnicode_1BYTE_KIND);
2100 _PyUnicode_CONVERT_BYTES(
2101 Py_UCS1, Py_UCS2,
2102 PyUnicode_1BYTE_DATA(s),
2103 PyUnicode_1BYTE_DATA(s) + len,
2104 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002105 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002106 case PyUnicode_4BYTE_KIND:
2107 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2108 if (!result)
2109 return PyErr_NoMemory();
2110 if (skind == PyUnicode_2BYTE_KIND) {
2111 _PyUnicode_CONVERT_BYTES(
2112 Py_UCS2, Py_UCS4,
2113 PyUnicode_2BYTE_DATA(s),
2114 PyUnicode_2BYTE_DATA(s) + len,
2115 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002116 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002117 else {
2118 assert(skind == PyUnicode_1BYTE_KIND);
2119 _PyUnicode_CONVERT_BYTES(
2120 Py_UCS1, Py_UCS4,
2121 PyUnicode_1BYTE_DATA(s),
2122 PyUnicode_1BYTE_DATA(s) + len,
2123 result);
2124 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002125 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002126 default:
2127 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002128 }
Victor Stinner01698042011-10-04 00:04:26 +02002129 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002130 return NULL;
2131}
2132
2133static Py_UCS4*
2134as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2135 int copy_null)
2136{
2137 int kind;
2138 void *data;
2139 Py_ssize_t len, targetlen;
2140 if (PyUnicode_READY(string) == -1)
2141 return NULL;
2142 kind = PyUnicode_KIND(string);
2143 data = PyUnicode_DATA(string);
2144 len = PyUnicode_GET_LENGTH(string);
2145 targetlen = len;
2146 if (copy_null)
2147 targetlen++;
2148 if (!target) {
2149 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2150 PyErr_NoMemory();
2151 return NULL;
2152 }
2153 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2154 if (!target) {
2155 PyErr_NoMemory();
2156 return NULL;
2157 }
2158 }
2159 else {
2160 if (targetsize < targetlen) {
2161 PyErr_Format(PyExc_SystemError,
2162 "string is longer than the buffer");
2163 if (copy_null && 0 < targetsize)
2164 target[0] = 0;
2165 return NULL;
2166 }
2167 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002168 if (kind == PyUnicode_1BYTE_KIND) {
2169 Py_UCS1 *start = (Py_UCS1 *) data;
2170 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002171 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002172 else if (kind == PyUnicode_2BYTE_KIND) {
2173 Py_UCS2 *start = (Py_UCS2 *) data;
2174 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2175 }
2176 else {
2177 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002178 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002179 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002180 if (copy_null)
2181 target[len] = 0;
2182 return target;
2183}
2184
2185Py_UCS4*
2186PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2187 int copy_null)
2188{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002189 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002190 PyErr_BadInternalCall();
2191 return NULL;
2192 }
2193 return as_ucs4(string, target, targetsize, copy_null);
2194}
2195
2196Py_UCS4*
2197PyUnicode_AsUCS4Copy(PyObject *string)
2198{
2199 return as_ucs4(string, NULL, 0, 1);
2200}
2201
2202#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002203
Alexander Belopolsky40018472011-02-26 01:02:56 +00002204PyObject *
2205PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002206{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002207 if (w == NULL) {
Victor Stinner382955f2011-12-11 21:44:00 +01002208 if (size == 0) {
2209 Py_INCREF(unicode_empty);
2210 return unicode_empty;
2211 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002212 PyErr_BadInternalCall();
2213 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002214 }
2215
Martin v. Löwis790465f2008-04-05 20:41:37 +00002216 if (size == -1) {
2217 size = wcslen(w);
2218 }
2219
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002220 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002221}
2222
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002223#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002224
Walter Dörwald346737f2007-05-31 10:44:43 +00002225static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002226makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2227 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002228{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002229 *fmt++ = '%';
2230 if (width) {
2231 if (zeropad)
2232 *fmt++ = '0';
2233 fmt += sprintf(fmt, "%d", width);
2234 }
2235 if (precision)
2236 fmt += sprintf(fmt, ".%d", precision);
2237 if (longflag)
2238 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002239 else if (longlongflag) {
2240 /* longlongflag should only ever be nonzero on machines with
2241 HAVE_LONG_LONG defined */
2242#ifdef HAVE_LONG_LONG
2243 char *f = PY_FORMAT_LONG_LONG;
2244 while (*f)
2245 *fmt++ = *f++;
2246#else
2247 /* we shouldn't ever get here */
2248 assert(0);
2249 *fmt++ = 'l';
2250#endif
2251 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002252 else if (size_tflag) {
2253 char *f = PY_FORMAT_SIZE_T;
2254 while (*f)
2255 *fmt++ = *f++;
2256 }
2257 *fmt++ = c;
2258 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002259}
2260
/* Helper for PyUnicode_FromFormatV(): parse the "width.precision" and
   length-modifier part of a format specification.

   `f` must point at the '%' that introduces the specification.  The parsed
   values are stored through the p_* pointers; any of them may be NULL if
   the caller is not interested.  A length modifier ("l", "ll" when
   HAVE_LONG_LONG is defined, or "z") is only recognised when it is
   immediately followed by 'd', 'u' or 'i'.

   Returns a pointer to the character the caller should treat as the
   conversion code. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* Skip the '%' and parse the width.precision part,
       e.g. "%2.5s" => width=2, precision=5 */
    f++;
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;

    /* Handle %ld, %lu, %lld, %llu and the size_t flag (%zd, %zu). */
    switch (*f) {
    case 'l':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            size_tflag = 1;
            f += 1;
        }
        break;
    default:
        break;
    }

    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
2325
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002326/* maximum number of characters required for output of %ld. 21 characters
2327 allows for 64-bit integers (in decimal) and an optional sign. */
2328#define MAX_LONG_CHARS 21
2329/* maximum number of characters required for output of %lld.
2330 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2331 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2332#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2333
Walter Dörwaldd2034312007-05-18 16:29:38 +00002334PyObject *
2335PyUnicode_FromFormatV(const char *format, va_list vargs)
2336{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002337 va_list count;
2338 Py_ssize_t callcount = 0;
2339 PyObject **callresults = NULL;
2340 PyObject **callresult = NULL;
2341 Py_ssize_t n = 0;
2342 int width = 0;
2343 int precision = 0;
2344 int zeropad;
2345 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002346 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002347 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002348 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002349 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2350 Py_UCS4 argmaxchar;
2351 Py_ssize_t numbersize = 0;
2352 char *numberresults = NULL;
2353 char *numberresult = NULL;
2354 Py_ssize_t i;
2355 int kind;
2356 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002357
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002358 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002359 /* step 1: count the number of %S/%R/%A/%s format specifications
2360 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2361 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002362 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002363 * also estimate a upper bound for all the number formats in the string,
2364 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002365 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002366 for (f = format; *f; f++) {
2367 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002368 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002369 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2370 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2371 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2372 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002373
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002374 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002375#ifdef HAVE_LONG_LONG
2376 if (longlongflag) {
2377 if (width < MAX_LONG_LONG_CHARS)
2378 width = MAX_LONG_LONG_CHARS;
2379 }
2380 else
2381#endif
2382 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2383 including sign. Decimal takes the most space. This
2384 isn't enough for octal. If a width is specified we
2385 need more (which we allocate later). */
2386 if (width < MAX_LONG_CHARS)
2387 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002388
2389 /* account for the size + '\0' to separate numbers
2390 inside of the numberresults buffer */
2391 numbersize += (width + 1);
2392 }
2393 }
2394 else if ((unsigned char)*f > 127) {
2395 PyErr_Format(PyExc_ValueError,
2396 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2397 "string, got a non-ASCII byte: 0x%02x",
2398 (unsigned char)*f);
2399 return NULL;
2400 }
2401 }
2402 /* step 2: allocate memory for the results of
2403 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2404 if (callcount) {
2405 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2406 if (!callresults) {
2407 PyErr_NoMemory();
2408 return NULL;
2409 }
2410 callresult = callresults;
2411 }
2412 /* step 2.5: allocate memory for the results of formating numbers */
2413 if (numbersize) {
2414 numberresults = PyObject_Malloc(numbersize);
2415 if (!numberresults) {
2416 PyErr_NoMemory();
2417 goto fail;
2418 }
2419 numberresult = numberresults;
2420 }
2421
2422 /* step 3: format numbers and figure out how large a buffer we need */
2423 for (f = format; *f; f++) {
2424 if (*f == '%') {
2425 const char* p;
2426 int longflag;
2427 int longlongflag;
2428 int size_tflag;
2429 int numprinted;
2430
2431 p = f;
2432 zeropad = (f[1] == '0');
2433 f = parse_format_flags(f, &width, &precision,
2434 &longflag, &longlongflag, &size_tflag);
2435 switch (*f) {
2436 case 'c':
2437 {
2438 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002439 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002440 n++;
2441 break;
2442 }
2443 case '%':
2444 n++;
2445 break;
2446 case 'i':
2447 case 'd':
2448 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2449 width, precision, *f);
2450 if (longflag)
2451 numprinted = sprintf(numberresult, fmt,
2452 va_arg(count, long));
2453#ifdef HAVE_LONG_LONG
2454 else if (longlongflag)
2455 numprinted = sprintf(numberresult, fmt,
2456 va_arg(count, PY_LONG_LONG));
2457#endif
2458 else if (size_tflag)
2459 numprinted = sprintf(numberresult, fmt,
2460 va_arg(count, Py_ssize_t));
2461 else
2462 numprinted = sprintf(numberresult, fmt,
2463 va_arg(count, int));
2464 n += numprinted;
2465 /* advance by +1 to skip over the '\0' */
2466 numberresult += (numprinted + 1);
2467 assert(*(numberresult - 1) == '\0');
2468 assert(*(numberresult - 2) != '\0');
2469 assert(numprinted >= 0);
2470 assert(numberresult <= numberresults + numbersize);
2471 break;
2472 case 'u':
2473 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2474 width, precision, 'u');
2475 if (longflag)
2476 numprinted = sprintf(numberresult, fmt,
2477 va_arg(count, unsigned long));
2478#ifdef HAVE_LONG_LONG
2479 else if (longlongflag)
2480 numprinted = sprintf(numberresult, fmt,
2481 va_arg(count, unsigned PY_LONG_LONG));
2482#endif
2483 else if (size_tflag)
2484 numprinted = sprintf(numberresult, fmt,
2485 va_arg(count, size_t));
2486 else
2487 numprinted = sprintf(numberresult, fmt,
2488 va_arg(count, unsigned int));
2489 n += numprinted;
2490 numberresult += (numprinted + 1);
2491 assert(*(numberresult - 1) == '\0');
2492 assert(*(numberresult - 2) != '\0');
2493 assert(numprinted >= 0);
2494 assert(numberresult <= numberresults + numbersize);
2495 break;
2496 case 'x':
2497 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2498 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2499 n += numprinted;
2500 numberresult += (numprinted + 1);
2501 assert(*(numberresult - 1) == '\0');
2502 assert(*(numberresult - 2) != '\0');
2503 assert(numprinted >= 0);
2504 assert(numberresult <= numberresults + numbersize);
2505 break;
2506 case 'p':
2507 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2508 /* %p is ill-defined: ensure leading 0x. */
2509 if (numberresult[1] == 'X')
2510 numberresult[1] = 'x';
2511 else if (numberresult[1] != 'x') {
2512 memmove(numberresult + 2, numberresult,
2513 strlen(numberresult) + 1);
2514 numberresult[0] = '0';
2515 numberresult[1] = 'x';
2516 numprinted += 2;
2517 }
2518 n += numprinted;
2519 numberresult += (numprinted + 1);
2520 assert(*(numberresult - 1) == '\0');
2521 assert(*(numberresult - 2) != '\0');
2522 assert(numprinted >= 0);
2523 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002524 break;
2525 case 's':
2526 {
2527 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002528 const char *s = va_arg(count, const char*);
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002529 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002530 if (!str)
2531 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002532 /* since PyUnicode_DecodeUTF8 returns already flexible
2533 unicode objects, there is no need to call ready on them */
2534 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002535 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002536 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002537 /* Remember the str and switch to the next slot */
2538 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002539 break;
2540 }
2541 case 'U':
2542 {
2543 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002544 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002545 if (PyUnicode_READY(obj) == -1)
2546 goto fail;
2547 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002548 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002549 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002550 break;
2551 }
2552 case 'V':
2553 {
2554 PyObject *obj = va_arg(count, PyObject *);
2555 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002556 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002557 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002558 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002559 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002560 if (PyUnicode_READY(obj) == -1)
2561 goto fail;
2562 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002563 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002564 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002565 *callresult++ = NULL;
2566 }
2567 else {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002568 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002569 if (!str_obj)
2570 goto fail;
Benjamin Petersonbac79492012-01-14 13:34:47 -05002571 if (PyUnicode_READY(str_obj) == -1) {
Victor Stinnere1335c72011-10-04 20:53:03 +02002572 Py_DECREF(str_obj);
2573 goto fail;
2574 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002575 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002576 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002577 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002578 *callresult++ = str_obj;
2579 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002580 break;
2581 }
2582 case 'S':
2583 {
2584 PyObject *obj = va_arg(count, PyObject *);
2585 PyObject *str;
2586 assert(obj);
2587 str = PyObject_Str(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002588 if (!str)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002589 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002590 if (PyUnicode_READY(str) == -1) {
2591 Py_DECREF(str);
2592 goto fail;
2593 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002594 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002595 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002596 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002597 /* Remember the str and switch to the next slot */
2598 *callresult++ = str;
2599 break;
2600 }
2601 case 'R':
2602 {
2603 PyObject *obj = va_arg(count, PyObject *);
2604 PyObject *repr;
2605 assert(obj);
2606 repr = PyObject_Repr(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002607 if (!repr)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002608 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002609 if (PyUnicode_READY(repr) == -1) {
2610 Py_DECREF(repr);
2611 goto fail;
2612 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002613 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002614 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002615 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002616 /* Remember the repr and switch to the next slot */
2617 *callresult++ = repr;
2618 break;
2619 }
2620 case 'A':
2621 {
2622 PyObject *obj = va_arg(count, PyObject *);
2623 PyObject *ascii;
2624 assert(obj);
2625 ascii = PyObject_ASCII(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002626 if (!ascii)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002627 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002628 if (PyUnicode_READY(ascii) == -1) {
2629 Py_DECREF(ascii);
2630 goto fail;
2631 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002632 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002633 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002634 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002635 /* Remember the repr and switch to the next slot */
2636 *callresult++ = ascii;
2637 break;
2638 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002639 default:
2640 /* if we stumble upon an unknown
2641 formatting code, copy the rest of
2642 the format string to the output
2643 string. (we cannot just skip the
2644 code, since there's no way to know
2645 what's in the argument list) */
2646 n += strlen(p);
2647 goto expand;
2648 }
2649 } else
2650 n++;
2651 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002652 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002653 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002654 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002655 we don't have to resize the string.
2656 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002657 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002658 if (!string)
2659 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002660 kind = PyUnicode_KIND(string);
2661 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002662 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002663 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002664
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002665 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002666 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002667 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002668
2669 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002670 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2671 /* checking for == because the last argument could be a empty
2672 string, which causes i to point to end, the assert at the end of
2673 the loop */
2674 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002675
Benjamin Peterson14339b62009-01-31 16:36:08 +00002676 switch (*f) {
2677 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002678 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002679 const int ordinal = va_arg(vargs, int);
2680 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002681 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002682 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002683 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002684 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002685 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002686 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002687 case 'p':
Victor Stinnerc5166102012-02-22 13:55:02 +01002688 {
2689 Py_ssize_t written;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002690 /* unused, since we already have the result */
2691 if (*f == 'p')
2692 (void) va_arg(vargs, void *);
2693 else
2694 (void) va_arg(vargs, int);
2695 /* extract the result from numberresults and append. */
Victor Stinnerc5166102012-02-22 13:55:02 +01002696 written = unicode_write_cstr(string, i, numberresult);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002697 /* skip over the separating '\0' */
Victor Stinnerc5166102012-02-22 13:55:02 +01002698 i += written;
2699 numberresult += written;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002700 assert(*numberresult == '\0');
2701 numberresult++;
2702 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002703 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002704 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002705 case 's':
2706 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002707 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002708 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002709 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002710 size = PyUnicode_GET_LENGTH(*callresult);
2711 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002712 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002713 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002714 /* We're done with the unicode()/repr() => forget it */
2715 Py_DECREF(*callresult);
2716 /* switch to next unicode()/repr() result */
2717 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002718 break;
2719 }
2720 case 'U':
2721 {
2722 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002723 Py_ssize_t size;
2724 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2725 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002726 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002727 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002728 break;
2729 }
2730 case 'V':
2731 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002732 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002733 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002734 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002735 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002736 size = PyUnicode_GET_LENGTH(obj);
2737 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002738 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002739 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002740 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002741 size = PyUnicode_GET_LENGTH(*callresult);
2742 assert(PyUnicode_KIND(*callresult) <=
2743 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002744 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002745 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002746 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002747 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002748 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002749 break;
2750 }
2751 case 'S':
2752 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002753 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002754 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002755 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002756 /* unused, since we already have the result */
2757 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002758 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002759 copy_characters(string, i, *callresult, 0, size);
2760 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002761 /* We're done with the unicode()/repr() => forget it */
2762 Py_DECREF(*callresult);
2763 /* switch to next unicode()/repr() result */
2764 ++callresult;
2765 break;
2766 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002767 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002768 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002769 break;
2770 default:
Victor Stinnerc5166102012-02-22 13:55:02 +01002771 i += unicode_write_cstr(string, i, p);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002772 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002773 goto end;
2774 }
Victor Stinner1205f272010-09-11 00:54:47 +00002775 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002776 else {
2777 assert(i < PyUnicode_GET_LENGTH(string));
2778 PyUnicode_WRITE(kind, data, i++, *f);
2779 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002780 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002781 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002782
Benjamin Peterson29060642009-01-31 22:14:21 +00002783 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002784 if (callresults)
2785 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002786 if (numberresults)
2787 PyObject_Free(numberresults);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002788 return unicode_result(string);
Benjamin Peterson29060642009-01-31 22:14:21 +00002789 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002790 if (callresults) {
2791 PyObject **callresult2 = callresults;
2792 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002793 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002794 ++callresult2;
2795 }
2796 PyObject_Free(callresults);
2797 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002798 if (numberresults)
2799 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002800 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002801}
2802
Walter Dörwaldd2034312007-05-18 16:29:38 +00002803PyObject *
2804PyUnicode_FromFormat(const char *format, ...)
2805{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002806 PyObject* ret;
2807 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002808
2809#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002810 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002811#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002812 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002813#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002814 ret = PyUnicode_FromFormatV(format, vargs);
2815 va_end(vargs);
2816 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002817}
2818
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002819#ifdef HAVE_WCHAR_H
2820
Victor Stinner5593d8a2010-10-02 11:11:27 +00002821/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2822 convert a Unicode object to a wide character string.
2823
Victor Stinnerd88d9832011-09-06 02:00:05 +02002824 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002825 character) required to convert the unicode object. Ignore size argument.
2826
Victor Stinnerd88d9832011-09-06 02:00:05 +02002827 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002828 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002829 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002830static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002831unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002832 wchar_t *w,
2833 Py_ssize_t size)
2834{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002835 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002836 const wchar_t *wstr;
2837
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002838 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002839 if (wstr == NULL)
2840 return -1;
2841
Victor Stinner5593d8a2010-10-02 11:11:27 +00002842 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002843 if (size > res)
2844 size = res + 1;
2845 else
2846 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002847 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002848 return res;
2849 }
2850 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002851 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002852}
2853
2854Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002855PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002856 wchar_t *w,
2857 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002858{
2859 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002860 PyErr_BadInternalCall();
2861 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002862 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002863 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002864}
2865
Victor Stinner137c34c2010-09-29 10:25:54 +00002866wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002867PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002868 Py_ssize_t *size)
2869{
2870 wchar_t* buffer;
2871 Py_ssize_t buflen;
2872
2873 if (unicode == NULL) {
2874 PyErr_BadInternalCall();
2875 return NULL;
2876 }
2877
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002878 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002879 if (buflen == -1)
2880 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002881 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002882 PyErr_NoMemory();
2883 return NULL;
2884 }
2885
Victor Stinner137c34c2010-09-29 10:25:54 +00002886 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2887 if (buffer == NULL) {
2888 PyErr_NoMemory();
2889 return NULL;
2890 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002891 buflen = unicode_aswidechar(unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002892 if (buflen == -1)
2893 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002894 if (size != NULL)
2895 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002896 return buffer;
2897}
2898
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002899#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002900
Alexander Belopolsky40018472011-02-26 01:02:56 +00002901PyObject *
2902PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002903{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002904 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002905 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002906 PyErr_SetString(PyExc_ValueError,
2907 "chr() arg not in range(0x110000)");
2908 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002909 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002910
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002911 if (ordinal < 256)
2912 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002913
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002914 v = PyUnicode_New(1, ordinal);
2915 if (v == NULL)
2916 return NULL;
2917 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002918 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002919 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002920}
2921
Alexander Belopolsky40018472011-02-26 01:02:56 +00002922PyObject *
2923PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002924{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002925 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002926 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002927 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002928 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002929 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002930 Py_INCREF(obj);
2931 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002932 }
2933 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002934 /* For a Unicode subtype that's not a Unicode object,
2935 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002936 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002937 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002938 PyErr_Format(PyExc_TypeError,
2939 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002940 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002941 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002942}
2943
Alexander Belopolsky40018472011-02-26 01:02:56 +00002944PyObject *
2945PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002946 const char *encoding,
2947 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002948{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002949 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002950 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002951
Guido van Rossumd57fd912000-03-10 22:53:23 +00002952 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002953 PyErr_BadInternalCall();
2954 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002955 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002956
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002957 /* Decoding bytes objects is the most common case and should be fast */
2958 if (PyBytes_Check(obj)) {
2959 if (PyBytes_GET_SIZE(obj) == 0) {
2960 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002961 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002962 }
2963 else {
2964 v = PyUnicode_Decode(
2965 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2966 encoding, errors);
2967 }
2968 return v;
2969 }
2970
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002971 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002972 PyErr_SetString(PyExc_TypeError,
2973 "decoding str is not supported");
2974 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002975 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002976
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002977 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2978 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2979 PyErr_Format(PyExc_TypeError,
2980 "coercing to str: need bytes, bytearray "
2981 "or buffer-like object, %.80s found",
2982 Py_TYPE(obj)->tp_name);
2983 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002984 }
Tim Petersced69f82003-09-16 20:30:58 +00002985
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002986 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002987 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002988 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002989 }
Tim Petersced69f82003-09-16 20:30:58 +00002990 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002991 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002992
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002993 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002994 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002995}
2996
/* Store a normalized copy of an encoding name in lower: characters for
   which Py_ISUPPER() is true are converted with Py_TOLOWER() and '_' is
   replaced with '-', in order to catch e.g. UTF_8.  A NULL encoding is
   normalized to "utf-8".

   lower is a buffer of lower_len bytes.  Return 1 on success, 0 on error
   (the normalized name does not fit in lower_len-1 characters plus the
   terminating null byte). */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    static const char default_name[] = "utf-8";
    const char *e;
    char *l;
    char *l_end;

    /* With no room for even the terminating null byte nothing can be
       stored, and lower_len - 1 below would wrap around. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* sizeof counts the terminating null byte of the default name. */
        if (lower_len < sizeof(default_name))
            return 0;
        strcpy(lower, default_name);
        return 1;
    }

    e = encoding;
    l = lower;
    /* Last usable slot: reserved for the terminating null byte. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
3033
Alexander Belopolsky40018472011-02-26 01:02:56 +00003034PyObject *
3035PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003036 Py_ssize_t size,
3037 const char *encoding,
3038 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003039{
3040 PyObject *buffer = NULL, *unicode;
3041 Py_buffer info;
3042 char lower[11]; /* Enough for any encoding shortcut */
3043
Fred Drakee4315f52000-05-09 19:53:39 +00003044 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003045 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003046 if ((strcmp(lower, "utf-8") == 0) ||
3047 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003048 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00003049 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003050 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003051 (strcmp(lower, "iso-8859-1") == 0))
3052 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003053#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003054 else if (strcmp(lower, "mbcs") == 0)
3055 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003056#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003057 else if (strcmp(lower, "ascii") == 0)
3058 return PyUnicode_DecodeASCII(s, size, errors);
3059 else if (strcmp(lower, "utf-16") == 0)
3060 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3061 else if (strcmp(lower, "utf-32") == 0)
3062 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3063 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003064
3065 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003066 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003067 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003068 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003069 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003070 if (buffer == NULL)
3071 goto onError;
3072 unicode = PyCodec_Decode(buffer, encoding, errors);
3073 if (unicode == NULL)
3074 goto onError;
3075 if (!PyUnicode_Check(unicode)) {
3076 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003077 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00003078 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003079 Py_DECREF(unicode);
3080 goto onError;
3081 }
3082 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003083 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003084
Benjamin Peterson29060642009-01-31 22:14:21 +00003085 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003086 Py_XDECREF(buffer);
3087 return NULL;
3088}
3089
Alexander Belopolsky40018472011-02-26 01:02:56 +00003090PyObject *
3091PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003092 const char *encoding,
3093 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003094{
3095 PyObject *v;
3096
3097 if (!PyUnicode_Check(unicode)) {
3098 PyErr_BadArgument();
3099 goto onError;
3100 }
3101
3102 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003103 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003104
3105 /* Decode via the codec registry */
3106 v = PyCodec_Decode(unicode, encoding, errors);
3107 if (v == NULL)
3108 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003109 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003110
Benjamin Peterson29060642009-01-31 22:14:21 +00003111 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003112 return NULL;
3113}
3114
Alexander Belopolsky40018472011-02-26 01:02:56 +00003115PyObject *
3116PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003117 const char *encoding,
3118 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003119{
3120 PyObject *v;
3121
3122 if (!PyUnicode_Check(unicode)) {
3123 PyErr_BadArgument();
3124 goto onError;
3125 }
3126
3127 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003128 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003129
3130 /* Decode via the codec registry */
3131 v = PyCodec_Decode(unicode, encoding, errors);
3132 if (v == NULL)
3133 goto onError;
3134 if (!PyUnicode_Check(v)) {
3135 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003136 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003137 Py_TYPE(v)->tp_name);
3138 Py_DECREF(v);
3139 goto onError;
3140 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003141 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003142
Benjamin Peterson29060642009-01-31 22:14:21 +00003143 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003144 return NULL;
3145}
3146
Alexander Belopolsky40018472011-02-26 01:02:56 +00003147PyObject *
3148PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003149 Py_ssize_t size,
3150 const char *encoding,
3151 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003152{
3153 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003154
Guido van Rossumd57fd912000-03-10 22:53:23 +00003155 unicode = PyUnicode_FromUnicode(s, size);
3156 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003157 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003158 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3159 Py_DECREF(unicode);
3160 return v;
3161}
3162
Alexander Belopolsky40018472011-02-26 01:02:56 +00003163PyObject *
3164PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003165 const char *encoding,
3166 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003167{
3168 PyObject *v;
3169
3170 if (!PyUnicode_Check(unicode)) {
3171 PyErr_BadArgument();
3172 goto onError;
3173 }
3174
3175 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003176 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003177
3178 /* Encode via the codec registry */
3179 v = PyCodec_Encode(unicode, encoding, errors);
3180 if (v == NULL)
3181 goto onError;
3182 return v;
3183
Benjamin Peterson29060642009-01-31 22:14:21 +00003184 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003185 return NULL;
3186}
3187
/* Locate the first wide character of wstr that wcstombs() cannot convert.

   The string is converted one character at a time (one surrogate pair at
   a time when wchar_t is 2 bytes wide).  Return the index, in wchar_t
   units, of the first unit for which wcstombs() fails, or 0 when no
   failing character is found. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3];            /* surrogate pair + terminating null */
#else
    wchar_t unit[2];            /* one character + terminating null */
#endif
    char encoded[MB_LEN_MAX];
    const wchar_t *cursor;
    const wchar_t *unit_start;

#if SIZEOF_WCHAR_T == 2
    unit[2] = 0;
#else
    unit[1] = 0;
#endif

    for (cursor = wstr; *cursor != L'\0'; ) {
        unit_start = cursor;
#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = *cursor;
            unit[1] = 0;
            cursor++;
        }
#else
        unit[0] = *cursor;
        cursor++;
#endif
        if (wcstombs(encoded, unit, sizeof(encoded)) == (size_t)-1)
            return unit_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3234
Victor Stinner1b579672011-12-17 05:47:23 +01003235static int
3236locale_error_handler(const char *errors, int *surrogateescape)
3237{
3238 if (errors == NULL) {
3239 *surrogateescape = 0;
3240 return 0;
3241 }
3242
3243 if (strcmp(errors, "strict") == 0) {
3244 *surrogateescape = 0;
3245 return 0;
3246 }
3247 if (strcmp(errors, "surrogateescape") == 0) {
3248 *surrogateescape = 1;
3249 return 0;
3250 }
3251 PyErr_Format(PyExc_ValueError,
3252 "only 'strict' and 'surrogateescape' error handlers "
3253 "are supported, not '%s'",
3254 errors);
3255 return -1;
3256}
3257
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003258PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003259PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003260{
3261 Py_ssize_t wlen, wlen2;
3262 wchar_t *wstr;
3263 PyObject *bytes = NULL;
3264 char *errmsg;
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003265 PyObject *reason;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003266 PyObject *exc;
3267 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003268 int surrogateescape;
3269
3270 if (locale_error_handler(errors, &surrogateescape) < 0)
3271 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003272
3273 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3274 if (wstr == NULL)
3275 return NULL;
3276
3277 wlen2 = wcslen(wstr);
3278 if (wlen2 != wlen) {
3279 PyMem_Free(wstr);
3280 PyErr_SetString(PyExc_TypeError, "embedded null character");
3281 return NULL;
3282 }
3283
3284 if (surrogateescape) {
3285 /* locale encoding with surrogateescape */
3286 char *str;
3287
3288 str = _Py_wchar2char(wstr, &error_pos);
3289 if (str == NULL) {
3290 if (error_pos == (size_t)-1) {
3291 PyErr_NoMemory();
3292 PyMem_Free(wstr);
3293 return NULL;
3294 }
3295 else {
3296 goto encode_error;
3297 }
3298 }
3299 PyMem_Free(wstr);
3300
3301 bytes = PyBytes_FromString(str);
3302 PyMem_Free(str);
3303 }
3304 else {
3305 size_t len, len2;
3306
3307 len = wcstombs(NULL, wstr, 0);
3308 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003309 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003310 goto encode_error;
3311 }
3312
3313 bytes = PyBytes_FromStringAndSize(NULL, len);
3314 if (bytes == NULL) {
3315 PyMem_Free(wstr);
3316 return NULL;
3317 }
3318
3319 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3320 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003321 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003322 goto encode_error;
3323 }
3324 PyMem_Free(wstr);
3325 }
3326 return bytes;
3327
3328encode_error:
3329 errmsg = strerror(errno);
3330 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003331
3332 if (error_pos == (size_t)-1)
3333 error_pos = wcstombs_errorpos(wstr);
3334
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003335 PyMem_Free(wstr);
3336 Py_XDECREF(bytes);
3337
Victor Stinner2f197072011-12-17 07:08:30 +01003338 if (errmsg != NULL) {
3339 size_t errlen;
3340 wstr = _Py_char2wchar(errmsg, &errlen);
3341 if (wstr != NULL) {
3342 reason = PyUnicode_FromWideChar(wstr, errlen);
3343 PyMem_Free(wstr);
3344 } else
3345 errmsg = NULL;
3346 }
3347 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003348 reason = PyUnicode_FromString(
3349 "wcstombs() encountered an unencodable "
3350 "wide character");
3351 if (reason == NULL)
3352 return NULL;
3353
3354 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3355 "locale", unicode,
3356 (Py_ssize_t)error_pos,
3357 (Py_ssize_t)(error_pos+1),
3358 reason);
3359 Py_DECREF(reason);
3360 if (exc != NULL) {
3361 PyCodec_StrictErrors(exc);
3362 Py_XDECREF(exc);
3363 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003364 return NULL;
3365}
3366
Victor Stinnerad158722010-10-27 00:25:46 +00003367PyObject *
3368PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003369{
Victor Stinner99b95382011-07-04 14:23:54 +02003370#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003371 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003372#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003373 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003374#else
Victor Stinner793b5312011-04-27 00:24:21 +02003375 PyInterpreterState *interp = PyThreadState_GET()->interp;
3376 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3377 cannot use it to encode and decode filenames before it is loaded. Load
3378 the Python codec requires to encode at least its own filename. Use the C
3379 version of the locale codec until the codec registry is initialized and
3380 the Python codec is loaded.
3381
3382 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3383 cannot only rely on it: check also interp->fscodec_initialized for
3384 subinterpreters. */
3385 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003386 return PyUnicode_AsEncodedString(unicode,
3387 Py_FileSystemDefaultEncoding,
3388 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003389 }
3390 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003391 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003392 }
Victor Stinnerad158722010-10-27 00:25:46 +00003393#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003394}
3395
Alexander Belopolsky40018472011-02-26 01:02:56 +00003396PyObject *
3397PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003398 const char *encoding,
3399 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003400{
3401 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003402 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003403
Guido van Rossumd57fd912000-03-10 22:53:23 +00003404 if (!PyUnicode_Check(unicode)) {
3405 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003406 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003407 }
Fred Drakee4315f52000-05-09 19:53:39 +00003408
Fred Drakee4315f52000-05-09 19:53:39 +00003409 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003410 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003411 if ((strcmp(lower, "utf-8") == 0) ||
3412 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003413 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003414 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003415 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003416 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003417 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003418 }
Victor Stinner37296e82010-06-10 13:36:23 +00003419 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003420 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003421 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003422 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003423#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003424 else if (strcmp(lower, "mbcs") == 0)
3425 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003426#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003427 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003428 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003429 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003430
3431 /* Encode via the codec registry */
3432 v = PyCodec_Encode(unicode, encoding, errors);
3433 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003434 return NULL;
3435
3436 /* The normal path */
3437 if (PyBytes_Check(v))
3438 return v;
3439
3440 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003441 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003442 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003443 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003444
3445 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3446 "encoder %s returned bytearray instead of bytes",
3447 encoding);
3448 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003449 Py_DECREF(v);
3450 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003451 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003452
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003453 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3454 Py_DECREF(v);
3455 return b;
3456 }
3457
3458 PyErr_Format(PyExc_TypeError,
3459 "encoder did not return a bytes object (type=%.400s)",
3460 Py_TYPE(v)->tp_name);
3461 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003462 return NULL;
3463}
3464
Alexander Belopolsky40018472011-02-26 01:02:56 +00003465PyObject *
3466PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003467 const char *encoding,
3468 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003469{
3470 PyObject *v;
3471
3472 if (!PyUnicode_Check(unicode)) {
3473 PyErr_BadArgument();
3474 goto onError;
3475 }
3476
3477 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003478 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003479
3480 /* Encode via the codec registry */
3481 v = PyCodec_Encode(unicode, encoding, errors);
3482 if (v == NULL)
3483 goto onError;
3484 if (!PyUnicode_Check(v)) {
3485 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003486 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003487 Py_TYPE(v)->tp_name);
3488 Py_DECREF(v);
3489 goto onError;
3490 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003491 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003492
Benjamin Peterson29060642009-01-31 22:14:21 +00003493 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003494 return NULL;
3495}
3496
/* Locate the first byte sequence in str (len bytes) that cannot be
   decoded with the current locale encoding.

   When mbrtowc() is available the string is decoded one character at a
   time; the byte offset of the first invalid or incomplete sequence is
   returned.  Return 0 if no such sequence is found, or if mbrtowc() is
   not available. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *start = str;
    mbstate_t mbs;
    size_t converted;
    wchar_t ch;

    /* Start from the initial conversion state. */
    memset(&mbs, 0, sizeof mbs);
    while (len)
    {
        converted = mbrtowc(&ch, str, len, &mbs);
        if (converted == 0)
            /* Reached end of string */
            break;
        if (converted == (size_t)-1 || converted == (size_t)-2) {
            /* Conversion error or incomplete character */
            return (size_t)(str - start);
        }
        str += converted;
        len -= converted;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    /* No way to scan the string: report position 0. */
    (void)str;
    (void)len;
    return 0;
#endif
}
3527
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003528PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003529PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003530 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003531{
3532 wchar_t smallbuf[256];
3533 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3534 wchar_t *wstr;
3535 size_t wlen, wlen2;
3536 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003537 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003538 size_t error_pos;
3539 char *errmsg;
3540 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003541
3542 if (locale_error_handler(errors, &surrogateescape) < 0)
3543 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003544
3545 if (str[len] != '\0' || len != strlen(str)) {
3546 PyErr_SetString(PyExc_TypeError, "embedded null character");
3547 return NULL;
3548 }
3549
3550 if (surrogateescape)
3551 {
3552 wstr = _Py_char2wchar(str, &wlen);
3553 if (wstr == NULL) {
3554 if (wlen == (size_t)-1)
3555 PyErr_NoMemory();
3556 else
3557 PyErr_SetFromErrno(PyExc_OSError);
3558 return NULL;
3559 }
3560
3561 unicode = PyUnicode_FromWideChar(wstr, wlen);
3562 PyMem_Free(wstr);
3563 }
3564 else {
3565#ifndef HAVE_BROKEN_MBSTOWCS
3566 wlen = mbstowcs(NULL, str, 0);
3567#else
3568 wlen = len;
3569#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003570 if (wlen == (size_t)-1)
3571 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003572 if (wlen+1 <= smallbuf_len) {
3573 wstr = smallbuf;
3574 }
3575 else {
3576 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3577 return PyErr_NoMemory();
3578
3579 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3580 if (!wstr)
3581 return PyErr_NoMemory();
3582 }
3583
3584 /* This shouldn't fail now */
3585 wlen2 = mbstowcs(wstr, str, wlen+1);
3586 if (wlen2 == (size_t)-1) {
3587 if (wstr != smallbuf)
3588 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003589 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003590 }
3591#ifdef HAVE_BROKEN_MBSTOWCS
3592 assert(wlen2 == wlen);
3593#endif
3594 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3595 if (wstr != smallbuf)
3596 PyMem_Free(wstr);
3597 }
3598 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003599
3600decode_error:
3601 errmsg = strerror(errno);
3602 assert(errmsg != NULL);
3603
3604 error_pos = mbstowcs_errorpos(str, len);
3605 if (errmsg != NULL) {
3606 size_t errlen;
3607 wstr = _Py_char2wchar(errmsg, &errlen);
3608 if (wstr != NULL) {
3609 reason = PyUnicode_FromWideChar(wstr, errlen);
3610 PyMem_Free(wstr);
3611 } else
3612 errmsg = NULL;
3613 }
3614 if (errmsg == NULL)
3615 reason = PyUnicode_FromString(
3616 "mbstowcs() encountered an invalid multibyte sequence");
3617 if (reason == NULL)
3618 return NULL;
3619
3620 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3621 "locale", str, len,
3622 (Py_ssize_t)error_pos,
3623 (Py_ssize_t)(error_pos+1),
3624 reason);
3625 Py_DECREF(reason);
3626 if (exc != NULL) {
3627 PyCodec_StrictErrors(exc);
3628 Py_XDECREF(exc);
3629 }
3630 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003631}
3632
3633PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003634PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003635{
3636 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003637 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003638}
3639
3640
3641PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003642PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003643 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003644 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3645}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003646
Christian Heimes5894ba72007-11-04 11:43:14 +00003647PyObject*
3648PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3649{
Victor Stinner99b95382011-07-04 14:23:54 +02003650#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003651 return PyUnicode_DecodeMBCS(s, size, NULL);
3652#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003653 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003654#else
Victor Stinner793b5312011-04-27 00:24:21 +02003655 PyInterpreterState *interp = PyThreadState_GET()->interp;
3656 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3657 cannot use it to encode and decode filenames before it is loaded. Load
3658 the Python codec requires to encode at least its own filename. Use the C
3659 version of the locale codec until the codec registry is initialized and
3660 the Python codec is loaded.
3661
3662 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3663 cannot only rely on it: check also interp->fscodec_initialized for
3664 subinterpreters. */
3665 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003666 return PyUnicode_Decode(s, size,
3667 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003668 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003669 }
3670 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003671 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003672 }
Victor Stinnerad158722010-10-27 00:25:46 +00003673#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003674}
3675
Martin v. Löwis011e8422009-05-05 04:43:17 +00003676
3677int
Antoine Pitrou13348842012-01-29 18:36:34 +01003678_PyUnicode_HasNULChars(PyObject* s)
3679{
3680 static PyObject *nul = NULL;
3681
3682 if (nul == NULL)
3683 nul = PyUnicode_FromStringAndSize("\0", 1);
3684 if (nul == NULL)
3685 return -1;
3686 return PyUnicode_Contains(s, nul);
3687}
3688
3689
3690int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003691PyUnicode_FSConverter(PyObject* arg, void* addr)
3692{
3693 PyObject *output = NULL;
3694 Py_ssize_t size;
3695 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003696 if (arg == NULL) {
3697 Py_DECREF(*(PyObject**)addr);
3698 return 1;
3699 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003700 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003701 output = arg;
3702 Py_INCREF(output);
3703 }
3704 else {
3705 arg = PyUnicode_FromObject(arg);
3706 if (!arg)
3707 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003708 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003709 Py_DECREF(arg);
3710 if (!output)
3711 return 0;
3712 if (!PyBytes_Check(output)) {
3713 Py_DECREF(output);
3714 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3715 return 0;
3716 }
3717 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003718 size = PyBytes_GET_SIZE(output);
3719 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003720 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003721 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003722 Py_DECREF(output);
3723 return 0;
3724 }
3725 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003726 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003727}
3728
3729
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003730int
3731PyUnicode_FSDecoder(PyObject* arg, void* addr)
3732{
3733 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003734 if (arg == NULL) {
3735 Py_DECREF(*(PyObject**)addr);
3736 return 1;
3737 }
3738 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003739 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003740 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003741 output = arg;
3742 Py_INCREF(output);
3743 }
3744 else {
3745 arg = PyBytes_FromObject(arg);
3746 if (!arg)
3747 return 0;
3748 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3749 PyBytes_GET_SIZE(arg));
3750 Py_DECREF(arg);
3751 if (!output)
3752 return 0;
3753 if (!PyUnicode_Check(output)) {
3754 Py_DECREF(output);
3755 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3756 return 0;
3757 }
3758 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003759 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003760 Py_DECREF(output);
3761 return 0;
3762 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003763 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003764 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003765 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3766 Py_DECREF(output);
3767 return 0;
3768 }
3769 *(PyObject**)addr = output;
3770 return Py_CLEANUP_SUPPORTED;
3771}
3772
3773
Martin v. Löwis5b222132007-06-10 09:51:05 +00003774char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003775PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003776{
Christian Heimesf3863112007-11-22 07:46:41 +00003777 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003778
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003779 if (!PyUnicode_Check(unicode)) {
3780 PyErr_BadArgument();
3781 return NULL;
3782 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003783 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003784 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003785
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003786 if (PyUnicode_UTF8(unicode) == NULL) {
3787 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003788 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3789 if (bytes == NULL)
3790 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003791 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3792 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003793 Py_DECREF(bytes);
3794 return NULL;
3795 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003796 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3797 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3798 PyBytes_AS_STRING(bytes),
3799 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003800 Py_DECREF(bytes);
3801 }
3802
3803 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003804 *psize = PyUnicode_UTF8_LENGTH(unicode);
3805 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003806}
3807
3808char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003809PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003810{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003811 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3812}
3813
#ifdef Py_DEBUG
/* Debug-build counter: incremented by PyUnicode_AsUnicodeAndSize() each time
   it has to build a wchar_t representation that the object did not have. */
static int unicode_as_unicode_calls = 0;
#endif
3817
3818
3819Py_UNICODE *
3820PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3821{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003822 const unsigned char *one_byte;
3823#if SIZEOF_WCHAR_T == 4
3824 const Py_UCS2 *two_bytes;
3825#else
3826 const Py_UCS4 *four_bytes;
3827 const Py_UCS4 *ucs4_end;
3828 Py_ssize_t num_surrogates;
3829#endif
3830 wchar_t *w;
3831 wchar_t *wchar_end;
3832
3833 if (!PyUnicode_Check(unicode)) {
3834 PyErr_BadArgument();
3835 return NULL;
3836 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003837 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003838 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003839 assert(_PyUnicode_KIND(unicode) != 0);
3840 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003841
3842#ifdef Py_DEBUG
3843 ++unicode_as_unicode_calls;
3844#endif
3845
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003846 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003847#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003848 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3849 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003850 num_surrogates = 0;
3851
3852 for (; four_bytes < ucs4_end; ++four_bytes) {
3853 if (*four_bytes > 0xFFFF)
3854 ++num_surrogates;
3855 }
3856
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003857 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3858 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3859 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003860 PyErr_NoMemory();
3861 return NULL;
3862 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003863 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003864
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003865 w = _PyUnicode_WSTR(unicode);
3866 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3867 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003868 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3869 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003870 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003871 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003872 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3873 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003874 }
3875 else
3876 *w = *four_bytes;
3877
3878 if (w > wchar_end) {
3879 assert(0 && "Miscalculated string end");
3880 }
3881 }
3882 *w = 0;
3883#else
3884 /* sizeof(wchar_t) == 4 */
3885 Py_FatalError("Impossible unicode object state, wstr and str "
3886 "should share memory already.");
3887 return NULL;
3888#endif
3889 }
3890 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003891 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3892 (_PyUnicode_LENGTH(unicode) + 1));
3893 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003894 PyErr_NoMemory();
3895 return NULL;
3896 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003897 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3898 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3899 w = _PyUnicode_WSTR(unicode);
3900 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003901
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003902 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3903 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003904 for (; w < wchar_end; ++one_byte, ++w)
3905 *w = *one_byte;
3906 /* null-terminate the wstr */
3907 *w = 0;
3908 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003909 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003910#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003911 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003912 for (; w < wchar_end; ++two_bytes, ++w)
3913 *w = *two_bytes;
3914 /* null-terminate the wstr */
3915 *w = 0;
3916#else
3917 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003918 PyObject_FREE(_PyUnicode_WSTR(unicode));
3919 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003920 Py_FatalError("Impossible unicode object state, wstr "
3921 "and str should share memory already.");
3922 return NULL;
3923#endif
3924 }
3925 else {
3926 assert(0 && "This should never happen.");
3927 }
3928 }
3929 }
3930 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003931 *size = PyUnicode_WSTR_LENGTH(unicode);
3932 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003933}
3934
Alexander Belopolsky40018472011-02-26 01:02:56 +00003935Py_UNICODE *
3936PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003937{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003938 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003939}
3940
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003941
Alexander Belopolsky40018472011-02-26 01:02:56 +00003942Py_ssize_t
3943PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003944{
3945 if (!PyUnicode_Check(unicode)) {
3946 PyErr_BadArgument();
3947 goto onError;
3948 }
3949 return PyUnicode_GET_SIZE(unicode);
3950
Benjamin Peterson29060642009-01-31 22:14:21 +00003951 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003952 return -1;
3953}
3954
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003955Py_ssize_t
3956PyUnicode_GetLength(PyObject *unicode)
3957{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003958 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003959 PyErr_BadArgument();
3960 return -1;
3961 }
3962
3963 return PyUnicode_GET_LENGTH(unicode);
3964}
3965
3966Py_UCS4
3967PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3968{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003969 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3970 PyErr_BadArgument();
3971 return (Py_UCS4)-1;
3972 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003973 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003974 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003975 return (Py_UCS4)-1;
3976 }
3977 return PyUnicode_READ_CHAR(unicode, index);
3978}
3979
3980int
3981PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3982{
3983 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003984 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003985 return -1;
3986 }
Victor Stinner488fa492011-12-12 00:01:39 +01003987 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003988 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003989 PyErr_SetString(PyExc_IndexError, "string index out of range");
3990 return -1;
3991 }
Victor Stinner488fa492011-12-12 00:01:39 +01003992 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003993 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01003994 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3995 PyErr_SetString(PyExc_ValueError, "character out of range");
3996 return -1;
3997 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003998 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3999 index, ch);
4000 return 0;
4001}
4002
/* Return the name of the default encoding, which is always "utf-8".
   The returned string is statically allocated and must not be freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4008
Victor Stinner554f3f02010-06-16 23:33:54 +00004009/* create or adjust a UnicodeDecodeError */
4010static void
4011make_decode_exception(PyObject **exceptionObject,
4012 const char *encoding,
4013 const char *input, Py_ssize_t length,
4014 Py_ssize_t startpos, Py_ssize_t endpos,
4015 const char *reason)
4016{
4017 if (*exceptionObject == NULL) {
4018 *exceptionObject = PyUnicodeDecodeError_Create(
4019 encoding, input, length, startpos, endpos, reason);
4020 }
4021 else {
4022 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4023 goto onError;
4024 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4025 goto onError;
4026 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4027 goto onError;
4028 }
4029 return;
4030
4031onError:
4032 Py_DECREF(*exceptionObject);
4033 *exceptionObject = NULL;
4034}
4035
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004036/* error handling callback helper:
4037 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004038 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004039 and adjust various state variables.
4040 return 0 on success, -1 on error
4041*/
4042
Alexander Belopolsky40018472011-02-26 01:02:56 +00004043static int
4044unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004045 const char *encoding, const char *reason,
4046 const char **input, const char **inend, Py_ssize_t *startinpos,
4047 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004048 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004049{
Benjamin Peterson142957c2008-07-04 19:55:29 +00004050 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004051
4052 PyObject *restuple = NULL;
4053 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004054 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004055 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004056 Py_ssize_t requiredsize;
4057 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004058 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004059 int res = -1;
4060
Victor Stinner596a6c42011-11-09 00:02:18 +01004061 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
4062 outsize = PyUnicode_GET_LENGTH(*output);
4063 else
4064 outsize = _PyUnicode_WSTR_LENGTH(*output);
4065
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004066 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004067 *errorHandler = PyCodec_LookupError(errors);
4068 if (*errorHandler == NULL)
4069 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004070 }
4071
Victor Stinner554f3f02010-06-16 23:33:54 +00004072 make_decode_exception(exceptionObject,
4073 encoding,
4074 *input, *inend - *input,
4075 *startinpos, *endinpos,
4076 reason);
4077 if (*exceptionObject == NULL)
4078 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004079
4080 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4081 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004082 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004083 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00004084 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004085 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004086 }
4087 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004088 goto onError;
Benjamin Petersonbac79492012-01-14 13:34:47 -05004089 if (PyUnicode_READY(repunicode) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004090 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004091
4092 /* Copy back the bytes variables, which might have been modified by the
4093 callback */
4094 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4095 if (!inputobj)
4096 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004097 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004098 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004099 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004100 *input = PyBytes_AS_STRING(inputobj);
4101 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004102 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004103 /* we can DECREF safely, as the exception has another reference,
4104 so the object won't go away. */
4105 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004106
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004107 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004108 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004109 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004110 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4111 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004112 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004113
Victor Stinner596a6c42011-11-09 00:02:18 +01004114 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
4115 /* need more space? (at least enough for what we
4116 have+the replacement+the rest of the string (starting
4117 at the new input position), so we won't have to check space
4118 when there are no errors in the rest of the string) */
4119 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
4120 requiredsize = *outpos + replen + insize-newpos;
4121 if (requiredsize > outsize) {
4122 if (requiredsize<2*outsize)
4123 requiredsize = 2*outsize;
4124 if (unicode_resize(output, requiredsize) < 0)
4125 goto onError;
4126 }
4127 if (unicode_widen(output, PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004128 goto onError;
Victor Stinner596a6c42011-11-09 00:02:18 +01004129 copy_characters(*output, *outpos, repunicode, 0, replen);
4130 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004131 }
Victor Stinner596a6c42011-11-09 00:02:18 +01004132 else {
4133 wchar_t *repwstr;
4134 Py_ssize_t repwlen;
4135 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4136 if (repwstr == NULL)
4137 goto onError;
4138 /* need more space? (at least enough for what we
4139 have+the replacement+the rest of the string (starting
4140 at the new input position), so we won't have to check space
4141 when there are no errors in the rest of the string) */
4142 requiredsize = *outpos + repwlen + insize-newpos;
4143 if (requiredsize > outsize) {
4144 if (requiredsize < 2*outsize)
4145 requiredsize = 2*outsize;
4146 if (unicode_resize(output, requiredsize) < 0)
4147 goto onError;
4148 }
4149 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4150 *outpos += repwlen;
4151 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004152 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004153 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004154
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004155 /* we made it! */
4156 res = 0;
4157
Benjamin Peterson29060642009-01-31 22:14:21 +00004158 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004159 Py_XDECREF(restuple);
4160 return res;
4161}
4162
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  Encoding is conservative, decoding is liberal. */

/* Three small base-64 helper macros.  Each may evaluate its argument more
   than once, so pass only expressions without side effects. */

/* Is c one of the 64 characters of the base-64 alphabet? */

#define IS_BASE64(c)                    \
    ((c) == '+' || (c) == '/' ||        \
     ((c) >= '0' && (c) <= '9') ||      \
     ((c) >= 'a' && (c) <= 'z') ||      \
     ((c) >= 'A' && (c) <= 'Z'))

/* Base-64 value (0..63) of c.  Only meaningful when IS_BASE64(c) holds:
   every character outside the other groups is mapped to 63, as '/' is. */

#define FROM_BASE64(c)                                  \
    (((c) == '+') ? 62 :                                \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     63)

/* Base-64 character that encodes the low 6 bits of n. */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 63])

/* DECODE_DIRECT: should this byte of a UTF-7 string be decoded as itself?
 * Decoding is permissive: every ASCII byte stands for itself except '+',
 * which opens a base-64 section. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) <= 127)
4197
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is evaluated several times, so it must not have side effects.
 * directO and directWS are parenthesized in the expansion so that compound
 * expressions (e.g. "a || b") can be passed safely. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004243
Alexander Belopolsky40018472011-02-26 01:02:56 +00004244PyObject *
4245PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004246 Py_ssize_t size,
4247 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004248{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004249 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4250}
4251
Antoine Pitrou244651a2009-05-04 18:56:13 +00004252/* The decoder. The only state we preserve is our read position,
4253 * i.e. how many characters we have consumed. So if we end in the
4254 * middle of a shift sequence we have to back off the read position
4255 * and the output to the beginning of the sequence, otherwise we lose
4256 * all the shift state (seen bits, number of bits seen, high
4257 * surrogate). */
4258
Alexander Belopolsky40018472011-02-26 01:02:56 +00004259PyObject *
4260PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004261 Py_ssize_t size,
4262 const char *errors,
4263 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004264{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004265 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004266 Py_ssize_t startinpos;
4267 Py_ssize_t endinpos;
4268 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004269 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004270 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004271 const char *errmsg = "";
4272 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004273 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004274 unsigned int base64bits = 0;
4275 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004276 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004277 PyObject *errorHandler = NULL;
4278 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004279
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004280 /* Start off assuming it's all ASCII. Widen later as necessary. */
4281 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004282 if (!unicode)
4283 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004284 if (size == 0) {
4285 if (consumed)
4286 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004287 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004288 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004289
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004290 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004291 e = s + size;
4292
4293 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004294 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004295 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004296 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004297
Antoine Pitrou244651a2009-05-04 18:56:13 +00004298 if (inShift) { /* in a base-64 section */
4299 if (IS_BASE64(ch)) { /* consume a base-64 character */
4300 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4301 base64bits += 6;
4302 s++;
4303 if (base64bits >= 16) {
4304 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004305 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004306 base64bits -= 16;
4307 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4308 if (surrogate) {
4309 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004310 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4311 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004312 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
4313 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004314 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004315 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004316 }
4317 else {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004318 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4319 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004320 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004321 }
4322 }
Victor Stinner551ac952011-11-29 22:58:13 +01004323 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004324 /* first surrogate */
4325 surrogate = outCh;
4326 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004327 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004328 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
4329 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004330 }
4331 }
4332 }
4333 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004334 inShift = 0;
4335 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004336 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004337 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4338 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004339 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004340 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004341 if (base64bits > 0) { /* left-over bits */
4342 if (base64bits >= 6) {
4343 /* We've seen at least one base-64 character */
4344 errmsg = "partial character in shift sequence";
4345 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004346 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004347 else {
4348 /* Some bits remain; they should be zero */
4349 if (base64buffer != 0) {
4350 errmsg = "non-zero padding bits in shift sequence";
4351 goto utf7Error;
4352 }
4353 }
4354 }
4355 if (ch != '-') {
4356 /* '-' is absorbed; other terminating
4357 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004358 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4359 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004360 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004361 }
4362 }
4363 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004364 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004365 s++; /* consume '+' */
4366 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004367 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004368 if (unicode_putchar(&unicode, &outpos, '+') < 0)
4369 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004370 }
4371 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004372 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004373 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004374 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004375 }
4376 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004377 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004378 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4379 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004380 s++;
4381 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004382 else {
4383 startinpos = s-starts;
4384 s++;
4385 errmsg = "unexpected special character";
4386 goto utf7Error;
4387 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004388 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004389utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004390 endinpos = s-starts;
4391 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00004392 errors, &errorHandler,
4393 "utf7", errmsg,
4394 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004395 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004396 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004397 }
4398
Antoine Pitrou244651a2009-05-04 18:56:13 +00004399 /* end of string */
4400
4401 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4402 /* if we're in an inconsistent state, that's an error */
4403 if (surrogate ||
4404 (base64bits >= 6) ||
4405 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004406 endinpos = size;
4407 if (unicode_decode_call_errorhandler(
4408 errors, &errorHandler,
4409 "utf7", "unterminated shift sequence",
4410 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004411 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004412 goto onError;
4413 if (s < e)
4414 goto restart;
4415 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004416 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004417
4418 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004419 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004420 if (inShift) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004421 outpos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004422 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004423 }
4424 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004425 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004426 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004427 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004428
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004429 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004430 goto onError;
4431
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004432 Py_XDECREF(errorHandler);
4433 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004434 return unicode_result(unicode);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004435
Benjamin Peterson29060642009-01-31 22:14:21 +00004436 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004437 Py_XDECREF(errorHandler);
4438 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004439 Py_DECREF(unicode);
4440 return NULL;
4441}
4442
4443
Alexander Belopolsky40018472011-02-26 01:02:56 +00004444PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004445_PyUnicode_EncodeUTF7(PyObject *str,
4446 int base64SetO,
4447 int base64WhiteSpace,
4448 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004449{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004450 int kind;
4451 void *data;
4452 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004453 PyObject *v;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004454 Py_ssize_t allocated;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004455 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004456 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004457 unsigned int base64bits = 0;
4458 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004459 char * out;
4460 char * start;
4461
Benjamin Petersonbac79492012-01-14 13:34:47 -05004462 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004463 return NULL;
4464 kind = PyUnicode_KIND(str);
4465 data = PyUnicode_DATA(str);
4466 len = PyUnicode_GET_LENGTH(str);
4467
4468 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004469 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004470
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004471 /* It might be possible to tighten this worst case */
4472 allocated = 8 * len;
4473 if (allocated / 8 != len)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004474 return PyErr_NoMemory();
4475
Antoine Pitrou244651a2009-05-04 18:56:13 +00004476 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004477 if (v == NULL)
4478 return NULL;
4479
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004480 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004481 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004482 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004483
Antoine Pitrou244651a2009-05-04 18:56:13 +00004484 if (inShift) {
4485 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4486 /* shifting out */
4487 if (base64bits) { /* output remaining bits */
4488 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4489 base64buffer = 0;
4490 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004491 }
4492 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004493 /* Characters not in the BASE64 set implicitly unshift the sequence
4494 so no '-' is required, except if the character is itself a '-' */
4495 if (IS_BASE64(ch) || ch == '-') {
4496 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004497 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004498 *out++ = (char) ch;
4499 }
4500 else {
4501 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004502 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004503 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004504 else { /* not in a shift sequence */
4505 if (ch == '+') {
4506 *out++ = '+';
4507 *out++ = '-';
4508 }
4509 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4510 *out++ = (char) ch;
4511 }
4512 else {
4513 *out++ = '+';
4514 inShift = 1;
4515 goto encode_char;
4516 }
4517 }
4518 continue;
4519encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004520 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004521 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004522
Antoine Pitrou244651a2009-05-04 18:56:13 +00004523 /* code first surrogate */
4524 base64bits += 16;
4525 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4526 while (base64bits >= 6) {
4527 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4528 base64bits -= 6;
4529 }
4530 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004531 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004532 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004533 base64bits += 16;
4534 base64buffer = (base64buffer << 16) | ch;
4535 while (base64bits >= 6) {
4536 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4537 base64bits -= 6;
4538 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004539 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004540 if (base64bits)
4541 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4542 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004543 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004544 if (_PyBytes_Resize(&v, out - start) < 0)
4545 return NULL;
4546 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004547}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004548PyObject *
4549PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4550 Py_ssize_t size,
4551 int base64SetO,
4552 int base64WhiteSpace,
4553 const char *errors)
4554{
4555 PyObject *result;
4556 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4557 if (tmp == NULL)
4558 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004559 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004560 base64WhiteSpace, errors);
4561 Py_DECREF(tmp);
4562 return result;
4563}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004564
Antoine Pitrou244651a2009-05-04 18:56:13 +00004565#undef IS_BASE64
4566#undef FROM_BASE64
4567#undef TO_BASE64
4568#undef DECODE_DIRECT
4569#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004570
Guido van Rossumd57fd912000-03-10 22:53:23 +00004571/* --- UTF-8 Codec -------------------------------------------------------- */
4572
/* Map a UTF-8 encoded prefix (first) byte to the length in bytes of the
   sequence it introduces.  Zero means illegal prefix: continuation bytes
   80-BF, the overlong leads C0-C1, and F5-FF.  See RFC 3629 for details.
   The table is only ever read, so it is declared const. */
static const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4594
Alexander Belopolsky40018472011-02-26 01:02:56 +00004595PyObject *
4596PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004597 Py_ssize_t size,
4598 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004599{
Walter Dörwald69652032004-09-07 20:24:22 +00004600 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4601}
4602
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004603#include "stringlib/ucs1lib.h"
4604#include "stringlib/codecs.h"
4605#include "stringlib/undef.h"
4606
4607#include "stringlib/ucs2lib.h"
4608#include "stringlib/codecs.h"
4609#include "stringlib/undef.h"
4610
4611#include "stringlib/ucs4lib.h"
4612#include "stringlib/codecs.h"
4613#include "stringlib/undef.h"
4614
Antoine Pitrouab868312009-01-10 15:40:25 +00004615/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4616#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4617
4618/* Mask to quickly check whether a C 'long' contains a
4619 non-ASCII, UTF8-encoded char. */
4620#if (SIZEOF_LONG == 8)
4621# define ASCII_CHAR_MASK 0x8080808080808080L
4622#elif (SIZEOF_LONG == 4)
4623# define ASCII_CHAR_MASK 0x80808080L
4624#else
4625# error C 'long' size should be either 4 or 8!
4626#endif
4627
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004628/* Scans a UTF-8 string and returns the maximum character to be expected
4629 and the size of the decoded unicode string.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004630
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004631 This function doesn't check for errors, these checks are performed in
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004632 PyUnicode_DecodeUTF8Stateful.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004633 */
4634static Py_UCS4
Victor Stinnera1d12bb2011-12-11 21:53:09 +01004635utf8_scanner(const unsigned char *p, Py_ssize_t string_size, Py_ssize_t *unicode_size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004636{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004637 Py_ssize_t char_count = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004638 const unsigned char *end = p + string_size;
4639 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004640
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004641 assert(unicode_size != NULL);
4642
4643 /* By having a cascade of independent loops which fallback onto each
4644 other, we minimize the amount of work done in the average loop
4645 iteration, and we also maximize the CPU's ability to predict
4646 branches correctly (because a given condition will have always the
4647 same boolean outcome except perhaps in the last iteration of the
4648 corresponding loop).
4649 In the general case this brings us rather close to decoding
4650 performance pre-PEP 393, despite the two-pass decoding.
4651
4652 Note that the pure ASCII loop is not duplicated once a non-ASCII
4653 character has been encountered. It is actually a pessimization (by
4654 a significant factor) to use this loop on text with many non-ASCII
4655 characters, and it is important to avoid bad performance on valid
4656 utf-8 data (invalid utf-8 being a different can of worms).
4657 */
4658
4659 /* ASCII */
4660 for (; p < end; ++p) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004661 /* Only check value if it's not a ASCII char... */
4662 if (*p < 0x80) {
4663 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4664 an explanation. */
4665 if (!((size_t) p & LONG_PTR_MASK)) {
4666 /* Help register allocation */
4667 register const unsigned char *_p = p;
4668 while (_p < aligned_end) {
4669 unsigned long value = *(unsigned long *) _p;
4670 if (value & ASCII_CHAR_MASK)
4671 break;
4672 _p += SIZEOF_LONG;
4673 char_count += SIZEOF_LONG;
4674 }
4675 p = _p;
4676 if (p == end)
4677 break;
4678 }
4679 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004680 if (*p < 0x80)
4681 ++char_count;
4682 else
4683 goto _ucs1loop;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004684 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004685 *unicode_size = char_count;
4686 return 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004687
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004688_ucs1loop:
4689 for (; p < end; ++p) {
4690 if (*p < 0xc4)
4691 char_count += ((*p & 0xc0) != 0x80);
4692 else
4693 goto _ucs2loop;
4694 }
4695 *unicode_size = char_count;
4696 return 255;
4697
4698_ucs2loop:
4699 for (; p < end; ++p) {
4700 if (*p < 0xf0)
4701 char_count += ((*p & 0xc0) != 0x80);
4702 else
4703 goto _ucs4loop;
4704 }
4705 *unicode_size = char_count;
4706 return 65535;
4707
4708_ucs4loop:
4709 for (; p < end; ++p) {
4710 char_count += ((*p & 0xc0) != 0x80);
4711 }
4712 *unicode_size = char_count;
4713 return 65537;
4714}
4715
/* Similar to PyUnicode_WRITE but may attempt to widen and resize the string
   in case of errors: stores `value` at position `index` of the string
   being built, first growing it (with over-allocation) when `index` lies
   beyond its current length.
   Implicit parameters: the variable `unicode` (which may be replaced) and
   the label `onError` (jumped to on failure).
   Potential resizing overallocates, so the result needs to shrink at the end.
   `index` is evaluated exactly once; both arguments are parenthesized and
   the temporary has a macro-specific name so it cannot capture a caller's
   variable.
*/
#define WRITE_MAYBE_FAIL(index, value)                                  \
    do {                                                                \
        Py_ssize_t _wmf_pos = (index);                                  \
        if (_wmf_pos > PyUnicode_GET_LENGTH(unicode) &&                 \
            unicode_resize(&unicode, _wmf_pos + _wmf_pos/8) < 0)        \
            goto onError;                                               \
        if (unicode_putchar(&unicode, &_wmf_pos, (value)) < 0)          \
            goto onError;                                               \
    } while (0)
4729
Victor Stinnerbf6e5602011-12-12 01:53:47 +01004730static PyObject *
Victor Stinner785938e2011-12-11 20:09:03 +01004731decode_utf8_errors(const char *starts,
4732 Py_ssize_t size,
4733 const char *errors,
4734 Py_ssize_t *consumed,
4735 const char *s,
4736 PyObject *unicode,
4737 Py_ssize_t i)
Walter Dörwald69652032004-09-07 20:24:22 +00004738{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004739 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004740 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004741 Py_ssize_t startinpos;
4742 Py_ssize_t endinpos;
Victor Stinner785938e2011-12-11 20:09:03 +01004743 const char *e = starts + size;
4744 const char *aligned_end;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004745 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004746 PyObject *errorHandler = NULL;
4747 PyObject *exc = NULL;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004748
Antoine Pitrouab868312009-01-10 15:40:25 +00004749 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004750
4751 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004752 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004753
4754 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004755 /* Fast path for runs of ASCII characters. Given that common UTF-8
4756 input will consist of an overwhelming majority of ASCII
4757 characters, we try to optimize for this case by checking
4758 as many characters as a C 'long' can contain.
4759 First, check if we can do an aligned read, as most CPUs have
4760 a penalty for unaligned reads.
4761 */
4762 if (!((size_t) s & LONG_PTR_MASK)) {
4763 /* Help register allocation */
4764 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004765 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004766 while (_s < aligned_end) {
4767 /* Read a whole long at a time (either 4 or 8 bytes),
4768 and do a fast unrolled copy if it only contains ASCII
4769 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004770 unsigned long value = *(unsigned long *) _s;
4771 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004772 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004773 WRITE_MAYBE_FAIL(_i+0, _s[0]);
4774 WRITE_MAYBE_FAIL(_i+1, _s[1]);
4775 WRITE_MAYBE_FAIL(_i+2, _s[2]);
4776 WRITE_MAYBE_FAIL(_i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004777#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004778 WRITE_MAYBE_FAIL(_i+4, _s[4]);
4779 WRITE_MAYBE_FAIL(_i+5, _s[5]);
4780 WRITE_MAYBE_FAIL(_i+6, _s[6]);
4781 WRITE_MAYBE_FAIL(_i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004782#endif
4783 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004784 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004785 }
4786 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004787 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004788 if (s == e)
4789 break;
4790 ch = (unsigned char)*s;
4791 }
4792 }
4793
4794 if (ch < 0x80) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004795 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004796 s++;
4797 continue;
4798 }
4799
4800 n = utf8_code_length[ch];
4801
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004802 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004803 if (consumed)
4804 break;
4805 else {
4806 errmsg = "unexpected end of data";
4807 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004808 endinpos = startinpos+1;
4809 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4810 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004811 goto utf8Error;
4812 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004813 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004814
4815 switch (n) {
4816
4817 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004818 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004819 startinpos = s-starts;
4820 endinpos = startinpos+1;
4821 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004822
4823 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004824 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004825 startinpos = s-starts;
4826 endinpos = startinpos+1;
4827 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004828
4829 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004830 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004831 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004832 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004833 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004834 goto utf8Error;
4835 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004836 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004837 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004838 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004839 break;
4840
4841 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004842 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4843 will result in surrogates in range d800-dfff. Surrogates are
4844 not valid UTF-8 so they are rejected.
4845 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4846 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004847 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004848 (s[2] & 0xc0) != 0x80 ||
4849 ((unsigned char)s[0] == 0xE0 &&
4850 (unsigned char)s[1] < 0xA0) ||
4851 ((unsigned char)s[0] == 0xED &&
4852 (unsigned char)s[1] > 0x9F)) {
4853 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004854 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004855 endinpos = startinpos + 1;
4856
4857 /* if s[1] first two bits are 1 and 0, then the invalid
4858 continuation byte is s[2], so increment endinpos by 1,
4859 if not, s[1] is invalid and endinpos doesn't need to
4860 be incremented. */
4861 if ((s[1] & 0xC0) == 0x80)
4862 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004863 goto utf8Error;
4864 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004865 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004866 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004867 WRITE_MAYBE_FAIL(i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004868 break;
4869
4870 case 4:
4871 if ((s[1] & 0xc0) != 0x80 ||
4872 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004873 (s[3] & 0xc0) != 0x80 ||
4874 ((unsigned char)s[0] == 0xF0 &&
4875 (unsigned char)s[1] < 0x90) ||
4876 ((unsigned char)s[0] == 0xF4 &&
4877 (unsigned char)s[1] > 0x8F)) {
4878 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004879 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004880 endinpos = startinpos + 1;
4881 if ((s[1] & 0xC0) == 0x80) {
4882 endinpos++;
4883 if ((s[2] & 0xC0) == 0x80)
4884 endinpos++;
4885 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004886 goto utf8Error;
4887 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004888 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004889 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Victor Stinner8faf8212011-12-08 22:14:11 +01004890 assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
Ezio Melotti57221d02010-07-01 07:32:02 +00004891
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004892 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004893 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004894 }
4895 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004896 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004897
Benjamin Peterson29060642009-01-31 22:14:21 +00004898 utf8Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00004899 if (unicode_decode_call_errorhandler(
4900 errors, &errorHandler,
Victor Stinnercbe01342012-02-14 01:17:45 +01004901 "utf-8", errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00004902 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004903 &unicode, &i))
Benjamin Peterson29060642009-01-31 22:14:21 +00004904 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004905 /* Update data because unicode_decode_call_errorhandler might have
4906 re-created or resized the unicode object. */
Benjamin Peterson29060642009-01-31 22:14:21 +00004907 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004908 }
Walter Dörwald69652032004-09-07 20:24:22 +00004909 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004910 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004911
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004912 /* Adjust length and ready string when it contained errors and
4913 is of the old resizable kind. */
Victor Stinner785938e2011-12-11 20:09:03 +01004914 if (unicode_resize(&unicode, i) < 0)
4915 goto onError;
4916 unicode_adjust_maxchar(&unicode);
4917 if (unicode == NULL)
4918 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004919
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004920 Py_XDECREF(errorHandler);
4921 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004922 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01004923 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004924
Benjamin Peterson29060642009-01-31 22:14:21 +00004925 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004926 Py_XDECREF(errorHandler);
4927 Py_XDECREF(exc);
Victor Stinner785938e2011-12-11 20:09:03 +01004928 Py_XDECREF(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004929 return NULL;
4930}
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004931#undef WRITE_MAYBE_FAIL
Antoine Pitrouab868312009-01-10 15:40:25 +00004932
Victor Stinner785938e2011-12-11 20:09:03 +01004933PyObject *
4934PyUnicode_DecodeUTF8Stateful(const char *s,
4935 Py_ssize_t size,
4936 const char *errors,
4937 Py_ssize_t *consumed)
4938{
4939 Py_UCS4 maxchar = 0;
4940 Py_ssize_t unicode_size;
4941 int has_errors = 0;
4942 PyObject *unicode;
4943 int kind;
4944 void *data;
4945 const char *starts = s;
4946 const char *e;
4947 Py_ssize_t i;
4948
4949 if (size == 0) {
4950 if (consumed)
4951 *consumed = 0;
Victor Stinner382955f2011-12-11 21:44:00 +01004952 Py_INCREF(unicode_empty);
4953 return unicode_empty;
Victor Stinner785938e2011-12-11 20:09:03 +01004954 }
4955
Victor Stinnera1d12bb2011-12-11 21:53:09 +01004956 maxchar = utf8_scanner((const unsigned char *)s, size, &unicode_size);
Victor Stinner785938e2011-12-11 20:09:03 +01004957
4958 /* When the string is ASCII only, just use memcpy and return.
4959 unicode_size may be != size if there is an incomplete UTF-8
4960 sequence at the end of the ASCII block. */
4961 if (maxchar < 128 && size == unicode_size) {
4962 if (consumed)
4963 *consumed = size;
Victor Stinnerab870212011-12-17 22:39:43 +01004964 return unicode_fromascii((const unsigned char *)s, size);
Victor Stinner785938e2011-12-11 20:09:03 +01004965 }
4966
4967 unicode = PyUnicode_New(unicode_size, maxchar);
4968 if (!unicode)
4969 return NULL;
4970 kind = PyUnicode_KIND(unicode);
4971 data = PyUnicode_DATA(unicode);
4972
4973 /* Unpack UTF-8 encoded data */
4974 i = 0;
4975 e = starts + size;
4976 switch (kind) {
4977 case PyUnicode_1BYTE_KIND:
4978 has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
4979 break;
4980 case PyUnicode_2BYTE_KIND:
4981 has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
4982 break;
4983 case PyUnicode_4BYTE_KIND:
4984 has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
4985 break;
4986 }
4987 if (!has_errors) {
4988 /* Ensure the unicode size calculation was correct */
4989 assert(i == unicode_size);
4990 assert(s == e);
4991 if (consumed)
4992 *consumed = size;
4993 return unicode;
4994 }
4995
4996 /* In case of errors, maxchar and size computation might be incorrect;
4997 code below refits and resizes as necessary. */
4998 return decode_utf8_errors(starts, size, errors, consumed, s, unicode, i);
4999}
5000
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005001#ifdef __APPLE__
5002
5003/* Simplified UTF-8 decoder using surrogateescape error handler,
5004 used to decode the command line arguments on Mac OS X. */
5005
5006wchar_t*
5007_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
5008{
5009 int n;
5010 const char *e;
5011 wchar_t *unicode, *p;
5012
5013 /* Note: size will always be longer than the resulting Unicode
5014 character count */
5015 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
5016 PyErr_NoMemory();
5017 return NULL;
5018 }
5019 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
5020 if (!unicode)
5021 return NULL;
5022
5023 /* Unpack UTF-8 encoded data */
5024 p = unicode;
5025 e = s + size;
5026 while (s < e) {
5027 Py_UCS4 ch = (unsigned char)*s;
5028
5029 if (ch < 0x80) {
5030 *p++ = (wchar_t)ch;
5031 s++;
5032 continue;
5033 }
5034
5035 n = utf8_code_length[ch];
5036 if (s + n > e) {
5037 goto surrogateescape;
5038 }
5039
5040 switch (n) {
5041 case 0:
5042 case 1:
5043 goto surrogateescape;
5044
5045 case 2:
5046 if ((s[1] & 0xc0) != 0x80)
5047 goto surrogateescape;
5048 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
5049 assert ((ch > 0x007F) && (ch <= 0x07FF));
5050 *p++ = (wchar_t)ch;
5051 break;
5052
5053 case 3:
5054 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
5055 will result in surrogates in range d800-dfff. Surrogates are
5056 not valid UTF-8 so they are rejected.
5057 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
5058 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
5059 if ((s[1] & 0xc0) != 0x80 ||
5060 (s[2] & 0xc0) != 0x80 ||
5061 ((unsigned char)s[0] == 0xE0 &&
5062 (unsigned char)s[1] < 0xA0) ||
5063 ((unsigned char)s[0] == 0xED &&
5064 (unsigned char)s[1] > 0x9F)) {
5065
5066 goto surrogateescape;
5067 }
5068 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5069 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005070 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005071 break;
5072
5073 case 4:
5074 if ((s[1] & 0xc0) != 0x80 ||
5075 (s[2] & 0xc0) != 0x80 ||
5076 (s[3] & 0xc0) != 0x80 ||
5077 ((unsigned char)s[0] == 0xF0 &&
5078 (unsigned char)s[1] < 0x90) ||
5079 ((unsigned char)s[0] == 0xF4 &&
5080 (unsigned char)s[1] > 0x8F)) {
5081 goto surrogateescape;
5082 }
5083 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
5084 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Victor Stinner8faf8212011-12-08 22:14:11 +01005085 assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005086
5087#if SIZEOF_WCHAR_T == 4
5088 *p++ = (wchar_t)ch;
5089#else
5090 /* compute and append the two surrogates: */
Victor Stinner551ac952011-11-29 22:58:13 +01005091 *p++ = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5092 *p++ = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005093#endif
5094 break;
5095 }
5096 s += n;
5097 continue;
5098
5099 surrogateescape:
5100 *p++ = 0xDC00 + ch;
5101 s++;
5102 }
5103 *p = L'\0';
5104 return unicode;
5105}
5106
5107#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00005108
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005109/* Primary internal function which creates utf8 encoded bytes objects.
5110
5111 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005112 and allocate exactly as much space needed at the end. Else allocate the
5113 maximum possible needed (4 result bytes per Unicode character), and return
5114 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005115*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005116PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005117_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005118{
Victor Stinner6099a032011-12-18 14:22:26 +01005119 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005120 void *data;
5121 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005122
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005123 if (!PyUnicode_Check(unicode)) {
5124 PyErr_BadArgument();
5125 return NULL;
5126 }
5127
5128 if (PyUnicode_READY(unicode) == -1)
5129 return NULL;
5130
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005131 if (PyUnicode_UTF8(unicode))
5132 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5133 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005134
5135 kind = PyUnicode_KIND(unicode);
5136 data = PyUnicode_DATA(unicode);
5137 size = PyUnicode_GET_LENGTH(unicode);
5138
Benjamin Petersonead6b532011-12-20 17:23:42 -06005139 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005140 default:
5141 assert(0);
5142 case PyUnicode_1BYTE_KIND:
5143 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5144 assert(!PyUnicode_IS_ASCII(unicode));
5145 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5146 case PyUnicode_2BYTE_KIND:
5147 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5148 case PyUnicode_4BYTE_KIND:
5149 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005150 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005151}
5152
Alexander Belopolsky40018472011-02-26 01:02:56 +00005153PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005154PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5155 Py_ssize_t size,
5156 const char *errors)
5157{
5158 PyObject *v, *unicode;
5159
5160 unicode = PyUnicode_FromUnicode(s, size);
5161 if (unicode == NULL)
5162 return NULL;
5163 v = _PyUnicode_AsUTF8String(unicode, errors);
5164 Py_DECREF(unicode);
5165 return v;
5166}
5167
5168PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005169PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005170{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005171 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005172}
5173
Walter Dörwald41980ca2007-08-16 21:55:45 +00005174/* --- UTF-32 Codec ------------------------------------------------------- */
5175
5176PyObject *
5177PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005178 Py_ssize_t size,
5179 const char *errors,
5180 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005181{
5182 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5183}
5184
5185PyObject *
5186PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005187 Py_ssize_t size,
5188 const char *errors,
5189 int *byteorder,
5190 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005191{
5192 const char *starts = s;
5193 Py_ssize_t startinpos;
5194 Py_ssize_t endinpos;
5195 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005196 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005197 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005198 int bo = 0; /* assume native ordering by default */
5199 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005200 /* Offsets from q for retrieving bytes in the right order. */
5201#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5202 int iorder[] = {0, 1, 2, 3};
5203#else
5204 int iorder[] = {3, 2, 1, 0};
5205#endif
5206 PyObject *errorHandler = NULL;
5207 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005208
Walter Dörwald41980ca2007-08-16 21:55:45 +00005209 q = (unsigned char *)s;
5210 e = q + size;
5211
5212 if (byteorder)
5213 bo = *byteorder;
5214
5215 /* Check for BOM marks (U+FEFF) in the input and adjust current
5216 byte order setting accordingly. In native mode, the leading BOM
5217 mark is skipped, in all other modes, it is copied to the output
5218 stream as-is (giving a ZWNBSP character). */
5219 if (bo == 0) {
5220 if (size >= 4) {
5221 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00005222 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005223#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005224 if (bom == 0x0000FEFF) {
5225 q += 4;
5226 bo = -1;
5227 }
5228 else if (bom == 0xFFFE0000) {
5229 q += 4;
5230 bo = 1;
5231 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005232#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005233 if (bom == 0x0000FEFF) {
5234 q += 4;
5235 bo = 1;
5236 }
5237 else if (bom == 0xFFFE0000) {
5238 q += 4;
5239 bo = -1;
5240 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005241#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005242 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005243 }
5244
5245 if (bo == -1) {
5246 /* force LE */
5247 iorder[0] = 0;
5248 iorder[1] = 1;
5249 iorder[2] = 2;
5250 iorder[3] = 3;
5251 }
5252 else if (bo == 1) {
5253 /* force BE */
5254 iorder[0] = 3;
5255 iorder[1] = 2;
5256 iorder[2] = 1;
5257 iorder[3] = 0;
5258 }
5259
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005260 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005261 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005262 if (!unicode)
5263 return NULL;
5264 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005265 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005266 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005267
Walter Dörwald41980ca2007-08-16 21:55:45 +00005268 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005269 Py_UCS4 ch;
5270 /* remaining bytes at the end? (size should be divisible by 4) */
5271 if (e-q<4) {
5272 if (consumed)
5273 break;
5274 errmsg = "truncated data";
5275 startinpos = ((const char *)q)-starts;
5276 endinpos = ((const char *)e)-starts;
5277 goto utf32Error;
5278 /* The remaining input chars are ignored if the callback
5279 chooses to skip the input */
5280 }
5281 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5282 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005283
Benjamin Peterson29060642009-01-31 22:14:21 +00005284 if (ch >= 0x110000)
5285 {
5286 errmsg = "codepoint not in range(0x110000)";
5287 startinpos = ((const char *)q)-starts;
5288 endinpos = startinpos+4;
5289 goto utf32Error;
5290 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005291 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5292 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005293 q += 4;
5294 continue;
5295 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005296 if (unicode_decode_call_errorhandler(
5297 errors, &errorHandler,
5298 "utf32", errmsg,
5299 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005300 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005301 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005302 }
5303
5304 if (byteorder)
5305 *byteorder = bo;
5306
5307 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005308 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005309
5310 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005311 if (unicode_resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005312 goto onError;
5313
5314 Py_XDECREF(errorHandler);
5315 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005316 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005317
Benjamin Peterson29060642009-01-31 22:14:21 +00005318 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005319 Py_DECREF(unicode);
5320 Py_XDECREF(errorHandler);
5321 Py_XDECREF(exc);
5322 return NULL;
5323}
5324
5325PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005326_PyUnicode_EncodeUTF32(PyObject *str,
5327 const char *errors,
5328 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005329{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005330 int kind;
5331 void *data;
5332 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005333 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005334 unsigned char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005335 Py_ssize_t nsize, bytesize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005336 /* Offsets from p for storing byte pairs in the right order. */
5337#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5338 int iorder[] = {0, 1, 2, 3};
5339#else
5340 int iorder[] = {3, 2, 1, 0};
5341#endif
5342
Benjamin Peterson29060642009-01-31 22:14:21 +00005343#define STORECHAR(CH) \
5344 do { \
5345 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5346 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5347 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5348 p[iorder[0]] = (CH) & 0xff; \
5349 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005350 } while(0)
5351
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005352 if (!PyUnicode_Check(str)) {
5353 PyErr_BadArgument();
5354 return NULL;
5355 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005356 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005357 return NULL;
5358 kind = PyUnicode_KIND(str);
5359 data = PyUnicode_DATA(str);
5360 len = PyUnicode_GET_LENGTH(str);
5361
5362 nsize = len + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005363 bytesize = nsize * 4;
5364 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005365 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005366 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005367 if (v == NULL)
5368 return NULL;
5369
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005370 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005371 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005372 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005373 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005374 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005375
5376 if (byteorder == -1) {
5377 /* force LE */
5378 iorder[0] = 0;
5379 iorder[1] = 1;
5380 iorder[2] = 2;
5381 iorder[3] = 3;
5382 }
5383 else if (byteorder == 1) {
5384 /* force BE */
5385 iorder[0] = 3;
5386 iorder[1] = 2;
5387 iorder[2] = 1;
5388 iorder[3] = 0;
5389 }
5390
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005391 for (i = 0; i < len; i++)
5392 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005393
5394 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005395 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005396#undef STORECHAR
5397}
5398
Alexander Belopolsky40018472011-02-26 01:02:56 +00005399PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005400PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5401 Py_ssize_t size,
5402 const char *errors,
5403 int byteorder)
5404{
5405 PyObject *result;
5406 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5407 if (tmp == NULL)
5408 return NULL;
5409 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5410 Py_DECREF(tmp);
5411 return result;
5412}
5413
5414PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005415PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005416{
Victor Stinnerb960b342011-11-20 19:12:52 +01005417 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005418}
5419
Guido van Rossumd57fd912000-03-10 22:53:23 +00005420/* --- UTF-16 Codec ------------------------------------------------------- */
5421
Tim Peters772747b2001-08-09 22:21:55 +00005422PyObject *
5423PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005424 Py_ssize_t size,
5425 const char *errors,
5426 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005427{
Walter Dörwald69652032004-09-07 20:24:22 +00005428 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5429}
5430
/* Masks used by the UTF-16 decoder's fast path, which processes a whole
   C 'long' at a time.

   FAST_CHAR_MASK and SWAPPED_FAST_CHAR_MASK allow fast checking of whether
   a C 'long' may contain UTF16-encoded surrogate characters.  This is an
   efficient heuristic, assuming that non-surrogate characters with a code
   point >= 0x8000 are rare in most input.
   FAST_CHAR_MASK is used when the input is in native byte ordering,
   SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.

   STRIPPED_MASK selects the low byte of every 16-bit unit; it is used to
   byte-swap all the units of a 'long' at once.
*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005438#if (SIZEOF_LONG == 8)
5439# define FAST_CHAR_MASK 0x8000800080008000L
5440# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
Victor Stinnerafb52052012-04-05 22:54:49 +02005441# define STRIPPED_MASK 0x00FF00FF00FF00FFL
Antoine Pitrouab868312009-01-10 15:40:25 +00005442#elif (SIZEOF_LONG == 4)
5443# define FAST_CHAR_MASK 0x80008000L
5444# define SWAPPED_FAST_CHAR_MASK 0x00800080L
Victor Stinnerafb52052012-04-05 22:54:49 +02005445# define STRIPPED_MASK 0x00FF00FFL
Antoine Pitrouab868312009-01-10 15:40:25 +00005446#else
5447# error C 'long' size should be either 4 or 8!
5448#endif
5449
Walter Dörwald69652032004-09-07 20:24:22 +00005450PyObject *
5451PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005452 Py_ssize_t size,
5453 const char *errors,
5454 int *byteorder,
5455 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005456{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005457 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005458 Py_ssize_t startinpos;
5459 Py_ssize_t endinpos;
5460 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005461 PyObject *unicode;
Antoine Pitrouab868312009-01-10 15:40:25 +00005462 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005463 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005464 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005465 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005466 /* Offsets from q for retrieving byte pairs in the right order. */
5467#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5468 int ihi = 1, ilo = 0;
5469#else
5470 int ihi = 0, ilo = 1;
5471#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005472 PyObject *errorHandler = NULL;
5473 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005474
5475 /* Note: size will always be longer than the resulting Unicode
5476 character count */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005477 unicode = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005478 if (!unicode)
5479 return NULL;
5480 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005481 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005482 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005483
Tim Peters772747b2001-08-09 22:21:55 +00005484 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005485 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005486
5487 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005488 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005489
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005490 /* Check for BOM marks (U+FEFF) in the input and adjust current
5491 byte order setting accordingly. In native mode, the leading BOM
5492 mark is skipped, in all other modes, it is copied to the output
5493 stream as-is (giving a ZWNBSP character). */
5494 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005495 if (size >= 2) {
Victor Stinner24729f32011-11-10 20:31:37 +01005496 const Py_UCS4 bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005497#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005498 if (bom == 0xFEFF) {
5499 q += 2;
5500 bo = -1;
5501 }
5502 else if (bom == 0xFFFE) {
5503 q += 2;
5504 bo = 1;
5505 }
Tim Petersced69f82003-09-16 20:30:58 +00005506#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005507 if (bom == 0xFEFF) {
5508 q += 2;
5509 bo = 1;
5510 }
5511 else if (bom == 0xFFFE) {
5512 q += 2;
5513 bo = -1;
5514 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005515#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005516 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005517 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005518
Tim Peters772747b2001-08-09 22:21:55 +00005519 if (bo == -1) {
5520 /* force LE */
5521 ihi = 1;
5522 ilo = 0;
5523 }
5524 else if (bo == 1) {
5525 /* force BE */
5526 ihi = 0;
5527 ilo = 1;
5528 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005529#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5530 native_ordering = ilo < ihi;
5531#else
5532 native_ordering = ilo > ihi;
5533#endif
Tim Peters772747b2001-08-09 22:21:55 +00005534
Antoine Pitrouab868312009-01-10 15:40:25 +00005535 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005536 while (q < e) {
Victor Stinner24729f32011-11-10 20:31:37 +01005537 Py_UCS4 ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005538 /* First check for possible aligned read of a C 'long'. Unaligned
5539 reads are more expensive, better to defer to another iteration. */
5540 if (!((size_t) q & LONG_PTR_MASK)) {
5541 /* Fast path for runs of non-surrogate chars. */
5542 register const unsigned char *_q = q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005543 int kind = PyUnicode_KIND(unicode);
5544 void *data = PyUnicode_DATA(unicode);
5545 while (_q < aligned_end) {
Victor Stinnerafb52052012-04-05 22:54:49 +02005546 unsigned long block = * (unsigned long *) _q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005547 Py_UCS4 maxch;
5548 if (native_ordering) {
5549 /* Can use buffer directly */
Victor Stinnerafb52052012-04-05 22:54:49 +02005550 if (block & FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005551 break;
Antoine Pitrouab868312009-01-10 15:40:25 +00005552 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005553 else {
5554 /* Need to byte-swap */
Victor Stinnerafb52052012-04-05 22:54:49 +02005555 if (block & SWAPPED_FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005556 break;
Victor Stinnerafb52052012-04-05 22:54:49 +02005557 block = ((block >> 8) & STRIPPED_MASK) |
5558 ((block & STRIPPED_MASK) << 8);
Antoine Pitrouab868312009-01-10 15:40:25 +00005559 }
Victor Stinnerafb52052012-04-05 22:54:49 +02005560 maxch = (Py_UCS2)(block & 0xFFFF);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005561#if SIZEOF_LONG == 8
Victor Stinnerafb52052012-04-05 22:54:49 +02005562 ch = (Py_UCS2)((block >> 16) & 0xFFFF);
5563 maxch = Py_MAX(maxch, ch);
5564 ch = (Py_UCS2)((block >> 32) & 0xFFFF);
5565 maxch = Py_MAX(maxch, ch);
5566 ch = (Py_UCS2)(block >> 48);
5567 maxch = Py_MAX(maxch, ch);
5568#else
5569 ch = (Py_UCS2)(block >> 16);
5570 maxch = Py_MAX(maxch, ch);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005571#endif
5572 if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
5573 if (unicode_widen(&unicode, maxch) < 0)
5574 goto onError;
5575 kind = PyUnicode_KIND(unicode);
5576 data = PyUnicode_DATA(unicode);
5577 }
Victor Stinnerafb52052012-04-05 22:54:49 +02005578#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5579 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block & 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005580#if SIZEOF_LONG == 8
Victor Stinnerafb52052012-04-05 22:54:49 +02005581 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 16) & 0xFFFF));
5582 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 32) & 0xFFFF));
5583 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 48)));
5584#else
5585 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block >> 16));
5586#endif
5587#else
5588#if SIZEOF_LONG == 8
5589 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 48)));
5590 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 32) & 0xFFFF));
5591 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 16) & 0xFFFF));
5592#else
5593 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block >> 16));
5594#endif
5595 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block & 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005596#endif
5597 _q += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00005598 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005599 q = _q;
5600 if (q >= e)
5601 break;
5602 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005603 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005604
Benjamin Peterson14339b62009-01-31 16:36:08 +00005605 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005606
Victor Stinner551ac952011-11-29 22:58:13 +01005607 if (!Py_UNICODE_IS_SURROGATE(ch)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005608 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5609 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005610 continue;
5611 }
5612
5613 /* UTF-16 code pair: */
5614 if (q > e) {
5615 errmsg = "unexpected end of data";
5616 startinpos = (((const char *)q) - 2) - starts;
5617 endinpos = ((const char *)e) + 1 - starts;
5618 goto utf16Error;
5619 }
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005620 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
5621 Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
Benjamin Peterson29060642009-01-31 22:14:21 +00005622 q += 2;
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005623 if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
Victor Stinner62aa4d02011-11-09 00:03:45 +01005624 if (unicode_putchar(&unicode, &outpos,
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005625 Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005626 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005627 continue;
5628 }
5629 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005630 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005631 startinpos = (((const char *)q)-4)-starts;
5632 endinpos = startinpos+2;
5633 goto utf16Error;
5634 }
5635
Benjamin Peterson14339b62009-01-31 16:36:08 +00005636 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005637 errmsg = "illegal encoding";
5638 startinpos = (((const char *)q)-2)-starts;
5639 endinpos = startinpos+2;
5640 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005641
Benjamin Peterson29060642009-01-31 22:14:21 +00005642 utf16Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005643 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005644 errors,
5645 &errorHandler,
5646 "utf16", errmsg,
5647 &starts,
5648 (const char **)&e,
5649 &startinpos,
5650 &endinpos,
5651 &exc,
5652 (const char **)&q,
5653 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005654 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005655 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005656 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005657 /* remaining byte at the end? (size should be even) */
5658 if (e == q) {
5659 if (!consumed) {
5660 errmsg = "truncated data";
5661 startinpos = ((const char *)q) - starts;
5662 endinpos = ((const char *)e) + 1 - starts;
Antoine Pitrouab868312009-01-10 15:40:25 +00005663 if (unicode_decode_call_errorhandler(
5664 errors,
5665 &errorHandler,
5666 "utf16", errmsg,
5667 &starts,
5668 (const char **)&e,
5669 &startinpos,
5670 &endinpos,
5671 &exc,
5672 (const char **)&q,
5673 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005674 &outpos))
Antoine Pitrouab868312009-01-10 15:40:25 +00005675 goto onError;
5676 /* The remaining input chars are ignored if the callback
5677 chooses to skip the input */
5678 }
5679 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005680
5681 if (byteorder)
5682 *byteorder = bo;
5683
Walter Dörwald69652032004-09-07 20:24:22 +00005684 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005685 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005686
Guido van Rossumd57fd912000-03-10 22:53:23 +00005687 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005688 if (unicode_resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005689 goto onError;
5690
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005691 Py_XDECREF(errorHandler);
5692 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005693 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694
Benjamin Peterson29060642009-01-31 22:14:21 +00005695 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005696 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005697 Py_XDECREF(errorHandler);
5698 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699 return NULL;
5700}
5701
Antoine Pitrouab868312009-01-10 15:40:25 +00005702#undef FAST_CHAR_MASK
5703#undef SWAPPED_FAST_CHAR_MASK
5704
Tim Peters772747b2001-08-09 22:21:55 +00005705PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005706_PyUnicode_EncodeUTF16(PyObject *str,
5707 const char *errors,
5708 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005709{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005710 int kind;
5711 void *data;
5712 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005713 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005714 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005715 Py_ssize_t nsize, bytesize;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005716 Py_ssize_t i, pairs;
Tim Peters772747b2001-08-09 22:21:55 +00005717 /* Offsets from p for storing byte pairs in the right order. */
5718#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5719 int ihi = 1, ilo = 0;
5720#else
5721 int ihi = 0, ilo = 1;
5722#endif
5723
Benjamin Peterson29060642009-01-31 22:14:21 +00005724#define STORECHAR(CH) \
5725 do { \
5726 p[ihi] = ((CH) >> 8) & 0xff; \
5727 p[ilo] = (CH) & 0xff; \
5728 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005729 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005730
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005731 if (!PyUnicode_Check(str)) {
5732 PyErr_BadArgument();
5733 return NULL;
5734 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005735 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005736 return NULL;
5737 kind = PyUnicode_KIND(str);
5738 data = PyUnicode_DATA(str);
5739 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005740
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005741 pairs = 0;
5742 if (kind == PyUnicode_4BYTE_KIND)
5743 for (i = 0; i < len; i++)
5744 if (PyUnicode_READ(kind, data, i) >= 0x10000)
5745 pairs++;
5746 /* 2 * (len + pairs + (byteorder == 0)) */
5747 if (len > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005748 return PyErr_NoMemory();
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005749 nsize = len + pairs + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005750 bytesize = nsize * 2;
5751 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005752 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005753 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005754 if (v == NULL)
5755 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005756
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005757 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005758 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005759 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005760 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005761 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005762
5763 if (byteorder == -1) {
5764 /* force LE */
5765 ihi = 1;
5766 ilo = 0;
5767 }
5768 else if (byteorder == 1) {
5769 /* force BE */
5770 ihi = 0;
5771 ilo = 1;
5772 }
5773
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005774 for (i = 0; i < len; i++) {
5775 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5776 Py_UCS4 ch2 = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +00005777 if (ch >= 0x10000) {
Victor Stinner551ac952011-11-29 22:58:13 +01005778 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
5779 ch = Py_UNICODE_HIGH_SURROGATE(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00005780 }
Tim Peters772747b2001-08-09 22:21:55 +00005781 STORECHAR(ch);
5782 if (ch2)
5783 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005784 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005785
5786 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005787 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005788#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005789}
5790
Alexander Belopolsky40018472011-02-26 01:02:56 +00005791PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005792PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5793 Py_ssize_t size,
5794 const char *errors,
5795 int byteorder)
5796{
5797 PyObject *result;
5798 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5799 if (tmp == NULL)
5800 return NULL;
5801 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5802 Py_DECREF(tmp);
5803 return result;
5804}
5805
5806PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005807PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005808{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005809 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005810}
5811
5812/* --- Unicode Escape Codec ----------------------------------------------- */
5813
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005814/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5815 if all the escapes in the string make it still a valid ASCII string.
5816 Returns -1 if any escapes were found which cause the string to
5817 pop out of ASCII range. Otherwise returns the length of the
5818 required buffer to hold the string.
5819 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005820static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005821length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5822{
5823 const unsigned char *p = (const unsigned char *)s;
5824 const unsigned char *end = p + size;
5825 Py_ssize_t length = 0;
5826
5827 if (size < 0)
5828 return -1;
5829
5830 for (; p < end; ++p) {
5831 if (*p > 127) {
5832 /* Non-ASCII */
5833 return -1;
5834 }
5835 else if (*p != '\\') {
5836 /* Normal character */
5837 ++length;
5838 }
5839 else {
5840 /* Backslash-escape, check next char */
5841 ++p;
5842 /* Escape sequence reaches till end of string or
5843 non-ASCII follow-up. */
5844 if (p >= end || *p > 127)
5845 return -1;
5846 switch (*p) {
5847 case '\n':
5848 /* backslash + \n result in zero characters */
5849 break;
5850 case '\\': case '\'': case '\"':
5851 case 'b': case 'f': case 't':
5852 case 'n': case 'r': case 'v': case 'a':
5853 ++length;
5854 break;
5855 case '0': case '1': case '2': case '3':
5856 case '4': case '5': case '6': case '7':
5857 case 'x': case 'u': case 'U': case 'N':
5858 /* these do not guarantee ASCII characters */
5859 return -1;
5860 default:
5861 /* count the backslash + the other character */
5862 length += 2;
5863 }
5864 }
5865 }
5866 return length;
5867}
5868
Fredrik Lundh06d12682001-01-24 07:59:11 +00005869static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005870
Alexander Belopolsky40018472011-02-26 01:02:56 +00005871PyObject *
5872PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005873 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005874 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005875{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005876 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005877 Py_ssize_t startinpos;
5878 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005879 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005880 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005881 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005882 char* message;
5883 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005884 PyObject *errorHandler = NULL;
5885 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005886 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005887 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005888
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005889 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005890
5891 /* After length_of_escaped_ascii_string() there are two alternatives,
5892 either the string is pure ASCII with named escapes like \n, etc.
5893 and we determined it's exact size (common case)
5894 or it contains \x, \u, ... escape sequences. then we create a
5895 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005896 if (len >= 0) {
5897 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005898 if (!v)
5899 goto onError;
5900 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005901 }
5902 else {
5903 /* Escaped strings will always be longer than the resulting
5904 Unicode string, so we start with size here and then reduce the
5905 length after conversion to the true value.
5906 (but if the error callback returns a long replacement string
5907 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005908 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005909 if (!v)
5910 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005911 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005912 }
5913
Guido van Rossumd57fd912000-03-10 22:53:23 +00005914 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005915 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005916 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005917 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005918
Guido van Rossumd57fd912000-03-10 22:53:23 +00005919 while (s < end) {
5920 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005921 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005922 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005923
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005924 /* The only case in which i == ascii_length is a backslash
5925 followed by a newline. */
5926 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005927
Guido van Rossumd57fd912000-03-10 22:53:23 +00005928 /* Non-escape characters are interpreted as Unicode ordinals */
5929 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005930 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5931 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005932 continue;
5933 }
5934
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005935 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005936 /* \ - Escapes */
5937 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005938 c = *s++;
5939 if (s > end)
5940 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005941
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005942 /* The only case in which i == ascii_length is a backslash
5943 followed by a newline. */
5944 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005945
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005946 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005947
Benjamin Peterson29060642009-01-31 22:14:21 +00005948 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005949#define WRITECHAR(ch) \
5950 do { \
5951 if (unicode_putchar(&v, &i, ch) < 0) \
5952 goto onError; \
5953 }while(0)
5954
Guido van Rossumd57fd912000-03-10 22:53:23 +00005955 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005956 case '\\': WRITECHAR('\\'); break;
5957 case '\'': WRITECHAR('\''); break;
5958 case '\"': WRITECHAR('\"'); break;
5959 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005960 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005961 case 'f': WRITECHAR('\014'); break;
5962 case 't': WRITECHAR('\t'); break;
5963 case 'n': WRITECHAR('\n'); break;
5964 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005965 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005966 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005967 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005968 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005969
Benjamin Peterson29060642009-01-31 22:14:21 +00005970 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005971 case '0': case '1': case '2': case '3':
5972 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005973 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005974 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005975 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005976 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005977 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005978 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005979 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005980 break;
5981
Benjamin Peterson29060642009-01-31 22:14:21 +00005982 /* hex escapes */
5983 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005984 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005985 digits = 2;
5986 message = "truncated \\xXX escape";
5987 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005988
Benjamin Peterson29060642009-01-31 22:14:21 +00005989 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005990 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005991 digits = 4;
5992 message = "truncated \\uXXXX escape";
5993 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005994
Benjamin Peterson29060642009-01-31 22:14:21 +00005995 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005996 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005997 digits = 8;
5998 message = "truncated \\UXXXXXXXX escape";
5999 hexescape:
6000 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006001 if (s+digits>end) {
6002 endinpos = size;
6003 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006004 errors, &errorHandler,
6005 "unicodeescape", "end of string in escape sequence",
6006 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006007 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006008 goto onError;
6009 goto nextByte;
6010 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006011 for (j = 0; j < digits; ++j) {
6012 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00006013 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006014 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006015 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006016 errors, &errorHandler,
6017 "unicodeescape", message,
6018 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006019 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00006020 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006021 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006022 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00006023 }
6024 chr = (chr<<4) & ~0xF;
6025 if (c >= '0' && c <= '9')
6026 chr += c - '0';
6027 else if (c >= 'a' && c <= 'f')
6028 chr += 10 + c - 'a';
6029 else
6030 chr += 10 + c - 'A';
6031 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006032 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00006033 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006034 /* _decoding_error will have already written into the
6035 target buffer. */
6036 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006037 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00006038 /* when we get here, chr is a 32-bit unicode character */
Victor Stinner8faf8212011-12-08 22:14:11 +01006039 if (chr <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006040 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00006041 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006042 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006043 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006044 errors, &errorHandler,
6045 "unicodeescape", "illegal Unicode character",
6046 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006047 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00006048 goto onError;
6049 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006050 break;
6051
Benjamin Peterson29060642009-01-31 22:14:21 +00006052 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006053 case 'N':
6054 message = "malformed \\N character escape";
6055 if (ucnhash_CAPI == NULL) {
6056 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006057 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6058 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00006059 if (ucnhash_CAPI == NULL)
6060 goto ucnhashError;
6061 }
6062 if (*s == '{') {
6063 const char *start = s+1;
6064 /* look for the closing brace */
6065 while (*s != '}' && s < end)
6066 s++;
6067 if (s > start && s < end && *s == '}') {
6068 /* found a name. look it up in the unicode database */
6069 message = "unknown Unicode character name";
6070 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006071 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03006072 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00006073 goto store;
6074 }
6075 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006076 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006077 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006078 errors, &errorHandler,
6079 "unicodeescape", message,
6080 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006081 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00006082 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006083 break;
6084
6085 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00006086 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006087 message = "\\ at end of string";
6088 s--;
6089 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006090 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006091 errors, &errorHandler,
6092 "unicodeescape", message,
6093 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006094 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00006095 goto onError;
6096 }
6097 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006098 WRITECHAR('\\');
6099 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00006100 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006101 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006102 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006103 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006104 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006105 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006106#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006107
Victor Stinner16e6a802011-12-12 13:24:15 +01006108 if (unicode_resize(&v, i) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006109 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006110 Py_XDECREF(errorHandler);
6111 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006112 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00006113
Benjamin Peterson29060642009-01-31 22:14:21 +00006114 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00006115 PyErr_SetString(
6116 PyExc_UnicodeError,
6117 "\\N escapes not supported (can't load unicodedata module)"
6118 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006119 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006120 Py_XDECREF(errorHandler);
6121 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00006122 return NULL;
6123
Benjamin Peterson29060642009-01-31 22:14:21 +00006124 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006125 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006126 Py_XDECREF(errorHandler);
6127 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006128 return NULL;
6129}
6130
6131/* Return a Unicode-Escape string version of the Unicode object.
6132
6133 If quotes is true, the string is enclosed in u"" or u'' quotes as
6134 appropriate.
6135
6136*/
6137
Alexander Belopolsky40018472011-02-26 01:02:56 +00006138PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006139PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006140{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006141 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006142 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006143 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006144 int kind;
6145 void *data;
6146 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006147
Thomas Wouters89f507f2006-12-13 04:49:30 +00006148 /* Initial allocation is based on the longest-possible unichr
6149 escape.
6150
6151 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
6152 unichr, so in this case it's the longest unichr escape. In
6153 narrow (UTF-16) builds this is five chars per source unichr
6154 since there are two unichrs in the surrogate pair, so in narrow
6155 (UTF-16) builds it's not the longest unichr escape.
6156
6157 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
6158 so in the narrow (UTF-16) build case it's the longest unichr
6159 escape.
6160 */
6161
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006162 if (!PyUnicode_Check(unicode)) {
6163 PyErr_BadArgument();
6164 return NULL;
6165 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006166 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006167 return NULL;
6168 len = PyUnicode_GET_LENGTH(unicode);
6169 kind = PyUnicode_KIND(unicode);
6170 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06006171 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006172 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
6173 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
6174 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
6175 }
6176
6177 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006178 return PyBytes_FromStringAndSize(NULL, 0);
6179
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006180 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006181 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006182
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006183 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00006184 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006185 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00006186 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006187 if (repr == NULL)
6188 return NULL;
6189
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006190 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006191
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006192 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006193 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006194
Walter Dörwald79e913e2007-05-12 11:08:06 +00006195 /* Escape backslashes */
6196 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006197 *p++ = '\\';
6198 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00006199 continue;
Tim Petersced69f82003-09-16 20:30:58 +00006200 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006201
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006202 /* Map 21-bit characters to '\U00xxxxxx' */
6203 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006204 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006205 *p++ = '\\';
6206 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006207 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
6208 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
6209 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6210 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6211 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6212 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6213 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6214 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00006215 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006216 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006217
Guido van Rossumd57fd912000-03-10 22:53:23 +00006218 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006219 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006220 *p++ = '\\';
6221 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006222 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6223 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6224 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6225 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006226 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006227
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006228 /* Map special whitespace to '\t', \n', '\r' */
6229 else if (ch == '\t') {
6230 *p++ = '\\';
6231 *p++ = 't';
6232 }
6233 else if (ch == '\n') {
6234 *p++ = '\\';
6235 *p++ = 'n';
6236 }
6237 else if (ch == '\r') {
6238 *p++ = '\\';
6239 *p++ = 'r';
6240 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006241
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006242 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006243 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006244 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006245 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006246 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6247 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006248 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006249
Guido van Rossumd57fd912000-03-10 22:53:23 +00006250 /* Copy everything else as-is */
6251 else
6252 *p++ = (char) ch;
6253 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006254
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006255 assert(p - PyBytes_AS_STRING(repr) > 0);
6256 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6257 return NULL;
6258 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006259}
6260
Alexander Belopolsky40018472011-02-26 01:02:56 +00006261PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006262PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6263 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006264{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006265 PyObject *result;
6266 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6267 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006268 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006269 result = PyUnicode_AsUnicodeEscapeString(tmp);
6270 Py_DECREF(tmp);
6271 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006272}
6273
6274/* --- Raw Unicode Escape Codec ------------------------------------------- */
6275
Alexander Belopolsky40018472011-02-26 01:02:56 +00006276PyObject *
6277PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006278 Py_ssize_t size,
6279 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006280{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006281 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006282 Py_ssize_t startinpos;
6283 Py_ssize_t endinpos;
6284 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006285 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006286 const char *end;
6287 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006288 PyObject *errorHandler = NULL;
6289 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006290
Guido van Rossumd57fd912000-03-10 22:53:23 +00006291 /* Escaped strings will always be longer than the resulting
6292 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006293 length after conversion to the true value. (But decoding error
6294 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006295 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006296 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006297 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006298 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006299 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006300 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006301 end = s + size;
6302 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006303 unsigned char c;
6304 Py_UCS4 x;
6305 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006306 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006307
Benjamin Peterson29060642009-01-31 22:14:21 +00006308 /* Non-escape characters are interpreted as Unicode ordinals */
6309 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006310 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6311 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006312 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006313 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006314 startinpos = s-starts;
6315
6316 /* \u-escapes are only interpreted iff the number of leading
6317 backslashes if odd */
6318 bs = s;
6319 for (;s < end;) {
6320 if (*s != '\\')
6321 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006322 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6323 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006324 }
6325 if (((s - bs) & 1) == 0 ||
6326 s >= end ||
6327 (*s != 'u' && *s != 'U')) {
6328 continue;
6329 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006330 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006331 count = *s=='u' ? 4 : 8;
6332 s++;
6333
6334 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006335 for (x = 0, i = 0; i < count; ++i, ++s) {
6336 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006337 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006338 endinpos = s-starts;
6339 if (unicode_decode_call_errorhandler(
6340 errors, &errorHandler,
6341 "rawunicodeescape", "truncated \\uXXXX",
6342 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006343 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006344 goto onError;
6345 goto nextByte;
6346 }
6347 x = (x<<4) & ~0xF;
6348 if (c >= '0' && c <= '9')
6349 x += c - '0';
6350 else if (c >= 'a' && c <= 'f')
6351 x += 10 + c - 'a';
6352 else
6353 x += 10 + c - 'A';
6354 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006355 if (x <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006356 if (unicode_putchar(&v, &outpos, x) < 0)
6357 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006358 } else {
6359 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006360 if (unicode_decode_call_errorhandler(
6361 errors, &errorHandler,
6362 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006363 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006364 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006365 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006366 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006367 nextByte:
6368 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006369 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006370 if (unicode_resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006371 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006372 Py_XDECREF(errorHandler);
6373 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006374 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00006375
Benjamin Peterson29060642009-01-31 22:14:21 +00006376 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006377 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006378 Py_XDECREF(errorHandler);
6379 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006380 return NULL;
6381}
6382
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006383
Alexander Belopolsky40018472011-02-26 01:02:56 +00006384PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006385PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006386{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006387 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006388 char *p;
6389 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006390 Py_ssize_t expandsize, pos;
6391 int kind;
6392 void *data;
6393 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006394
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006395 if (!PyUnicode_Check(unicode)) {
6396 PyErr_BadArgument();
6397 return NULL;
6398 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006399 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006400 return NULL;
6401 kind = PyUnicode_KIND(unicode);
6402 data = PyUnicode_DATA(unicode);
6403 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006404 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6405 bytes, and 1 byte characters 4. */
6406 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006407
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006408 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006409 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006410
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006411 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006412 if (repr == NULL)
6413 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006414 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006415 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006416
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006417 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006418 for (pos = 0; pos < len; pos++) {
6419 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006420 /* Map 32-bit characters to '\Uxxxxxxxx' */
6421 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006422 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006423 *p++ = '\\';
6424 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006425 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6426 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6427 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6428 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6429 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6430 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6431 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6432 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006433 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006434 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006435 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006436 *p++ = '\\';
6437 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006438 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6439 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6440 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6441 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006442 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006443 /* Copy everything else as-is */
6444 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006445 *p++ = (char) ch;
6446 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006447
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006448 assert(p > q);
6449 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006450 return NULL;
6451 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006452}
6453
Alexander Belopolsky40018472011-02-26 01:02:56 +00006454PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006455PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6456 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006457{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006458 PyObject *result;
6459 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6460 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006461 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006462 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6463 Py_DECREF(tmp);
6464 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006465}
6466
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006467/* --- Unicode Internal Codec ------------------------------------------- */
6468
Alexander Belopolsky40018472011-02-26 01:02:56 +00006469PyObject *
6470_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006471 Py_ssize_t size,
6472 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006473{
6474 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006475 Py_ssize_t startinpos;
6476 Py_ssize_t endinpos;
6477 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006478 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006479 const char *end;
6480 const char *reason;
6481 PyObject *errorHandler = NULL;
6482 PyObject *exc = NULL;
6483
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006484 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006485 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006486 1))
6487 return NULL;
6488
Thomas Wouters89f507f2006-12-13 04:49:30 +00006489 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006490 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006491 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006492 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006493 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006494 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006495 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006496 end = s + size;
6497
6498 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006499 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006500 Py_UCS4 ch;
6501 /* We copy the raw representation one byte at a time because the
6502 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006503 ((char *) &uch)[0] = s[0];
6504 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006505#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006506 ((char *) &uch)[2] = s[2];
6507 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006508#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006509 ch = uch;
6510
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006511 /* We have to sanity check the raw data, otherwise doom looms for
6512 some malformed UCS-4 data. */
6513 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006514#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006515 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006516#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006517 end-s < Py_UNICODE_SIZE
6518 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006519 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006520 startinpos = s - starts;
6521 if (end-s < Py_UNICODE_SIZE) {
6522 endinpos = end-starts;
6523 reason = "truncated input";
6524 }
6525 else {
6526 endinpos = s - starts + Py_UNICODE_SIZE;
6527 reason = "illegal code point (> 0x10FFFF)";
6528 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006529 if (unicode_decode_call_errorhandler(
6530 errors, &errorHandler,
6531 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006532 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006533 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006534 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006535 continue;
6536 }
6537
6538 s += Py_UNICODE_SIZE;
6539#ifndef Py_UNICODE_WIDE
Victor Stinner551ac952011-11-29 22:58:13 +01006540 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006541 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006542 Py_UNICODE uch2;
6543 ((char *) &uch2)[0] = s[0];
6544 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006545 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006546 {
Victor Stinner551ac952011-11-29 22:58:13 +01006547 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006548 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006549 }
6550 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006551#endif
6552
6553 if (unicode_putchar(&v, &outpos, ch) < 0)
6554 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006555 }
6556
Victor Stinner16e6a802011-12-12 13:24:15 +01006557 if (unicode_resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006558 goto onError;
6559 Py_XDECREF(errorHandler);
6560 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006561 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006562
Benjamin Peterson29060642009-01-31 22:14:21 +00006563 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006564 Py_XDECREF(v);
6565 Py_XDECREF(errorHandler);
6566 Py_XDECREF(exc);
6567 return NULL;
6568}
6569
Guido van Rossumd57fd912000-03-10 22:53:23 +00006570/* --- Latin-1 Codec ------------------------------------------------------ */
6571
Alexander Belopolsky40018472011-02-26 01:02:56 +00006572PyObject *
6573PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006574 Py_ssize_t size,
6575 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006576{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006577 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006578 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006579}
6580
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006581/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006582static void
6583make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006584 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006585 PyObject *unicode,
6586 Py_ssize_t startpos, Py_ssize_t endpos,
6587 const char *reason)
6588{
6589 if (*exceptionObject == NULL) {
6590 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006591 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006592 encoding, unicode, startpos, endpos, reason);
6593 }
6594 else {
6595 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6596 goto onError;
6597 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6598 goto onError;
6599 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6600 goto onError;
6601 return;
6602 onError:
6603 Py_DECREF(*exceptionObject);
6604 *exceptionObject = NULL;
6605 }
6606}
6607
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006608/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006609static void
6610raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006611 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006612 PyObject *unicode,
6613 Py_ssize_t startpos, Py_ssize_t endpos,
6614 const char *reason)
6615{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006616 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006617 encoding, unicode, startpos, endpos, reason);
6618 if (*exceptionObject != NULL)
6619 PyCodec_StrictErrors(*exceptionObject);
6620}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006621
6622/* error handling callback helper:
6623 build arguments, call the callback and check the arguments,
6624 put the result into newpos and return the replacement string, which
6625 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006626static PyObject *
6627unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006628 PyObject **errorHandler,
6629 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006630 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006631 Py_ssize_t startpos, Py_ssize_t endpos,
6632 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006633{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006634 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006635 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006636 PyObject *restuple;
6637 PyObject *resunicode;
6638
6639 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006640 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006641 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006642 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006643 }
6644
Benjamin Petersonbac79492012-01-14 13:34:47 -05006645 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006646 return NULL;
6647 len = PyUnicode_GET_LENGTH(unicode);
6648
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006649 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006650 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006651 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006652 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006653
6654 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006655 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006656 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006657 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006658 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006659 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006660 Py_DECREF(restuple);
6661 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006662 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006663 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006664 &resunicode, newpos)) {
6665 Py_DECREF(restuple);
6666 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006667 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006668 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6669 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6670 Py_DECREF(restuple);
6671 return NULL;
6672 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006673 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006674 *newpos = len + *newpos;
6675 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006676 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6677 Py_DECREF(restuple);
6678 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006679 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006680 Py_INCREF(resunicode);
6681 Py_DECREF(restuple);
6682 return resunicode;
6683}
6684
Alexander Belopolsky40018472011-02-26 01:02:56 +00006685static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006686unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006687 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006688 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006689{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006690 /* input state */
6691 Py_ssize_t pos=0, size;
6692 int kind;
6693 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006694 /* output object */
6695 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006696 /* pointer into the output */
6697 char *str;
6698 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006699 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006700 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6701 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006702 PyObject *errorHandler = NULL;
6703 PyObject *exc = NULL;
6704 /* the following variable is used for caching string comparisons
6705 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6706 int known_errorHandler = -1;
6707
Benjamin Petersonbac79492012-01-14 13:34:47 -05006708 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006709 return NULL;
6710 size = PyUnicode_GET_LENGTH(unicode);
6711 kind = PyUnicode_KIND(unicode);
6712 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006713 /* allocate enough for a simple encoding without
6714 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006715 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006716 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006717 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006718 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006719 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006720 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006721 ressize = size;
6722
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006723 while (pos < size) {
6724 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006725
Benjamin Peterson29060642009-01-31 22:14:21 +00006726 /* can we encode this? */
6727 if (c<limit) {
6728 /* no overflow check, because we know that the space is enough */
6729 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006730 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006731 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006732 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006733 Py_ssize_t requiredsize;
6734 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006735 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006736 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006737 Py_ssize_t collstart = pos;
6738 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006739 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006740 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006741 ++collend;
6742 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6743 if (known_errorHandler==-1) {
6744 if ((errors==NULL) || (!strcmp(errors, "strict")))
6745 known_errorHandler = 1;
6746 else if (!strcmp(errors, "replace"))
6747 known_errorHandler = 2;
6748 else if (!strcmp(errors, "ignore"))
6749 known_errorHandler = 3;
6750 else if (!strcmp(errors, "xmlcharrefreplace"))
6751 known_errorHandler = 4;
6752 else
6753 known_errorHandler = 0;
6754 }
6755 switch (known_errorHandler) {
6756 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006757 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006758 goto onError;
6759 case 2: /* replace */
6760 while (collstart++<collend)
6761 *str++ = '?'; /* fall through */
6762 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006763 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006764 break;
6765 case 4: /* xmlcharrefreplace */
6766 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006767 /* determine replacement size */
6768 for (i = collstart, repsize = 0; i < collend; ++i) {
6769 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6770 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006771 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006772 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006773 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006774 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006775 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006776 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006777 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006778 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006779 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006780 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006781 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006782 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006783 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006784 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006785 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006786 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006787 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006788 if (requiredsize > ressize) {
6789 if (requiredsize<2*ressize)
6790 requiredsize = 2*ressize;
6791 if (_PyBytes_Resize(&res, requiredsize))
6792 goto onError;
6793 str = PyBytes_AS_STRING(res) + respos;
6794 ressize = requiredsize;
6795 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006796 /* generate replacement */
6797 for (i = collstart; i < collend; ++i) {
6798 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006799 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006800 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006801 break;
6802 default:
6803 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006804 encoding, reason, unicode, &exc,
6805 collstart, collend, &newpos);
6806 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006807 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006808 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006809 if (PyBytes_Check(repunicode)) {
6810 /* Directly copy bytes result to output. */
6811 repsize = PyBytes_Size(repunicode);
6812 if (repsize > 1) {
6813 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006814 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006815 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6816 Py_DECREF(repunicode);
6817 goto onError;
6818 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006819 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006820 ressize += repsize-1;
6821 }
6822 memcpy(str, PyBytes_AsString(repunicode), repsize);
6823 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006824 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006825 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006826 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006827 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006828 /* need more space? (at least enough for what we
6829 have+the replacement+the rest of the string, so
6830 we won't have to check space for encodable characters) */
6831 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006832 repsize = PyUnicode_GET_LENGTH(repunicode);
6833 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006834 if (requiredsize > ressize) {
6835 if (requiredsize<2*ressize)
6836 requiredsize = 2*ressize;
6837 if (_PyBytes_Resize(&res, requiredsize)) {
6838 Py_DECREF(repunicode);
6839 goto onError;
6840 }
6841 str = PyBytes_AS_STRING(res) + respos;
6842 ressize = requiredsize;
6843 }
6844 /* check if there is anything unencodable in the replacement
6845 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006846 for (i = 0; repsize-->0; ++i, ++str) {
6847 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006848 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006849 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006850 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006851 Py_DECREF(repunicode);
6852 goto onError;
6853 }
6854 *str = (char)c;
6855 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006856 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006857 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006858 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006859 }
6860 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006861 /* Resize if we allocated to much */
6862 size = str - PyBytes_AS_STRING(res);
6863 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006864 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006865 if (_PyBytes_Resize(&res, size) < 0)
6866 goto onError;
6867 }
6868
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006869 Py_XDECREF(errorHandler);
6870 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006871 return res;
6872
6873 onError:
6874 Py_XDECREF(res);
6875 Py_XDECREF(errorHandler);
6876 Py_XDECREF(exc);
6877 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006878}
6879
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006880/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006881PyObject *
6882PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006883 Py_ssize_t size,
6884 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006885{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006886 PyObject *result;
6887 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6888 if (unicode == NULL)
6889 return NULL;
6890 result = unicode_encode_ucs1(unicode, errors, 256);
6891 Py_DECREF(unicode);
6892 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006893}
6894
Alexander Belopolsky40018472011-02-26 01:02:56 +00006895PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006896_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006897{
6898 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006899 PyErr_BadArgument();
6900 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006901 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006902 if (PyUnicode_READY(unicode) == -1)
6903 return NULL;
6904 /* Fast path: if it is a one-byte string, construct
6905 bytes object directly. */
6906 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6907 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6908 PyUnicode_GET_LENGTH(unicode));
6909 /* Non-Latin-1 characters present. Defer to above function to
6910 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006911 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006912}
6913
6914PyObject*
6915PyUnicode_AsLatin1String(PyObject *unicode)
6916{
6917 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006918}
6919
6920/* --- 7-bit ASCII Codec -------------------------------------------------- */
6921
Alexander Belopolsky40018472011-02-26 01:02:56 +00006922PyObject *
6923PyUnicode_DecodeASCII(const char *s,
6924 Py_ssize_t size,
6925 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006926{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006927 const char *starts = s;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006928 PyObject *v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006929 int kind;
6930 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006931 Py_ssize_t startinpos;
6932 Py_ssize_t endinpos;
6933 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006934 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006935 int has_error;
6936 const unsigned char *p = (const unsigned char *)s;
6937 const unsigned char *end = p + size;
6938 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006939 PyObject *errorHandler = NULL;
6940 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006941
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006942 if (size == 0) {
6943 Py_INCREF(unicode_empty);
6944 return unicode_empty;
6945 }
6946
Guido van Rossumd57fd912000-03-10 22:53:23 +00006947 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006948 if (size == 1 && (unsigned char)s[0] < 128)
6949 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006950
Victor Stinner702c7342011-10-05 13:50:52 +02006951 has_error = 0;
6952 while (p < end && !has_error) {
6953 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6954 an explanation. */
6955 if (!((size_t) p & LONG_PTR_MASK)) {
6956 /* Help register allocation */
6957 register const unsigned char *_p = p;
6958 while (_p < aligned_end) {
6959 unsigned long value = *(unsigned long *) _p;
6960 if (value & ASCII_CHAR_MASK) {
6961 has_error = 1;
6962 break;
6963 }
6964 _p += SIZEOF_LONG;
6965 }
6966 if (_p == end)
6967 break;
6968 if (has_error)
6969 break;
6970 p = _p;
6971 }
6972 if (*p & 0x80) {
6973 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006974 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006975 }
6976 else {
6977 ++p;
6978 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006979 }
Victor Stinner702c7342011-10-05 13:50:52 +02006980 if (!has_error)
6981 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006982
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006983 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006984 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006985 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006986 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006987 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006988 kind = PyUnicode_KIND(v);
6989 data = PyUnicode_DATA(v);
6990 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006991 e = s + size;
6992 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006993 register unsigned char c = (unsigned char)*s;
6994 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006995 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006996 ++s;
6997 }
6998 else {
6999 startinpos = s-starts;
7000 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00007001 if (unicode_decode_call_errorhandler(
7002 errors, &errorHandler,
7003 "ascii", "ordinal not in range(128)",
7004 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007005 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00007006 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007007 kind = PyUnicode_KIND(v);
7008 data = PyUnicode_DATA(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00007009 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007010 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007011 if (unicode_resize(&v, outpos) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007012 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007013 Py_XDECREF(errorHandler);
7014 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007015 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01007016 return v;
Tim Petersced69f82003-09-16 20:30:58 +00007017
Benjamin Peterson29060642009-01-31 22:14:21 +00007018 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007019 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007020 Py_XDECREF(errorHandler);
7021 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007022 return NULL;
7023}
7024
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007025/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007026PyObject *
7027PyUnicode_EncodeASCII(const Py_UNICODE *p,
7028 Py_ssize_t size,
7029 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007030{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007031 PyObject *result;
7032 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7033 if (unicode == NULL)
7034 return NULL;
7035 result = unicode_encode_ucs1(unicode, errors, 128);
7036 Py_DECREF(unicode);
7037 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007038}
7039
Alexander Belopolsky40018472011-02-26 01:02:56 +00007040PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007041_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007042{
7043 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007044 PyErr_BadArgument();
7045 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007046 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007047 if (PyUnicode_READY(unicode) == -1)
7048 return NULL;
7049 /* Fast path: if it is an ASCII-only string, construct bytes object
7050 directly. Else defer to above function to raise the exception. */
7051 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
7052 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7053 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007054 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007055}
7056
7057PyObject *
7058PyUnicode_AsASCIIString(PyObject *unicode)
7059{
7060 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007061}
7062
Victor Stinner99b95382011-07-04 14:23:54 +02007063#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007064
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007065/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007066
/* NEED_RETRY is defined when an int is narrower than a size_t; the code
   page codecs below test it to decide whether input must be processed in
   int-sized chunks. */
#if SIZEOF_SIZE_T > SIZEOF_INT
#define NEED_RETRY
#endif

/* Provide the flag value when the included headers do not define it. */
#if !defined(WC_ERR_INVALID_CHARS)
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
7074
7075static char*
7076code_page_name(UINT code_page, PyObject **obj)
7077{
7078 *obj = NULL;
7079 if (code_page == CP_ACP)
7080 return "mbcs";
7081 if (code_page == CP_UTF7)
7082 return "CP_UTF7";
7083 if (code_page == CP_UTF8)
7084 return "CP_UTF8";
7085
7086 *obj = PyBytes_FromFormat("cp%u", code_page);
7087 if (*obj == NULL)
7088 return NULL;
7089 return PyBytes_AS_STRING(*obj);
7090}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007091
Alexander Belopolsky40018472011-02-26 01:02:56 +00007092static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007093is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007094{
7095 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02007096 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007097
Victor Stinner3a50e702011-10-18 21:21:00 +02007098 if (!IsDBCSLeadByteEx(code_page, *curr))
7099 return 0;
7100
7101 prev = CharPrevExA(code_page, s, curr, 0);
7102 if (prev == curr)
7103 return 1;
7104 /* FIXME: This code is limited to "true" double-byte encodings,
7105 as it assumes an incomplete character consists of a single
7106 byte. */
7107 if (curr - prev == 2)
7108 return 1;
7109 if (!IsDBCSLeadByteEx(code_page, *prev))
7110 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007111 return 0;
7112}
7113
Victor Stinner3a50e702011-10-18 21:21:00 +02007114static DWORD
7115decode_code_page_flags(UINT code_page)
7116{
7117 if (code_page == CP_UTF7) {
7118 /* The CP_UTF7 decoder only supports flags=0 */
7119 return 0;
7120 }
7121 else
7122 return MB_ERR_INVALID_CHARS;
7123}
7124
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007125/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007126 * Decode a byte string from a Windows code page into unicode object in strict
7127 * mode.
7128 *
7129 * Returns consumed size if succeed, returns -2 on decode error, or raise a
7130 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007131 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007132static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007133decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007134 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007135 const char *in,
7136 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007137{
Victor Stinner3a50e702011-10-18 21:21:00 +02007138 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007139 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007140 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007141
7142 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007143 assert(insize > 0);
7144 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7145 if (outsize <= 0)
7146 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007147
7148 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007149 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007150 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007151 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007152 if (*v == NULL)
7153 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007154 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007155 }
7156 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007157 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007158 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007159 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007160 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007161 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007162 }
7163
7164 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007165 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7166 if (outsize <= 0)
7167 goto error;
7168 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007169
Victor Stinner3a50e702011-10-18 21:21:00 +02007170error:
7171 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7172 return -2;
7173 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007174 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007175}
7176
Victor Stinner3a50e702011-10-18 21:21:00 +02007177/*
7178 * Decode a byte string from a code page into unicode object with an error
7179 * handler.
7180 *
7181 * Returns consumed size if succeed, or raise a WindowsError or
7182 * UnicodeDecodeError exception and returns -1 on error.
7183 */
7184static int
7185decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007186 PyObject **v,
7187 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007188 const char *errors)
7189{
7190 const char *startin = in;
7191 const char *endin = in + size;
7192 const DWORD flags = decode_code_page_flags(code_page);
7193 /* Ideally, we should get reason from FormatMessage. This is the Windows
7194 2000 English version of the message. */
7195 const char *reason = "No mapping for the Unicode character exists "
7196 "in the target code page.";
7197 /* each step cannot decode more than 1 character, but a character can be
7198 represented as a surrogate pair */
7199 wchar_t buffer[2], *startout, *out;
7200 int insize, outsize;
7201 PyObject *errorHandler = NULL;
7202 PyObject *exc = NULL;
7203 PyObject *encoding_obj = NULL;
7204 char *encoding;
7205 DWORD err;
7206 int ret = -1;
7207
7208 assert(size > 0);
7209
7210 encoding = code_page_name(code_page, &encoding_obj);
7211 if (encoding == NULL)
7212 return -1;
7213
7214 if (errors == NULL || strcmp(errors, "strict") == 0) {
7215 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7216 UnicodeDecodeError. */
7217 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7218 if (exc != NULL) {
7219 PyCodec_StrictErrors(exc);
7220 Py_CLEAR(exc);
7221 }
7222 goto error;
7223 }
7224
7225 if (*v == NULL) {
7226 /* Create unicode object */
7227 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7228 PyErr_NoMemory();
7229 goto error;
7230 }
Victor Stinnerab595942011-12-17 04:59:06 +01007231 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007232 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007233 if (*v == NULL)
7234 goto error;
7235 startout = PyUnicode_AS_UNICODE(*v);
7236 }
7237 else {
7238 /* Extend unicode object */
7239 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7240 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7241 PyErr_NoMemory();
7242 goto error;
7243 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007244 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007245 goto error;
7246 startout = PyUnicode_AS_UNICODE(*v) + n;
7247 }
7248
7249 /* Decode the byte string character per character */
7250 out = startout;
7251 while (in < endin)
7252 {
7253 /* Decode a character */
7254 insize = 1;
7255 do
7256 {
7257 outsize = MultiByteToWideChar(code_page, flags,
7258 in, insize,
7259 buffer, Py_ARRAY_LENGTH(buffer));
7260 if (outsize > 0)
7261 break;
7262 err = GetLastError();
7263 if (err != ERROR_NO_UNICODE_TRANSLATION
7264 && err != ERROR_INSUFFICIENT_BUFFER)
7265 {
7266 PyErr_SetFromWindowsErr(0);
7267 goto error;
7268 }
7269 insize++;
7270 }
7271 /* 4=maximum length of a UTF-8 sequence */
7272 while (insize <= 4 && (in + insize) <= endin);
7273
7274 if (outsize <= 0) {
7275 Py_ssize_t startinpos, endinpos, outpos;
7276
7277 startinpos = in - startin;
7278 endinpos = startinpos + 1;
7279 outpos = out - PyUnicode_AS_UNICODE(*v);
7280 if (unicode_decode_call_errorhandler(
7281 errors, &errorHandler,
7282 encoding, reason,
7283 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007284 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007285 {
7286 goto error;
7287 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007288 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007289 }
7290 else {
7291 in += insize;
7292 memcpy(out, buffer, outsize * sizeof(wchar_t));
7293 out += outsize;
7294 }
7295 }
7296
7297 /* write a NUL character at the end */
7298 *out = 0;
7299
7300 /* Extend unicode object */
7301 outsize = out - startout;
7302 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007303 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007304 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01007305 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007306
7307error:
7308 Py_XDECREF(encoding_obj);
7309 Py_XDECREF(errorHandler);
7310 Py_XDECREF(exc);
7311 return ret;
7312}
7313
Victor Stinner3a50e702011-10-18 21:21:00 +02007314static PyObject *
7315decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007316 const char *s, Py_ssize_t size,
7317 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007318{
Victor Stinner76a31a62011-11-04 00:05:13 +01007319 PyObject *v = NULL;
7320 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007321
Victor Stinner3a50e702011-10-18 21:21:00 +02007322 if (code_page < 0) {
7323 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7324 return NULL;
7325 }
7326
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007327 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007328 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007329
Victor Stinner76a31a62011-11-04 00:05:13 +01007330 do
7331 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007332#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007333 if (size > INT_MAX) {
7334 chunk_size = INT_MAX;
7335 final = 0;
7336 done = 0;
7337 }
7338 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007339#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007340 {
7341 chunk_size = (int)size;
7342 final = (consumed == NULL);
7343 done = 1;
7344 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007345
Victor Stinner76a31a62011-11-04 00:05:13 +01007346 /* Skip trailing lead-byte unless 'final' is set */
7347 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7348 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007349
Victor Stinner76a31a62011-11-04 00:05:13 +01007350 if (chunk_size == 0 && done) {
7351 if (v != NULL)
7352 break;
7353 Py_INCREF(unicode_empty);
7354 return unicode_empty;
7355 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007356
Victor Stinner76a31a62011-11-04 00:05:13 +01007357
7358 converted = decode_code_page_strict(code_page, &v,
7359 s, chunk_size);
7360 if (converted == -2)
7361 converted = decode_code_page_errors(code_page, &v,
7362 s, chunk_size,
7363 errors);
7364 assert(converted != 0);
7365
7366 if (converted < 0) {
7367 Py_XDECREF(v);
7368 return NULL;
7369 }
7370
7371 if (consumed)
7372 *consumed += converted;
7373
7374 s += converted;
7375 size -= converted;
7376 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007377
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007378 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007379}
7380
Alexander Belopolsky40018472011-02-26 01:02:56 +00007381PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007382PyUnicode_DecodeCodePageStateful(int code_page,
7383 const char *s,
7384 Py_ssize_t size,
7385 const char *errors,
7386 Py_ssize_t *consumed)
7387{
7388 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7389}
7390
7391PyObject *
7392PyUnicode_DecodeMBCSStateful(const char *s,
7393 Py_ssize_t size,
7394 const char *errors,
7395 Py_ssize_t *consumed)
7396{
7397 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7398}
7399
7400PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007401PyUnicode_DecodeMBCS(const char *s,
7402 Py_ssize_t size,
7403 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007404{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007405 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7406}
7407
Victor Stinner3a50e702011-10-18 21:21:00 +02007408static DWORD
7409encode_code_page_flags(UINT code_page, const char *errors)
7410{
7411 if (code_page == CP_UTF8) {
7412 if (winver.dwMajorVersion >= 6)
7413 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7414 and later */
7415 return WC_ERR_INVALID_CHARS;
7416 else
7417 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7418 return 0;
7419 }
7420 else if (code_page == CP_UTF7) {
7421 /* CP_UTF7 only supports flags=0 */
7422 return 0;
7423 }
7424 else {
7425 if (errors != NULL && strcmp(errors, "replace") == 0)
7426 return 0;
7427 else
7428 return WC_NO_BEST_FIT_CHARS;
7429 }
7430}
7431
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007432/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007433 * Encode a Unicode string to a Windows code page into a byte string in strict
7434 * mode.
7435 *
7436 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7437 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007438 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007439static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007440encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007441 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007442 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007443{
Victor Stinner554f3f02010-06-16 23:33:54 +00007444 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007445 BOOL *pusedDefaultChar = &usedDefaultChar;
7446 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007447 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007448 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007449 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007450 const DWORD flags = encode_code_page_flags(code_page, NULL);
7451 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007452 /* Create a substring so that we can get the UTF-16 representation
7453 of just the slice under consideration. */
7454 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007455
Martin v. Löwis3d325192011-11-04 18:23:06 +01007456 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007457
Victor Stinner3a50e702011-10-18 21:21:00 +02007458 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007459 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007460 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007461 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007462
Victor Stinner2fc507f2011-11-04 20:06:39 +01007463 substring = PyUnicode_Substring(unicode, offset, offset+len);
7464 if (substring == NULL)
7465 return -1;
7466 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7467 if (p == NULL) {
7468 Py_DECREF(substring);
7469 return -1;
7470 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007471
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007472 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007473 outsize = WideCharToMultiByte(code_page, flags,
7474 p, size,
7475 NULL, 0,
7476 NULL, pusedDefaultChar);
7477 if (outsize <= 0)
7478 goto error;
7479 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007480 if (pusedDefaultChar && *pusedDefaultChar) {
7481 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007482 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007483 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007484
Victor Stinner3a50e702011-10-18 21:21:00 +02007485 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007486 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007487 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007488 if (*outbytes == NULL) {
7489 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007490 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007491 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007492 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007493 }
7494 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007495 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007496 const Py_ssize_t n = PyBytes_Size(*outbytes);
7497 if (outsize > PY_SSIZE_T_MAX - n) {
7498 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007499 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007500 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007501 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007502 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7503 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007504 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007505 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007506 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007507 }
7508
7509 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007510 outsize = WideCharToMultiByte(code_page, flags,
7511 p, size,
7512 out, outsize,
7513 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007514 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007515 if (outsize <= 0)
7516 goto error;
7517 if (pusedDefaultChar && *pusedDefaultChar)
7518 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007519 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007520
Victor Stinner3a50e702011-10-18 21:21:00 +02007521error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007522 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007523 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7524 return -2;
7525 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007526 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007527}
7528
Victor Stinner3a50e702011-10-18 21:21:00 +02007529/*
7530 * Encode a Unicode string to a Windows code page into a byte string using a
7531 * error handler.
7532 *
7533 * Returns consumed characters if succeed, or raise a WindowsError and returns
7534 * -1 on other error.
7535 */
7536static int
7537encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007538 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007539 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007540{
Victor Stinner3a50e702011-10-18 21:21:00 +02007541 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007542 Py_ssize_t pos = unicode_offset;
7543 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007544 /* Ideally, we should get reason from FormatMessage. This is the Windows
7545 2000 English version of the message. */
7546 const char *reason = "invalid character";
7547 /* 4=maximum length of a UTF-8 sequence */
7548 char buffer[4];
7549 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7550 Py_ssize_t outsize;
7551 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007552 PyObject *errorHandler = NULL;
7553 PyObject *exc = NULL;
7554 PyObject *encoding_obj = NULL;
7555 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007556 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007557 PyObject *rep;
7558 int ret = -1;
7559
7560 assert(insize > 0);
7561
7562 encoding = code_page_name(code_page, &encoding_obj);
7563 if (encoding == NULL)
7564 return -1;
7565
7566 if (errors == NULL || strcmp(errors, "strict") == 0) {
7567 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7568 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007569 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007570 if (exc != NULL) {
7571 PyCodec_StrictErrors(exc);
7572 Py_DECREF(exc);
7573 }
7574 Py_XDECREF(encoding_obj);
7575 return -1;
7576 }
7577
7578 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7579 pusedDefaultChar = &usedDefaultChar;
7580 else
7581 pusedDefaultChar = NULL;
7582
7583 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7584 PyErr_NoMemory();
7585 goto error;
7586 }
7587 outsize = insize * Py_ARRAY_LENGTH(buffer);
7588
7589 if (*outbytes == NULL) {
7590 /* Create string object */
7591 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7592 if (*outbytes == NULL)
7593 goto error;
7594 out = PyBytes_AS_STRING(*outbytes);
7595 }
7596 else {
7597 /* Extend string object */
7598 Py_ssize_t n = PyBytes_Size(*outbytes);
7599 if (n > PY_SSIZE_T_MAX - outsize) {
7600 PyErr_NoMemory();
7601 goto error;
7602 }
7603 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7604 goto error;
7605 out = PyBytes_AS_STRING(*outbytes) + n;
7606 }
7607
7608 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007609 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007610 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007611 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7612 wchar_t chars[2];
7613 int charsize;
7614 if (ch < 0x10000) {
7615 chars[0] = (wchar_t)ch;
7616 charsize = 1;
7617 }
7618 else {
7619 ch -= 0x10000;
7620 chars[0] = 0xd800 + (ch >> 10);
7621 chars[1] = 0xdc00 + (ch & 0x3ff);
7622 charsize = 2;
7623 }
7624
Victor Stinner3a50e702011-10-18 21:21:00 +02007625 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007626 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007627 buffer, Py_ARRAY_LENGTH(buffer),
7628 NULL, pusedDefaultChar);
7629 if (outsize > 0) {
7630 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7631 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007632 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007633 memcpy(out, buffer, outsize);
7634 out += outsize;
7635 continue;
7636 }
7637 }
7638 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7639 PyErr_SetFromWindowsErr(0);
7640 goto error;
7641 }
7642
Victor Stinner3a50e702011-10-18 21:21:00 +02007643 rep = unicode_encode_call_errorhandler(
7644 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007645 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007646 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007647 if (rep == NULL)
7648 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007649 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007650
7651 if (PyBytes_Check(rep)) {
7652 outsize = PyBytes_GET_SIZE(rep);
7653 if (outsize != 1) {
7654 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7655 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7656 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7657 Py_DECREF(rep);
7658 goto error;
7659 }
7660 out = PyBytes_AS_STRING(*outbytes) + offset;
7661 }
7662 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7663 out += outsize;
7664 }
7665 else {
7666 Py_ssize_t i;
7667 enum PyUnicode_Kind kind;
7668 void *data;
7669
Benjamin Petersonbac79492012-01-14 13:34:47 -05007670 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007671 Py_DECREF(rep);
7672 goto error;
7673 }
7674
7675 outsize = PyUnicode_GET_LENGTH(rep);
7676 if (outsize != 1) {
7677 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7678 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7679 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7680 Py_DECREF(rep);
7681 goto error;
7682 }
7683 out = PyBytes_AS_STRING(*outbytes) + offset;
7684 }
7685 kind = PyUnicode_KIND(rep);
7686 data = PyUnicode_DATA(rep);
7687 for (i=0; i < outsize; i++) {
7688 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7689 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007690 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007691 encoding, unicode,
7692 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007693 "unable to encode error handler result to ASCII");
7694 Py_DECREF(rep);
7695 goto error;
7696 }
7697 *out = (unsigned char)ch;
7698 out++;
7699 }
7700 }
7701 Py_DECREF(rep);
7702 }
7703 /* write a NUL byte */
7704 *out = 0;
7705 outsize = out - PyBytes_AS_STRING(*outbytes);
7706 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7707 if (_PyBytes_Resize(outbytes, outsize) < 0)
7708 goto error;
7709 ret = 0;
7710
7711error:
7712 Py_XDECREF(encoding_obj);
7713 Py_XDECREF(errorHandler);
7714 Py_XDECREF(exc);
7715 return ret;
7716}
7717
Victor Stinner3a50e702011-10-18 21:21:00 +02007718static PyObject *
7719encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007720 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007721 const char *errors)
7722{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007723 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007724 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007725 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007726 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007727
Benjamin Petersonbac79492012-01-14 13:34:47 -05007728 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007729 return NULL;
7730 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007731
Victor Stinner3a50e702011-10-18 21:21:00 +02007732 if (code_page < 0) {
7733 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7734 return NULL;
7735 }
7736
Martin v. Löwis3d325192011-11-04 18:23:06 +01007737 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007738 return PyBytes_FromStringAndSize(NULL, 0);
7739
Victor Stinner7581cef2011-11-03 22:32:33 +01007740 offset = 0;
7741 do
7742 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007743#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007744 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007745 chunks. */
7746 if (len > INT_MAX/2) {
7747 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007748 done = 0;
7749 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007750 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007751#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007752 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007753 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007754 done = 1;
7755 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007756
Victor Stinner76a31a62011-11-04 00:05:13 +01007757 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007758 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007759 errors);
7760 if (ret == -2)
7761 ret = encode_code_page_errors(code_page, &outbytes,
7762 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007763 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007764 if (ret < 0) {
7765 Py_XDECREF(outbytes);
7766 return NULL;
7767 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007768
Victor Stinner7581cef2011-11-03 22:32:33 +01007769 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007770 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007771 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007772
Victor Stinner3a50e702011-10-18 21:21:00 +02007773 return outbytes;
7774}
7775
7776PyObject *
7777PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7778 Py_ssize_t size,
7779 const char *errors)
7780{
Victor Stinner7581cef2011-11-03 22:32:33 +01007781 PyObject *unicode, *res;
7782 unicode = PyUnicode_FromUnicode(p, size);
7783 if (unicode == NULL)
7784 return NULL;
7785 res = encode_code_page(CP_ACP, unicode, errors);
7786 Py_DECREF(unicode);
7787 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007788}
7789
7790PyObject *
7791PyUnicode_EncodeCodePage(int code_page,
7792 PyObject *unicode,
7793 const char *errors)
7794{
Victor Stinner7581cef2011-11-03 22:32:33 +01007795 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007796}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007797
Alexander Belopolsky40018472011-02-26 01:02:56 +00007798PyObject *
7799PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007800{
7801 if (!PyUnicode_Check(unicode)) {
7802 PyErr_BadArgument();
7803 return NULL;
7804 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007805 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007806}
7807
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007808#undef NEED_RETRY
7809
Victor Stinner99b95382011-07-04 14:23:54 +02007810#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007811
Guido van Rossumd57fd912000-03-10 22:53:23 +00007812/* --- Character Mapping Codec -------------------------------------------- */
7813
Alexander Belopolsky40018472011-02-26 01:02:56 +00007814PyObject *
7815PyUnicode_DecodeCharmap(const char *s,
7816 Py_ssize_t size,
7817 PyObject *mapping,
7818 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007819{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007820 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007821 Py_ssize_t startinpos;
7822 Py_ssize_t endinpos;
7823 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007824 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007825 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007826 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007827 PyObject *errorHandler = NULL;
7828 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007829
Guido van Rossumd57fd912000-03-10 22:53:23 +00007830 /* Default to Latin-1 */
7831 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007832 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007833
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007834 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007835 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007836 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007837 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007838 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007839 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007840 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007841 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007842 Py_ssize_t maplen;
7843 enum PyUnicode_Kind kind;
7844 void *data;
7845 Py_UCS4 x;
7846
Benjamin Petersonbac79492012-01-14 13:34:47 -05007847 if (PyUnicode_READY(mapping) == -1)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007848 return NULL;
7849
7850 maplen = PyUnicode_GET_LENGTH(mapping);
7851 data = PyUnicode_DATA(mapping);
7852 kind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007853 while (s < e) {
7854 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007855
Benjamin Peterson29060642009-01-31 22:14:21 +00007856 if (ch < maplen)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007857 x = PyUnicode_READ(kind, data, ch);
7858 else
7859 x = 0xfffe; /* invalid value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007860
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007861 if (x == 0xfffe)
7862 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007863 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007864 startinpos = s-starts;
7865 endinpos = startinpos+1;
7866 if (unicode_decode_call_errorhandler(
7867 errors, &errorHandler,
7868 "charmap", "character maps to <undefined>",
7869 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007870 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007871 goto onError;
7872 }
7873 continue;
7874 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007875
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007876 if (unicode_putchar(&v, &outpos, x) < 0)
7877 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007878 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007879 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007880 }
7881 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007882 while (s < e) {
7883 unsigned char ch = *s;
7884 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007885
Benjamin Peterson29060642009-01-31 22:14:21 +00007886 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7887 w = PyLong_FromLong((long)ch);
7888 if (w == NULL)
7889 goto onError;
7890 x = PyObject_GetItem(mapping, w);
7891 Py_DECREF(w);
7892 if (x == NULL) {
7893 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7894 /* No mapping found means: mapping is undefined. */
7895 PyErr_Clear();
7896 x = Py_None;
7897 Py_INCREF(x);
7898 } else
7899 goto onError;
7900 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007901
Benjamin Peterson29060642009-01-31 22:14:21 +00007902 /* Apply mapping */
7903 if (PyLong_Check(x)) {
7904 long value = PyLong_AS_LONG(x);
7905 if (value < 0 || value > 65535) {
7906 PyErr_SetString(PyExc_TypeError,
7907 "character mapping must be in range(65536)");
7908 Py_DECREF(x);
7909 goto onError;
7910 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007911 if (unicode_putchar(&v, &outpos, value) < 0)
7912 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007913 }
7914 else if (x == Py_None) {
7915 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007916 startinpos = s-starts;
7917 endinpos = startinpos+1;
7918 if (unicode_decode_call_errorhandler(
7919 errors, &errorHandler,
7920 "charmap", "character maps to <undefined>",
7921 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007922 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007923 Py_DECREF(x);
7924 goto onError;
7925 }
7926 Py_DECREF(x);
7927 continue;
7928 }
7929 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007930 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007931
Benjamin Petersonbac79492012-01-14 13:34:47 -05007932 if (PyUnicode_READY(x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007933 goto onError;
7934 targetsize = PyUnicode_GET_LENGTH(x);
7935
7936 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007937 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007938 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007939 PyUnicode_READ_CHAR(x, 0)) < 0)
7940 goto onError;
7941 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007942 else if (targetsize > 1) {
7943 /* 1-n mapping */
7944 if (targetsize > extrachars) {
7945 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007946 Py_ssize_t needed = (targetsize - extrachars) + \
7947 (targetsize << 2);
7948 extrachars += needed;
7949 /* XXX overflow detection missing */
Victor Stinner16e6a802011-12-12 13:24:15 +01007950 if (unicode_resize(&v,
7951 PyUnicode_GET_LENGTH(v) + needed) < 0)
7952 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007953 Py_DECREF(x);
7954 goto onError;
7955 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007956 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007957 if (unicode_widen(&v, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
7958 goto onError;
7959 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7960 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007961 extrachars -= targetsize;
7962 }
7963 /* 1-0 mapping: skip the character */
7964 }
7965 else {
7966 /* wrong return value */
7967 PyErr_SetString(PyExc_TypeError,
7968 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007969 Py_DECREF(x);
7970 goto onError;
7971 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007972 Py_DECREF(x);
7973 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007974 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007975 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007976 if (unicode_resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007977 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007978 Py_XDECREF(errorHandler);
7979 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007980 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00007981
Benjamin Peterson29060642009-01-31 22:14:21 +00007982 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007983 Py_XDECREF(errorHandler);
7984 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007985 Py_XDECREF(v);
7986 return NULL;
7987}
7988
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007989/* Charmap encoding: the lookup table */
7990
/* Compact three-level lookup table mapping a BMP code point to a byte.

   level1 is indexed by the top five bits of the code point (c >> 11) and
   selects a 16-entry level-2 block; 0xFF marks an unused slot.  level23
   is allocated larger than declared: it holds count2 level-2 blocks of 16
   entries each, followed by count3 level-3 blocks of 128 entries each.
   A level-2 entry of 0xFF or a level-3 entry of 0 means "unmapped". */
struct encoding_map {
    PyObject_HEAD
    unsigned char level1[32];
    int count2, count3;         /* number of level-2 / level-3 blocks */
    unsigned char level23[1];   /* really 16*count2 + 128*count3 bytes */
};
7997
7998static PyObject*
7999encoding_map_size(PyObject *obj, PyObject* args)
8000{
8001 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008002 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008003 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008004}
8005
/* Method table of the EncodingMap type: a single no-argument size()
   method, terminated by a zeroed sentinel entry. */
static PyMethodDef encoding_map_methods[] = {
    {"size", encoding_map_size, METH_NOARGS,
     PyDoc_STR("Return the size (in bytes) of this object") },
    { 0 }
};
8011
/* tp_dealloc for EncodingMap.  The struct holds only byte tables and
   counters (no object references), so releasing the memory is enough. */
static void
encoding_map_dealloc(PyObject* o)
{
    PyObject_FREE(o);
}
8017
/* Type object for encoding maps.  Only tp_dealloc and tp_methods are
   filled in; tp_new is 0, and instances are built in
   PyUnicode_BuildEncodingMap() with PyObject_MALLOC + PyObject_Init.
   tp_itemsize is 0 even though instances carry trailing table data. */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_reserved*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
8061
8062PyObject*
8063PyUnicode_BuildEncodingMap(PyObject* string)
8064{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008065 PyObject *result;
8066 struct encoding_map *mresult;
8067 int i;
8068 int need_dict = 0;
8069 unsigned char level1[32];
8070 unsigned char level2[512];
8071 unsigned char *mlevel1, *mlevel2, *mlevel3;
8072 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008073 int kind;
8074 void *data;
8075 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008076
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008077 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008078 PyErr_BadArgument();
8079 return NULL;
8080 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008081 kind = PyUnicode_KIND(string);
8082 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008083 memset(level1, 0xFF, sizeof level1);
8084 memset(level2, 0xFF, sizeof level2);
8085
8086 /* If there isn't a one-to-one mapping of NULL to \0,
8087 or if there are non-BMP characters, we need to use
8088 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008089 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008090 need_dict = 1;
8091 for (i = 1; i < 256; i++) {
8092 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008093 ch = PyUnicode_READ(kind, data, i);
8094 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008095 need_dict = 1;
8096 break;
8097 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008098 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008099 /* unmapped character */
8100 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008101 l1 = ch >> 11;
8102 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008103 if (level1[l1] == 0xFF)
8104 level1[l1] = count2++;
8105 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008106 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008107 }
8108
8109 if (count2 >= 0xFF || count3 >= 0xFF)
8110 need_dict = 1;
8111
8112 if (need_dict) {
8113 PyObject *result = PyDict_New();
8114 PyObject *key, *value;
8115 if (!result)
8116 return NULL;
8117 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008118 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008119 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008120 if (!key || !value)
8121 goto failed1;
8122 if (PyDict_SetItem(result, key, value) == -1)
8123 goto failed1;
8124 Py_DECREF(key);
8125 Py_DECREF(value);
8126 }
8127 return result;
8128 failed1:
8129 Py_XDECREF(key);
8130 Py_XDECREF(value);
8131 Py_DECREF(result);
8132 return NULL;
8133 }
8134
8135 /* Create a three-level trie */
8136 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8137 16*count2 + 128*count3 - 1);
8138 if (!result)
8139 return PyErr_NoMemory();
8140 PyObject_Init(result, &EncodingMapType);
8141 mresult = (struct encoding_map*)result;
8142 mresult->count2 = count2;
8143 mresult->count3 = count3;
8144 mlevel1 = mresult->level1;
8145 mlevel2 = mresult->level23;
8146 mlevel3 = mresult->level23 + 16*count2;
8147 memcpy(mlevel1, level1, 32);
8148 memset(mlevel2, 0xFF, 16*count2);
8149 memset(mlevel3, 0, 128*count3);
8150 count3 = 0;
8151 for (i = 1; i < 256; i++) {
8152 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008153 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008154 /* unmapped character */
8155 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008156 o1 = PyUnicode_READ(kind, data, i)>>11;
8157 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008158 i2 = 16*mlevel1[o1] + o2;
8159 if (mlevel2[i2] == 0xFF)
8160 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008161 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008162 i3 = 128*mlevel2[i2] + o3;
8163 mlevel3[i3] = i;
8164 }
8165 return result;
8166}
8167
8168static int
Victor Stinner22168992011-11-20 17:09:18 +01008169encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008170{
8171 struct encoding_map *map = (struct encoding_map*)mapping;
8172 int l1 = c>>11;
8173 int l2 = (c>>7) & 0xF;
8174 int l3 = c & 0x7F;
8175 int i;
8176
Victor Stinner22168992011-11-20 17:09:18 +01008177 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008178 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008179 if (c == 0)
8180 return 0;
8181 /* level 1*/
8182 i = map->level1[l1];
8183 if (i == 0xFF) {
8184 return -1;
8185 }
8186 /* level 2*/
8187 i = map->level23[16*i+l2];
8188 if (i == 0xFF) {
8189 return -1;
8190 }
8191 /* level 3 */
8192 i = map->level23[16*map->count2 + 128*i + l3];
8193 if (i == 0) {
8194 return -1;
8195 }
8196 return i;
8197}
8198
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008199/* Lookup the character ch in the mapping. If the character
8200 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008201 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008202static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008203charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008204{
Christian Heimes217cfd12007-12-02 14:31:20 +00008205 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008206 PyObject *x;
8207
8208 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008209 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008210 x = PyObject_GetItem(mapping, w);
8211 Py_DECREF(w);
8212 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008213 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8214 /* No mapping found means: mapping is undefined. */
8215 PyErr_Clear();
8216 x = Py_None;
8217 Py_INCREF(x);
8218 return x;
8219 } else
8220 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008221 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008222 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008223 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008224 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008225 long value = PyLong_AS_LONG(x);
8226 if (value < 0 || value > 255) {
8227 PyErr_SetString(PyExc_TypeError,
8228 "character mapping must be in range(256)");
8229 Py_DECREF(x);
8230 return NULL;
8231 }
8232 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008233 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008234 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008235 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008236 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008237 /* wrong return value */
8238 PyErr_Format(PyExc_TypeError,
8239 "character mapping must return integer, bytes or None, not %.400s",
8240 x->ob_type->tp_name);
8241 Py_DECREF(x);
8242 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008243 }
8244}
8245
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008246static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008247charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008248{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008249 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8250 /* exponentially overallocate to minimize reallocations */
8251 if (requiredsize < 2*outsize)
8252 requiredsize = 2*outsize;
8253 if (_PyBytes_Resize(outobj, requiredsize))
8254 return -1;
8255 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008256}
8257
/* Outcome of charmapencode_output():
   enc_SUCCESS   - the character was written to the output,
   enc_FAILED    - the character has no mapping (nothing written),
   enc_EXCEPTION - an error occurred while looking up or resizing. */
typedef enum charmapencode_result {
    enc_SUCCESS, enc_FAILED, enc_EXCEPTION
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008261/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008262 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008263 space is available. Return a new reference to the object that
8264 was put in the output buffer, or Py_None, if the mapping was undefined
8265 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008266 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008267static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008268charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008269 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008270{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008271 PyObject *rep;
8272 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008273 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008274
Christian Heimes90aa7642007-12-19 02:45:37 +00008275 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008276 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008277 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008278 if (res == -1)
8279 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008280 if (outsize<requiredsize)
8281 if (charmapencode_resize(outobj, outpos, requiredsize))
8282 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008283 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008284 outstart[(*outpos)++] = (char)res;
8285 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008286 }
8287
8288 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008289 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008290 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008291 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008292 Py_DECREF(rep);
8293 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008294 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008295 if (PyLong_Check(rep)) {
8296 Py_ssize_t requiredsize = *outpos+1;
8297 if (outsize<requiredsize)
8298 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8299 Py_DECREF(rep);
8300 return enc_EXCEPTION;
8301 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008302 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008303 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008304 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008305 else {
8306 const char *repchars = PyBytes_AS_STRING(rep);
8307 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8308 Py_ssize_t requiredsize = *outpos+repsize;
8309 if (outsize<requiredsize)
8310 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8311 Py_DECREF(rep);
8312 return enc_EXCEPTION;
8313 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008314 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008315 memcpy(outstart + *outpos, repchars, repsize);
8316 *outpos += repsize;
8317 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008318 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008319 Py_DECREF(rep);
8320 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008321}
8322
8323/* handle an error in PyUnicode_EncodeCharmap
8324 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008325static int
8326charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008327 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008328 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008329 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008330 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008331{
8332 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008333 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008334 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008335 enum PyUnicode_Kind kind;
8336 void *data;
8337 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008338 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008339 Py_ssize_t collstartpos = *inpos;
8340 Py_ssize_t collendpos = *inpos+1;
8341 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008342 char *encoding = "charmap";
8343 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008344 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008345 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008346 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008347
Benjamin Petersonbac79492012-01-14 13:34:47 -05008348 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008349 return -1;
8350 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008351 /* find all unencodable characters */
8352 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008353 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008354 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008355 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008356 val = encoding_map_lookup(ch, mapping);
8357 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008358 break;
8359 ++collendpos;
8360 continue;
8361 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008362
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008363 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8364 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008365 if (rep==NULL)
8366 return -1;
8367 else if (rep!=Py_None) {
8368 Py_DECREF(rep);
8369 break;
8370 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008371 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008372 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008373 }
8374 /* cache callback name lookup
8375 * (if not done yet, i.e. it's the first error) */
8376 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008377 if ((errors==NULL) || (!strcmp(errors, "strict")))
8378 *known_errorHandler = 1;
8379 else if (!strcmp(errors, "replace"))
8380 *known_errorHandler = 2;
8381 else if (!strcmp(errors, "ignore"))
8382 *known_errorHandler = 3;
8383 else if (!strcmp(errors, "xmlcharrefreplace"))
8384 *known_errorHandler = 4;
8385 else
8386 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008387 }
8388 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008389 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008390 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008391 return -1;
8392 case 2: /* replace */
8393 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008394 x = charmapencode_output('?', mapping, res, respos);
8395 if (x==enc_EXCEPTION) {
8396 return -1;
8397 }
8398 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008399 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008400 return -1;
8401 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008402 }
8403 /* fall through */
8404 case 3: /* ignore */
8405 *inpos = collendpos;
8406 break;
8407 case 4: /* xmlcharrefreplace */
8408 /* generate replacement (temporarily (mis)uses p) */
8409 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008410 char buffer[2+29+1+1];
8411 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008412 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008413 for (cp = buffer; *cp; ++cp) {
8414 x = charmapencode_output(*cp, mapping, res, respos);
8415 if (x==enc_EXCEPTION)
8416 return -1;
8417 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008418 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008419 return -1;
8420 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008421 }
8422 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008423 *inpos = collendpos;
8424 break;
8425 default:
8426 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008427 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008428 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008429 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008430 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008431 if (PyBytes_Check(repunicode)) {
8432 /* Directly copy bytes result to output. */
8433 Py_ssize_t outsize = PyBytes_Size(*res);
8434 Py_ssize_t requiredsize;
8435 repsize = PyBytes_Size(repunicode);
8436 requiredsize = *respos + repsize;
8437 if (requiredsize > outsize)
8438 /* Make room for all additional bytes. */
8439 if (charmapencode_resize(res, respos, requiredsize)) {
8440 Py_DECREF(repunicode);
8441 return -1;
8442 }
8443 memcpy(PyBytes_AsString(*res) + *respos,
8444 PyBytes_AsString(repunicode), repsize);
8445 *respos += repsize;
8446 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008447 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008448 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008449 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008450 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008451 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008452 Py_DECREF(repunicode);
8453 return -1;
8454 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008455 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008456 data = PyUnicode_DATA(repunicode);
8457 kind = PyUnicode_KIND(repunicode);
8458 for (index = 0; index < repsize; index++) {
8459 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8460 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008461 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008462 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008463 return -1;
8464 }
8465 else if (x==enc_FAILED) {
8466 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008467 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008468 return -1;
8469 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008470 }
8471 *inpos = newpos;
8472 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008473 }
8474 return 0;
8475}
8476
Alexander Belopolsky40018472011-02-26 01:02:56 +00008477PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008478_PyUnicode_EncodeCharmap(PyObject *unicode,
8479 PyObject *mapping,
8480 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008481{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008482 /* output object */
8483 PyObject *res = NULL;
8484 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008485 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008486 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008487 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008488 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008489 PyObject *errorHandler = NULL;
8490 PyObject *exc = NULL;
8491 /* the following variable is used for caching string comparisons
8492 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8493 * 3=ignore, 4=xmlcharrefreplace */
8494 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008495
Benjamin Petersonbac79492012-01-14 13:34:47 -05008496 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008497 return NULL;
8498 size = PyUnicode_GET_LENGTH(unicode);
8499
Guido van Rossumd57fd912000-03-10 22:53:23 +00008500 /* Default to Latin-1 */
8501 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008502 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008503
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008504 /* allocate enough for a simple encoding without
8505 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008506 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008507 if (res == NULL)
8508 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008509 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008510 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008511
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008512 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008513 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008514 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008515 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008516 if (x==enc_EXCEPTION) /* error */
8517 goto onError;
8518 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008519 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008520 &exc,
8521 &known_errorHandler, &errorHandler, errors,
8522 &res, &respos)) {
8523 goto onError;
8524 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008525 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008526 else
8527 /* done with this character => adjust input position */
8528 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008529 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008530
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008531 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008532 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008533 if (_PyBytes_Resize(&res, respos) < 0)
8534 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008535
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008536 Py_XDECREF(exc);
8537 Py_XDECREF(errorHandler);
8538 return res;
8539
Benjamin Peterson29060642009-01-31 22:14:21 +00008540 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008541 Py_XDECREF(res);
8542 Py_XDECREF(exc);
8543 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008544 return NULL;
8545}
8546
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008547/* Deprecated */
8548PyObject *
8549PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8550 Py_ssize_t size,
8551 PyObject *mapping,
8552 const char *errors)
8553{
8554 PyObject *result;
8555 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8556 if (unicode == NULL)
8557 return NULL;
8558 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8559 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008560 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008561}
8562
Alexander Belopolsky40018472011-02-26 01:02:56 +00008563PyObject *
8564PyUnicode_AsCharmapString(PyObject *unicode,
8565 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008566{
8567 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008568 PyErr_BadArgument();
8569 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008570 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008571 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008572}
8573
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008574/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008575static void
8576make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008577 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008578 Py_ssize_t startpos, Py_ssize_t endpos,
8579 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008580{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008581 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008582 *exceptionObject = _PyUnicodeTranslateError_Create(
8583 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008584 }
8585 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008586 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8587 goto onError;
8588 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8589 goto onError;
8590 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8591 goto onError;
8592 return;
8593 onError:
8594 Py_DECREF(*exceptionObject);
8595 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008596 }
8597}
8598
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008599/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008600static void
8601raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008602 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008603 Py_ssize_t startpos, Py_ssize_t endpos,
8604 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008605{
8606 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008607 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008608 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008609 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008610}
8611
8612/* error handling callback helper:
8613 build arguments, call the callback and check the arguments,
8614 put the result into newpos and return the replacement string, which
8615 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008616static PyObject *
8617unicode_translate_call_errorhandler(const char *errors,
8618 PyObject **errorHandler,
8619 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008620 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008621 Py_ssize_t startpos, Py_ssize_t endpos,
8622 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008623{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008624 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008625
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008626 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008627 PyObject *restuple;
8628 PyObject *resunicode;
8629
8630 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008631 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008632 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008633 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008634 }
8635
8636 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008637 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008638 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008639 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008640
8641 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008642 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008643 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008644 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008645 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008646 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008647 Py_DECREF(restuple);
8648 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008649 }
8650 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008651 &resunicode, &i_newpos)) {
8652 Py_DECREF(restuple);
8653 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008654 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008655 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008656 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008657 else
8658 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008659 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008660 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8661 Py_DECREF(restuple);
8662 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008663 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008664 Py_INCREF(resunicode);
8665 Py_DECREF(restuple);
8666 return resunicode;
8667}
8668
8669/* Lookup the character ch in the mapping and put the result in result,
8670 which must be decrefed by the caller.
8671 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008672static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008673charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008674{
Christian Heimes217cfd12007-12-02 14:31:20 +00008675 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008676 PyObject *x;
8677
8678 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008679 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008680 x = PyObject_GetItem(mapping, w);
8681 Py_DECREF(w);
8682 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008683 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8684 /* No mapping found means: use 1:1 mapping. */
8685 PyErr_Clear();
8686 *result = NULL;
8687 return 0;
8688 } else
8689 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008690 }
8691 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008692 *result = x;
8693 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008694 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008695 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008696 long value = PyLong_AS_LONG(x);
8697 long max = PyUnicode_GetMax();
8698 if (value < 0 || value > max) {
8699 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008700 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008701 Py_DECREF(x);
8702 return -1;
8703 }
8704 *result = x;
8705 return 0;
8706 }
8707 else if (PyUnicode_Check(x)) {
8708 *result = x;
8709 return 0;
8710 }
8711 else {
8712 /* wrong return value */
8713 PyErr_SetString(PyExc_TypeError,
8714 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008715 Py_DECREF(x);
8716 return -1;
8717 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008718}
8719/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008720 if not reallocate and adjust various state variables.
8721 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008722static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008723charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008724 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008725{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008726 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008727 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008728 /* exponentially overallocate to minimize reallocations */
8729 if (requiredsize < 2 * oldsize)
8730 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008731 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8732 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008733 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008734 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008735 }
8736 return 0;
8737}
8738/* lookup the character, put the result in the output string and adjust
8739 various state variables. Return a new reference to the object that
8740 was put in the output buffer in *result, or Py_None, if the mapping was
8741 undefined (in which case no character was written).
8742 The called must decref result.
8743 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008744static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008745charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8746 PyObject *mapping, Py_UCS4 **output,
8747 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008748 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008749{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008750 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8751 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008752 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008753 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008754 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008755 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008756 }
8757 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008758 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008759 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008760 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008761 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008762 }
8763 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008764 Py_ssize_t repsize;
8765 if (PyUnicode_READY(*res) == -1)
8766 return -1;
8767 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008768 if (repsize==1) {
8769 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008770 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008771 }
8772 else if (repsize!=0) {
8773 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008774 Py_ssize_t requiredsize = *opos +
8775 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008776 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008777 Py_ssize_t i;
8778 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008779 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008780 for(i = 0; i < repsize; i++)
8781 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008782 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008783 }
8784 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008785 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008786 return 0;
8787}
8788
Alexander Belopolsky40018472011-02-26 01:02:56 +00008789PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008790_PyUnicode_TranslateCharmap(PyObject *input,
8791 PyObject *mapping,
8792 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008793{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008794 /* input object */
8795 char *idata;
8796 Py_ssize_t size, i;
8797 int kind;
8798 /* output buffer */
8799 Py_UCS4 *output = NULL;
8800 Py_ssize_t osize;
8801 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008802 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008803 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008804 char *reason = "character maps to <undefined>";
8805 PyObject *errorHandler = NULL;
8806 PyObject *exc = NULL;
8807 /* the following variable is used for caching string comparisons
8808 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8809 * 3=ignore, 4=xmlcharrefreplace */
8810 int known_errorHandler = -1;
8811
Guido van Rossumd57fd912000-03-10 22:53:23 +00008812 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008813 PyErr_BadArgument();
8814 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008815 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008816
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008817 if (PyUnicode_READY(input) == -1)
8818 return NULL;
8819 idata = (char*)PyUnicode_DATA(input);
8820 kind = PyUnicode_KIND(input);
8821 size = PyUnicode_GET_LENGTH(input);
8822 i = 0;
8823
8824 if (size == 0) {
8825 Py_INCREF(input);
8826 return input;
8827 }
8828
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008829 /* allocate enough for a simple 1:1 translation without
8830 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008831 osize = size;
8832 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8833 opos = 0;
8834 if (output == NULL) {
8835 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008836 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008837 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008838
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008839 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008840 /* try to encode it */
8841 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008842 if (charmaptranslate_output(input, i, mapping,
8843 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008844 Py_XDECREF(x);
8845 goto onError;
8846 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008847 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008848 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008849 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008850 else { /* untranslatable character */
8851 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8852 Py_ssize_t repsize;
8853 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008854 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008855 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008856 Py_ssize_t collstart = i;
8857 Py_ssize_t collend = i+1;
8858 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008859
Benjamin Peterson29060642009-01-31 22:14:21 +00008860 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008861 while (collend < size) {
8862 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008863 goto onError;
8864 Py_XDECREF(x);
8865 if (x!=Py_None)
8866 break;
8867 ++collend;
8868 }
8869 /* cache callback name lookup
8870 * (if not done yet, i.e. it's the first error) */
8871 if (known_errorHandler==-1) {
8872 if ((errors==NULL) || (!strcmp(errors, "strict")))
8873 known_errorHandler = 1;
8874 else if (!strcmp(errors, "replace"))
8875 known_errorHandler = 2;
8876 else if (!strcmp(errors, "ignore"))
8877 known_errorHandler = 3;
8878 else if (!strcmp(errors, "xmlcharrefreplace"))
8879 known_errorHandler = 4;
8880 else
8881 known_errorHandler = 0;
8882 }
8883 switch (known_errorHandler) {
8884 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008885 raise_translate_exception(&exc, input, collstart,
8886 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008887 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008888 case 2: /* replace */
8889 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008890 for (coll = collstart; coll<collend; coll++)
8891 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008892 /* fall through */
8893 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008894 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008895 break;
8896 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008897 /* generate replacement (temporarily (mis)uses i) */
8898 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008899 char buffer[2+29+1+1];
8900 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008901 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8902 if (charmaptranslate_makespace(&output, &osize,
8903 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008904 goto onError;
8905 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008906 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008907 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008908 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008909 break;
8910 default:
8911 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008912 reason, input, &exc,
8913 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008914 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008915 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008916 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008917 Py_DECREF(repunicode);
8918 goto onError;
8919 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008920 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008921 repsize = PyUnicode_GET_LENGTH(repunicode);
8922 if (charmaptranslate_makespace(&output, &osize,
8923 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008924 Py_DECREF(repunicode);
8925 goto onError;
8926 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008927 for (uni2 = 0; repsize-->0; ++uni2)
8928 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8929 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008930 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008931 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008932 }
8933 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008934 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8935 if (!res)
8936 goto onError;
8937 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008938 Py_XDECREF(exc);
8939 Py_XDECREF(errorHandler);
8940 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008941
Benjamin Peterson29060642009-01-31 22:14:21 +00008942 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008943 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008944 Py_XDECREF(exc);
8945 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008946 return NULL;
8947}
8948
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008949/* Deprecated. Use PyUnicode_Translate instead. */
8950PyObject *
8951PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8952 Py_ssize_t size,
8953 PyObject *mapping,
8954 const char *errors)
8955{
8956 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8957 if (!unicode)
8958 return NULL;
8959 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8960}
8961
Alexander Belopolsky40018472011-02-26 01:02:56 +00008962PyObject *
8963PyUnicode_Translate(PyObject *str,
8964 PyObject *mapping,
8965 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008966{
8967 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008968
Guido van Rossumd57fd912000-03-10 22:53:23 +00008969 str = PyUnicode_FromObject(str);
8970 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008971 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008972 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008973 Py_DECREF(str);
8974 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008975
Benjamin Peterson29060642009-01-31 22:14:21 +00008976 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008977 Py_XDECREF(str);
8978 return NULL;
8979}
Tim Petersced69f82003-09-16 20:30:58 +00008980
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008981static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008982fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008983{
8984 /* No need to call PyUnicode_READY(self) because this function is only
8985 called as a callback from fixup() which does it already. */
8986 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8987 const int kind = PyUnicode_KIND(self);
8988 void *data = PyUnicode_DATA(self);
8989 Py_UCS4 maxchar = 0, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008990 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008991 Py_ssize_t i;
8992
8993 for (i = 0; i < len; ++i) {
8994 ch = PyUnicode_READ(kind, data, i);
8995 fixed = 0;
8996 if (ch > 127) {
8997 if (Py_UNICODE_ISSPACE(ch))
8998 fixed = ' ';
8999 else {
9000 const int decimal = Py_UNICODE_TODECIMAL(ch);
9001 if (decimal >= 0)
9002 fixed = '0' + decimal;
9003 }
9004 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009005 modified = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009006 if (fixed > maxchar)
9007 maxchar = fixed;
9008 PyUnicode_WRITE(kind, data, i, fixed);
9009 }
9010 else if (ch > maxchar)
9011 maxchar = ch;
9012 }
9013 else if (ch > maxchar)
9014 maxchar = ch;
9015 }
9016
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009017 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009018}
9019
9020PyObject *
9021_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9022{
9023 if (!PyUnicode_Check(unicode)) {
9024 PyErr_BadInternalCall();
9025 return NULL;
9026 }
9027 if (PyUnicode_READY(unicode) == -1)
9028 return NULL;
9029 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
9030 /* If the string is already ASCII, just return the same string */
9031 Py_INCREF(unicode);
9032 return unicode;
9033 }
Victor Stinner9310abb2011-10-05 00:59:23 +02009034 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009035}
9036
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009037PyObject *
9038PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9039 Py_ssize_t length)
9040{
Victor Stinnerf0124502011-11-21 23:12:56 +01009041 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009042 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009043 Py_UCS4 maxchar;
9044 enum PyUnicode_Kind kind;
9045 void *data;
9046
Victor Stinner99d7ad02012-02-22 13:37:39 +01009047 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009048 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01009049 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009050 if (ch > 127) {
9051 int decimal = Py_UNICODE_TODECIMAL(ch);
9052 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009053 ch = '0' + decimal;
Victor Stinner99d7ad02012-02-22 13:37:39 +01009054 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009055 }
9056 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009057
9058 /* Copy to a new string */
9059 decimal = PyUnicode_New(length, maxchar);
9060 if (decimal == NULL)
9061 return decimal;
9062 kind = PyUnicode_KIND(decimal);
9063 data = PyUnicode_DATA(decimal);
9064 /* Iterate over code points */
9065 for (i = 0; i < length; i++) {
9066 Py_UNICODE ch = s[i];
9067 if (ch > 127) {
9068 int decimal = Py_UNICODE_TODECIMAL(ch);
9069 if (decimal >= 0)
9070 ch = '0' + decimal;
9071 }
9072 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009073 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009074 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009075}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009076/* --- Decimal Encoder ---------------------------------------------------- */
9077
Alexander Belopolsky40018472011-02-26 01:02:56 +00009078int
9079PyUnicode_EncodeDecimal(Py_UNICODE *s,
9080 Py_ssize_t length,
9081 char *output,
9082 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009083{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009084 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009085 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009086 enum PyUnicode_Kind kind;
9087 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009088
9089 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009090 PyErr_BadArgument();
9091 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009092 }
9093
Victor Stinner42bf7752011-11-21 22:52:58 +01009094 unicode = PyUnicode_FromUnicode(s, length);
9095 if (unicode == NULL)
9096 return -1;
9097
Benjamin Petersonbac79492012-01-14 13:34:47 -05009098 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01009099 Py_DECREF(unicode);
9100 return -1;
9101 }
Victor Stinner42bf7752011-11-21 22:52:58 +01009102 kind = PyUnicode_KIND(unicode);
9103 data = PyUnicode_DATA(unicode);
9104
Victor Stinnerb84d7232011-11-22 01:50:07 +01009105 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009106 PyObject *exc;
9107 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009108 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009109 Py_ssize_t startpos;
9110
9111 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009112
Benjamin Peterson29060642009-01-31 22:14:21 +00009113 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009114 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009115 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009116 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009117 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009118 decimal = Py_UNICODE_TODECIMAL(ch);
9119 if (decimal >= 0) {
9120 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009121 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009122 continue;
9123 }
9124 if (0 < ch && ch < 256) {
9125 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009126 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009127 continue;
9128 }
Victor Stinner6345be92011-11-25 20:09:01 +01009129
Victor Stinner42bf7752011-11-21 22:52:58 +01009130 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009131 exc = NULL;
9132 raise_encode_exception(&exc, "decimal", unicode,
9133 startpos, startpos+1,
9134 "invalid decimal Unicode string");
9135 Py_XDECREF(exc);
9136 Py_DECREF(unicode);
9137 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009138 }
9139 /* 0-terminate the output string */
9140 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009141 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009142 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009143}
9144
Guido van Rossumd57fd912000-03-10 22:53:23 +00009145/* --- Helpers ------------------------------------------------------------ */
9146
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009147static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02009148any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009149 Py_ssize_t start,
9150 Py_ssize_t end)
9151{
9152 int kind1, kind2, kind;
9153 void *buf1, *buf2;
9154 Py_ssize_t len1, len2, result;
9155
9156 kind1 = PyUnicode_KIND(s1);
9157 kind2 = PyUnicode_KIND(s2);
9158 kind = kind1 > kind2 ? kind1 : kind2;
9159 buf1 = PyUnicode_DATA(s1);
9160 buf2 = PyUnicode_DATA(s2);
9161 if (kind1 != kind)
9162 buf1 = _PyUnicode_AsKind(s1, kind);
9163 if (!buf1)
9164 return -2;
9165 if (kind2 != kind)
9166 buf2 = _PyUnicode_AsKind(s2, kind);
9167 if (!buf2) {
9168 if (kind1 != kind) PyMem_Free(buf1);
9169 return -2;
9170 }
9171 len1 = PyUnicode_GET_LENGTH(s1);
9172 len2 = PyUnicode_GET_LENGTH(s2);
9173
Victor Stinner794d5672011-10-10 03:21:36 +02009174 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06009175 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02009176 case PyUnicode_1BYTE_KIND:
9177 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9178 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9179 else
9180 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9181 break;
9182 case PyUnicode_2BYTE_KIND:
9183 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9184 break;
9185 case PyUnicode_4BYTE_KIND:
9186 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9187 break;
9188 default:
9189 assert(0); result = -2;
9190 }
9191 }
9192 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06009193 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02009194 case PyUnicode_1BYTE_KIND:
9195 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9196 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9197 else
9198 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9199 break;
9200 case PyUnicode_2BYTE_KIND:
9201 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9202 break;
9203 case PyUnicode_4BYTE_KIND:
9204 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9205 break;
9206 default:
9207 assert(0); result = -2;
9208 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009209 }
9210
9211 if (kind1 != kind)
9212 PyMem_Free(buf1);
9213 if (kind2 != kind)
9214 PyMem_Free(buf2);
9215
9216 return result;
9217}
9218
9219Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009220_PyUnicode_InsertThousandsGrouping(
9221 PyObject *unicode, Py_ssize_t index,
9222 Py_ssize_t n_buffer,
9223 void *digits, Py_ssize_t n_digits,
9224 Py_ssize_t min_width,
9225 const char *grouping, PyObject *thousands_sep,
9226 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009227{
Victor Stinner41a863c2012-02-24 00:37:51 +01009228 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009229 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009230 Py_ssize_t thousands_sep_len;
9231 Py_ssize_t len;
9232
9233 if (unicode != NULL) {
9234 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009235 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009236 }
9237 else {
9238 kind = PyUnicode_1BYTE_KIND;
9239 data = NULL;
9240 }
9241 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9242 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9243 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9244 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009245 if (thousands_sep_kind < kind) {
9246 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9247 if (!thousands_sep_data)
9248 return -1;
9249 }
9250 else {
9251 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9252 if (!data)
9253 return -1;
9254 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009255 }
9256
Benjamin Petersonead6b532011-12-20 17:23:42 -06009257 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009258 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009259 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009260 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009261 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009262 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009263 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009264 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009265 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009266 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009267 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009268 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009269 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009270 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009271 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009272 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009273 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009274 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009275 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009276 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009277 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009278 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009279 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009280 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009281 break;
9282 default:
9283 assert(0);
9284 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009285 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009286 if (unicode != NULL && thousands_sep_kind != kind) {
9287 if (thousands_sep_kind < kind)
9288 PyMem_Free(thousands_sep_data);
9289 else
9290 PyMem_Free(data);
9291 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009292 if (unicode == NULL) {
9293 *maxchar = 127;
9294 if (len != n_digits) {
9295 *maxchar = Py_MAX(*maxchar,
9296 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
9297 }
9298 }
9299 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009300}
9301
9302
/* Normalize a (start, end) slice pair against a sequence of length len.

   - end greater than len is clamped to len;
   - a negative end or start is taken relative to len and, if still
     negative, clamped to 0;
   - start is never clamped from above.

   start and end must be modifiable lvalues.  Every argument is evaluated
   more than once, and the expansion is a bare statement sequence (not
   wrapped in do/while), so use it only as a stand-alone statement. */
#define ADJUST_INDICES(start, end, len)         \
    if (end > len) {                            \
        end = len;                              \
    }                                           \
    else if (end < 0) {                         \
        end += len;                             \
        if (end < 0) {                          \
            end = 0;                            \
        }                                       \
    }                                           \
    if (start < 0) {                            \
        start += len;                           \
        if (start < 0) {                        \
            start = 0;                          \
        }                                       \
    }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009317
Alexander Belopolsky40018472011-02-26 01:02:56 +00009318Py_ssize_t
9319PyUnicode_Count(PyObject *str,
9320 PyObject *substr,
9321 Py_ssize_t start,
9322 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009323{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009324 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009325 PyObject* str_obj;
9326 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009327 int kind1, kind2, kind;
9328 void *buf1 = NULL, *buf2 = NULL;
9329 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009330
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009331 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009332 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00009333 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009334 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009335 if (!sub_obj) {
9336 Py_DECREF(str_obj);
9337 return -1;
9338 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06009339 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06009340 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00009341 Py_DECREF(str_obj);
9342 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009343 }
Tim Petersced69f82003-09-16 20:30:58 +00009344
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009345 kind1 = PyUnicode_KIND(str_obj);
9346 kind2 = PyUnicode_KIND(sub_obj);
9347 kind = kind1 > kind2 ? kind1 : kind2;
9348 buf1 = PyUnicode_DATA(str_obj);
9349 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009350 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009351 if (!buf1)
9352 goto onError;
9353 buf2 = PyUnicode_DATA(sub_obj);
9354 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009355 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009356 if (!buf2)
9357 goto onError;
9358 len1 = PyUnicode_GET_LENGTH(str_obj);
9359 len2 = PyUnicode_GET_LENGTH(sub_obj);
9360
9361 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06009362 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009363 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009364 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9365 result = asciilib_count(
9366 ((Py_UCS1*)buf1) + start, end - start,
9367 buf2, len2, PY_SSIZE_T_MAX
9368 );
9369 else
9370 result = ucs1lib_count(
9371 ((Py_UCS1*)buf1) + start, end - start,
9372 buf2, len2, PY_SSIZE_T_MAX
9373 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009374 break;
9375 case PyUnicode_2BYTE_KIND:
9376 result = ucs2lib_count(
9377 ((Py_UCS2*)buf1) + start, end - start,
9378 buf2, len2, PY_SSIZE_T_MAX
9379 );
9380 break;
9381 case PyUnicode_4BYTE_KIND:
9382 result = ucs4lib_count(
9383 ((Py_UCS4*)buf1) + start, end - start,
9384 buf2, len2, PY_SSIZE_T_MAX
9385 );
9386 break;
9387 default:
9388 assert(0); result = 0;
9389 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009390
9391 Py_DECREF(sub_obj);
9392 Py_DECREF(str_obj);
9393
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009394 if (kind1 != kind)
9395 PyMem_Free(buf1);
9396 if (kind2 != kind)
9397 PyMem_Free(buf2);
9398
Guido van Rossumd57fd912000-03-10 22:53:23 +00009399 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009400 onError:
9401 Py_DECREF(sub_obj);
9402 Py_DECREF(str_obj);
9403 if (kind1 != kind && buf1)
9404 PyMem_Free(buf1);
9405 if (kind2 != kind && buf2)
9406 PyMem_Free(buf2);
9407 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009408}
9409
Alexander Belopolsky40018472011-02-26 01:02:56 +00009410Py_ssize_t
9411PyUnicode_Find(PyObject *str,
9412 PyObject *sub,
9413 Py_ssize_t start,
9414 Py_ssize_t end,
9415 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009416{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009417 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009418
Guido van Rossumd57fd912000-03-10 22:53:23 +00009419 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009420 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009421 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009422 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009423 if (!sub) {
9424 Py_DECREF(str);
9425 return -2;
9426 }
9427 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9428 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009429 Py_DECREF(str);
9430 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009431 }
Tim Petersced69f82003-09-16 20:30:58 +00009432
Victor Stinner794d5672011-10-10 03:21:36 +02009433 result = any_find_slice(direction,
9434 str, sub, start, end
9435 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009436
Guido van Rossumd57fd912000-03-10 22:53:23 +00009437 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009438 Py_DECREF(sub);
9439
Guido van Rossumd57fd912000-03-10 22:53:23 +00009440 return result;
9441}
9442
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009443Py_ssize_t
9444PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9445 Py_ssize_t start, Py_ssize_t end,
9446 int direction)
9447{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009448 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009449 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009450 if (PyUnicode_READY(str) == -1)
9451 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009452 if (start < 0 || end < 0) {
9453 PyErr_SetString(PyExc_IndexError, "string index out of range");
9454 return -2;
9455 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009456 if (end > PyUnicode_GET_LENGTH(str))
9457 end = PyUnicode_GET_LENGTH(str);
9458 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009459 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9460 kind, end-start, ch, direction);
9461 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009462 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009463 else
9464 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009465}
9466
Alexander Belopolsky40018472011-02-26 01:02:56 +00009467static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009468tailmatch(PyObject *self,
9469 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009470 Py_ssize_t start,
9471 Py_ssize_t end,
9472 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009473{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009474 int kind_self;
9475 int kind_sub;
9476 void *data_self;
9477 void *data_sub;
9478 Py_ssize_t offset;
9479 Py_ssize_t i;
9480 Py_ssize_t end_sub;
9481
9482 if (PyUnicode_READY(self) == -1 ||
9483 PyUnicode_READY(substring) == -1)
9484 return 0;
9485
9486 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009487 return 1;
9488
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009489 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9490 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009491 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009492 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009493
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009494 kind_self = PyUnicode_KIND(self);
9495 data_self = PyUnicode_DATA(self);
9496 kind_sub = PyUnicode_KIND(substring);
9497 data_sub = PyUnicode_DATA(substring);
9498 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9499
9500 if (direction > 0)
9501 offset = end;
9502 else
9503 offset = start;
9504
9505 if (PyUnicode_READ(kind_self, data_self, offset) ==
9506 PyUnicode_READ(kind_sub, data_sub, 0) &&
9507 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9508 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9509 /* If both are of the same kind, memcmp is sufficient */
9510 if (kind_self == kind_sub) {
9511 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009512 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009513 data_sub,
9514 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009515 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009516 }
9517 /* otherwise we have to compare each character by first accesing it */
9518 else {
9519 /* We do not need to compare 0 and len(substring)-1 because
9520 the if statement above ensured already that they are equal
9521 when we end up here. */
9522 // TODO: honor direction and do a forward or backwards search
9523 for (i = 1; i < end_sub; ++i) {
9524 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9525 PyUnicode_READ(kind_sub, data_sub, i))
9526 return 0;
9527 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009528 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009529 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009530 }
9531
9532 return 0;
9533}
9534
Alexander Belopolsky40018472011-02-26 01:02:56 +00009535Py_ssize_t
9536PyUnicode_Tailmatch(PyObject *str,
9537 PyObject *substr,
9538 Py_ssize_t start,
9539 Py_ssize_t end,
9540 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009541{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009542 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009543
Guido van Rossumd57fd912000-03-10 22:53:23 +00009544 str = PyUnicode_FromObject(str);
9545 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009546 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009547 substr = PyUnicode_FromObject(substr);
9548 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009549 Py_DECREF(str);
9550 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009551 }
Tim Petersced69f82003-09-16 20:30:58 +00009552
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009553 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009554 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009555 Py_DECREF(str);
9556 Py_DECREF(substr);
9557 return result;
9558}
9559
Guido van Rossumd57fd912000-03-10 22:53:23 +00009560/* Apply fixfct filter to the Unicode object self and return a
9561 reference to the modified object */
9562
Alexander Belopolsky40018472011-02-26 01:02:56 +00009563static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009564fixup(PyObject *self,
9565 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009566{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009567 PyObject *u;
9568 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009569 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009570
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009571 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009572 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009573 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009574 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009575
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009576 /* fix functions return the new maximum character in a string,
9577 if the kind of the resulting unicode object does not change,
9578 everything is fine. Otherwise we need to change the string kind
9579 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009580 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009581
9582 if (maxchar_new == 0) {
9583 /* no changes */;
9584 if (PyUnicode_CheckExact(self)) {
9585 Py_DECREF(u);
9586 Py_INCREF(self);
9587 return self;
9588 }
9589 else
9590 return u;
9591 }
9592
9593 if (maxchar_new <= 127)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009594 maxchar_new = 127;
9595 else if (maxchar_new <= 255)
9596 maxchar_new = 255;
9597 else if (maxchar_new <= 65535)
9598 maxchar_new = 65535;
9599 else
Victor Stinner8faf8212011-12-08 22:14:11 +01009600 maxchar_new = MAX_UNICODE;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009601
Victor Stinnereaab6042011-12-11 22:22:39 +01009602 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009603 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009604
9605 /* In case the maximum character changed, we need to
9606 convert the string to the new category. */
9607 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9608 if (v == NULL) {
9609 Py_DECREF(u);
9610 return NULL;
9611 }
9612 if (maxchar_new > maxchar_old) {
9613 /* If the maxchar increased so that the kind changed, not all
9614 characters are representable anymore and we need to fix the
9615 string again. This only happens in very few cases. */
9616 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
9617 maxchar_old = fixfct(v);
9618 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009619 }
9620 else {
Victor Stinnereaab6042011-12-11 22:22:39 +01009621 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009622 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009623 Py_DECREF(u);
9624 assert(_PyUnicode_CheckConsistency(v, 1));
9625 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009626}
9627
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009628static PyObject *
9629ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009630{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009631 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9632 char *resdata, *data = PyUnicode_DATA(self);
9633 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009634
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009635 res = PyUnicode_New(len, 127);
9636 if (res == NULL)
9637 return NULL;
9638 resdata = PyUnicode_DATA(res);
9639 if (lower)
9640 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009641 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009642 _Py_bytes_upper(resdata, data, len);
9643 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009644}
9645
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009646static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009647handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009648{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009649 Py_ssize_t j;
9650 int final_sigma;
9651 Py_UCS4 c;
9652 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009653
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009654 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9655
9656 where ! is a negation and \p{xxx} is a character with property xxx.
9657 */
9658 for (j = i - 1; j >= 0; j--) {
9659 c = PyUnicode_READ(kind, data, j);
9660 if (!_PyUnicode_IsCaseIgnorable(c))
9661 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009662 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009663 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9664 if (final_sigma) {
9665 for (j = i + 1; j < length; j++) {
9666 c = PyUnicode_READ(kind, data, j);
9667 if (!_PyUnicode_IsCaseIgnorable(c))
9668 break;
9669 }
9670 final_sigma = j == length || !_PyUnicode_IsCased(c);
9671 }
9672 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009673}
9674
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009675static int
9676lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9677 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009678{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009679 /* Obscure special case. */
9680 if (c == 0x3A3) {
9681 mapped[0] = handle_capital_sigma(kind, data, length, i);
9682 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009683 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009684 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009685}
9686
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009687static Py_ssize_t
9688do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009689{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009690 Py_ssize_t i, k = 0;
9691 int n_res, j;
9692 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009693
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009694 c = PyUnicode_READ(kind, data, 0);
9695 n_res = _PyUnicode_ToUpperFull(c, mapped);
9696 for (j = 0; j < n_res; j++) {
9697 if (mapped[j] > *maxchar)
9698 *maxchar = mapped[j];
9699 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009700 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009701 for (i = 1; i < length; i++) {
9702 c = PyUnicode_READ(kind, data, i);
9703 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9704 for (j = 0; j < n_res; j++) {
9705 if (mapped[j] > *maxchar)
9706 *maxchar = mapped[j];
9707 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009708 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009709 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009710 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009711}
9712
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009713static Py_ssize_t
9714do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9715 Py_ssize_t i, k = 0;
9716
9717 for (i = 0; i < length; i++) {
9718 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9719 int n_res, j;
9720 if (Py_UNICODE_ISUPPER(c)) {
9721 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9722 }
9723 else if (Py_UNICODE_ISLOWER(c)) {
9724 n_res = _PyUnicode_ToUpperFull(c, mapped);
9725 }
9726 else {
9727 n_res = 1;
9728 mapped[0] = c;
9729 }
9730 for (j = 0; j < n_res; j++) {
9731 if (mapped[j] > *maxchar)
9732 *maxchar = mapped[j];
9733 res[k++] = mapped[j];
9734 }
9735 }
9736 return k;
9737}
9738
9739static Py_ssize_t
9740do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9741 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009742{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009743 Py_ssize_t i, k = 0;
9744
9745 for (i = 0; i < length; i++) {
9746 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9747 int n_res, j;
9748 if (lower)
9749 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9750 else
9751 n_res = _PyUnicode_ToUpperFull(c, mapped);
9752 for (j = 0; j < n_res; j++) {
9753 if (mapped[j] > *maxchar)
9754 *maxchar = mapped[j];
9755 res[k++] = mapped[j];
9756 }
9757 }
9758 return k;
9759}
9760
9761static Py_ssize_t
9762do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9763{
9764 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9765}
9766
9767static Py_ssize_t
9768do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9769{
9770 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9771}
9772
Benjamin Petersone51757f2012-01-12 21:10:29 -05009773static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009774do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9775{
9776 Py_ssize_t i, k = 0;
9777
9778 for (i = 0; i < length; i++) {
9779 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9780 Py_UCS4 mapped[3];
9781 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9782 for (j = 0; j < n_res; j++) {
9783 if (mapped[j] > *maxchar)
9784 *maxchar = mapped[j];
9785 res[k++] = mapped[j];
9786 }
9787 }
9788 return k;
9789}
9790
9791static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009792do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9793{
9794 Py_ssize_t i, k = 0;
9795 int previous_is_cased;
9796
9797 previous_is_cased = 0;
9798 for (i = 0; i < length; i++) {
9799 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9800 Py_UCS4 mapped[3];
9801 int n_res, j;
9802
9803 if (previous_is_cased)
9804 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9805 else
9806 n_res = _PyUnicode_ToTitleFull(c, mapped);
9807
9808 for (j = 0; j < n_res; j++) {
9809 if (mapped[j] > *maxchar)
9810 *maxchar = mapped[j];
9811 res[k++] = mapped[j];
9812 }
9813
9814 previous_is_cased = _PyUnicode_IsCased(c);
9815 }
9816 return k;
9817}
9818
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009819static PyObject *
9820case_operation(PyObject *self,
9821 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9822{
9823 PyObject *res = NULL;
9824 Py_ssize_t length, newlength = 0;
9825 int kind, outkind;
9826 void *data, *outdata;
9827 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9828
Benjamin Petersoneea48462012-01-16 14:28:50 -05009829 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009830
9831 kind = PyUnicode_KIND(self);
9832 data = PyUnicode_DATA(self);
9833 length = PyUnicode_GET_LENGTH(self);
9834 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9835 if (tmp == NULL)
9836 return PyErr_NoMemory();
9837 newlength = perform(kind, data, length, tmp, &maxchar);
9838 res = PyUnicode_New(newlength, maxchar);
9839 if (res == NULL)
9840 goto leave;
9841 tmpend = tmp + newlength;
9842 outdata = PyUnicode_DATA(res);
9843 outkind = PyUnicode_KIND(res);
9844 switch (outkind) {
9845 case PyUnicode_1BYTE_KIND:
9846 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9847 break;
9848 case PyUnicode_2BYTE_KIND:
9849 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9850 break;
9851 case PyUnicode_4BYTE_KIND:
9852 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9853 break;
9854 default:
9855 assert(0);
9856 break;
9857 }
9858 leave:
9859 PyMem_FREE(tmp);
9860 return res;
9861}
9862
Tim Peters8ce9f162004-08-27 01:49:32 +00009863PyObject *
9864PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009865{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009866 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009867 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009868 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009869 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009870 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9871 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009872 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009873 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009874 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009875 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009876 int use_memcpy;
9877 unsigned char *res_data = NULL, *sep_data = NULL;
9878 PyObject *last_obj;
9879 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009880
Tim Peters05eba1f2004-08-27 21:32:02 +00009881 fseq = PySequence_Fast(seq, "");
9882 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009883 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009884 }
9885
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009886 /* NOTE: the following code can't call back into Python code,
9887 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009888 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009889
Tim Peters05eba1f2004-08-27 21:32:02 +00009890 seqlen = PySequence_Fast_GET_SIZE(fseq);
9891 /* If empty sequence, return u"". */
9892 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009893 Py_DECREF(fseq);
9894 Py_INCREF(unicode_empty);
9895 res = unicode_empty;
9896 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009897 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009898
Tim Peters05eba1f2004-08-27 21:32:02 +00009899 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009900 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009901 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009902 if (seqlen == 1) {
9903 if (PyUnicode_CheckExact(items[0])) {
9904 res = items[0];
9905 Py_INCREF(res);
9906 Py_DECREF(fseq);
9907 return res;
9908 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009909 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009910 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009911 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009912 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009913 /* Set up sep and seplen */
9914 if (separator == NULL) {
9915 /* fall back to a blank space separator */
9916 sep = PyUnicode_FromOrdinal(' ');
9917 if (!sep)
9918 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009919 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009920 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009921 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009922 else {
9923 if (!PyUnicode_Check(separator)) {
9924 PyErr_Format(PyExc_TypeError,
9925 "separator: expected str instance,"
9926 " %.80s found",
9927 Py_TYPE(separator)->tp_name);
9928 goto onError;
9929 }
9930 if (PyUnicode_READY(separator))
9931 goto onError;
9932 sep = separator;
9933 seplen = PyUnicode_GET_LENGTH(separator);
9934 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9935 /* inc refcount to keep this code path symmetric with the
9936 above case of a blank separator */
9937 Py_INCREF(sep);
9938 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009939 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009940 }
9941
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009942 /* There are at least two things to join, or else we have a subclass
9943 * of str in the sequence.
9944 * Do a pre-pass to figure out the total amount of space we'll
9945 * need (sz), and see whether all argument are strings.
9946 */
9947 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009948#ifdef Py_DEBUG
9949 use_memcpy = 0;
9950#else
9951 use_memcpy = 1;
9952#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009953 for (i = 0; i < seqlen; i++) {
9954 const Py_ssize_t old_sz = sz;
9955 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009956 if (!PyUnicode_Check(item)) {
9957 PyErr_Format(PyExc_TypeError,
9958 "sequence item %zd: expected str instance,"
9959 " %.80s found",
9960 i, Py_TYPE(item)->tp_name);
9961 goto onError;
9962 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009963 if (PyUnicode_READY(item) == -1)
9964 goto onError;
9965 sz += PyUnicode_GET_LENGTH(item);
9966 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009967 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009968 if (i != 0)
9969 sz += seplen;
9970 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9971 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009972 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009973 goto onError;
9974 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009975 if (use_memcpy && last_obj != NULL) {
9976 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9977 use_memcpy = 0;
9978 }
9979 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009980 }
Tim Petersced69f82003-09-16 20:30:58 +00009981
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009982 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009983 if (res == NULL)
9984 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009985
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009986 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009987#ifdef Py_DEBUG
9988 use_memcpy = 0;
9989#else
9990 if (use_memcpy) {
9991 res_data = PyUnicode_1BYTE_DATA(res);
9992 kind = PyUnicode_KIND(res);
9993 if (seplen != 0)
9994 sep_data = PyUnicode_1BYTE_DATA(sep);
9995 }
9996#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009997 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009998 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009999 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010000 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +020010001 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +020010002 if (use_memcpy) {
10003 Py_MEMCPY(res_data,
10004 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010005 kind * seplen);
10006 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010007 }
10008 else {
10009 copy_characters(res, res_offset, sep, 0, seplen);
10010 res_offset += seplen;
10011 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010012 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010013 itemlen = PyUnicode_GET_LENGTH(item);
10014 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +020010015 if (use_memcpy) {
10016 Py_MEMCPY(res_data,
10017 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010018 kind * itemlen);
10019 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010020 }
10021 else {
10022 copy_characters(res, res_offset, item, 0, itemlen);
10023 res_offset += itemlen;
10024 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010025 }
Tim Peters05eba1f2004-08-27 21:32:02 +000010026 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010027 if (use_memcpy)
10028 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010029 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +020010030 else
10031 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +000010032
Tim Peters05eba1f2004-08-27 21:32:02 +000010033 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010034 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010035 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010036 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010037
Benjamin Peterson29060642009-01-31 22:14:21 +000010038 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +000010039 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010040 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010041 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010042 return NULL;
10043}
10044
/* Store `value` into `length` consecutive code units of the buffer `data`,
   beginning at code-unit index `start`.  `kind` selects the code-unit width
   (1, 2 or 4 byte kind); the wchar kind is rejected by an assertion and any
   other value trips assert(0).

   Every macro argument is parenthesized at its point of use so expression
   arguments expand safely.  `start`, `length` and `value` may be evaluated
   more than once, so callers must not pass expressions with side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
10068
Victor Stinner3fe55312012-01-04 00:33:50 +010010069Py_ssize_t
10070PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10071 Py_UCS4 fill_char)
10072{
10073 Py_ssize_t maxlen;
10074 enum PyUnicode_Kind kind;
10075 void *data;
10076
10077 if (!PyUnicode_Check(unicode)) {
10078 PyErr_BadInternalCall();
10079 return -1;
10080 }
10081 if (PyUnicode_READY(unicode) == -1)
10082 return -1;
10083 if (unicode_check_modifiable(unicode))
10084 return -1;
10085
10086 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10087 PyErr_SetString(PyExc_ValueError,
10088 "fill character is bigger than "
10089 "the string maximum character");
10090 return -1;
10091 }
10092
10093 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10094 length = Py_MIN(maxlen, length);
10095 if (length <= 0)
10096 return 0;
10097
10098 kind = PyUnicode_KIND(unicode);
10099 data = PyUnicode_DATA(unicode);
10100 FILL(kind, data, fill_char, start, length);
10101 return length;
10102}
10103
Victor Stinner9310abb2011-10-05 00:59:23 +020010104static PyObject *
10105pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010106 Py_ssize_t left,
10107 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010108 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010109{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010110 PyObject *u;
10111 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010112 int kind;
10113 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010114
10115 if (left < 0)
10116 left = 0;
10117 if (right < 0)
10118 right = 0;
10119
Victor Stinnerc4b49542011-12-11 22:44:26 +010010120 if (left == 0 && right == 0)
10121 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010122
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010123 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10124 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010125 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10126 return NULL;
10127 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010128 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10129 if (fill > maxchar)
10130 maxchar = fill;
10131 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010132 if (!u)
10133 return NULL;
10134
10135 kind = PyUnicode_KIND(u);
10136 data = PyUnicode_DATA(u);
10137 if (left)
10138 FILL(kind, data, fill, 0, left);
10139 if (right)
10140 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010141 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010142 assert(_PyUnicode_CheckConsistency(u, 1));
10143 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010144}
10145
Alexander Belopolsky40018472011-02-26 01:02:56 +000010146PyObject *
10147PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010148{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010149 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010150
10151 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010152 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010153 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060010154 if (PyUnicode_READY(string) == -1) {
10155 Py_DECREF(string);
10156 return NULL;
10157 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010158
Benjamin Petersonead6b532011-12-20 17:23:42 -060010159 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010160 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010161 if (PyUnicode_IS_ASCII(string))
10162 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010163 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010164 PyUnicode_GET_LENGTH(string), keepends);
10165 else
10166 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010167 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010168 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010169 break;
10170 case PyUnicode_2BYTE_KIND:
10171 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010172 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010173 PyUnicode_GET_LENGTH(string), keepends);
10174 break;
10175 case PyUnicode_4BYTE_KIND:
10176 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010177 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010178 PyUnicode_GET_LENGTH(string), keepends);
10179 break;
10180 default:
10181 assert(0);
10182 list = 0;
10183 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010184 Py_DECREF(string);
10185 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010186}
10187
Alexander Belopolsky40018472011-02-26 01:02:56 +000010188static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010189split(PyObject *self,
10190 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010191 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010192{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010193 int kind1, kind2, kind;
10194 void *buf1, *buf2;
10195 Py_ssize_t len1, len2;
10196 PyObject* out;
10197
Guido van Rossumd57fd912000-03-10 22:53:23 +000010198 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010199 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010200
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010201 if (PyUnicode_READY(self) == -1)
10202 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010203
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010204 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010205 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010206 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010207 if (PyUnicode_IS_ASCII(self))
10208 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010209 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010210 PyUnicode_GET_LENGTH(self), maxcount
10211 );
10212 else
10213 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010214 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010215 PyUnicode_GET_LENGTH(self), maxcount
10216 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010217 case PyUnicode_2BYTE_KIND:
10218 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010219 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010220 PyUnicode_GET_LENGTH(self), maxcount
10221 );
10222 case PyUnicode_4BYTE_KIND:
10223 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010224 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010225 PyUnicode_GET_LENGTH(self), maxcount
10226 );
10227 default:
10228 assert(0);
10229 return NULL;
10230 }
10231
10232 if (PyUnicode_READY(substring) == -1)
10233 return NULL;
10234
10235 kind1 = PyUnicode_KIND(self);
10236 kind2 = PyUnicode_KIND(substring);
10237 kind = kind1 > kind2 ? kind1 : kind2;
10238 buf1 = PyUnicode_DATA(self);
10239 buf2 = PyUnicode_DATA(substring);
10240 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010241 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010242 if (!buf1)
10243 return NULL;
10244 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010245 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010246 if (!buf2) {
10247 if (kind1 != kind) PyMem_Free(buf1);
10248 return NULL;
10249 }
10250 len1 = PyUnicode_GET_LENGTH(self);
10251 len2 = PyUnicode_GET_LENGTH(substring);
10252
Benjamin Petersonead6b532011-12-20 17:23:42 -060010253 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010254 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010255 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10256 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010257 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010258 else
10259 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010260 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010261 break;
10262 case PyUnicode_2BYTE_KIND:
10263 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010264 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010265 break;
10266 case PyUnicode_4BYTE_KIND:
10267 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010268 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010269 break;
10270 default:
10271 out = NULL;
10272 }
10273 if (kind1 != kind)
10274 PyMem_Free(buf1);
10275 if (kind2 != kind)
10276 PyMem_Free(buf2);
10277 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010278}
10279
Alexander Belopolsky40018472011-02-26 01:02:56 +000010280static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010281rsplit(PyObject *self,
10282 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010283 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010284{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010285 int kind1, kind2, kind;
10286 void *buf1, *buf2;
10287 Py_ssize_t len1, len2;
10288 PyObject* out;
10289
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010290 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010291 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010292
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010293 if (PyUnicode_READY(self) == -1)
10294 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010295
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010296 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010297 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010298 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010299 if (PyUnicode_IS_ASCII(self))
10300 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010301 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010302 PyUnicode_GET_LENGTH(self), maxcount
10303 );
10304 else
10305 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010306 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010307 PyUnicode_GET_LENGTH(self), maxcount
10308 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010309 case PyUnicode_2BYTE_KIND:
10310 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010311 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010312 PyUnicode_GET_LENGTH(self), maxcount
10313 );
10314 case PyUnicode_4BYTE_KIND:
10315 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010316 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010317 PyUnicode_GET_LENGTH(self), maxcount
10318 );
10319 default:
10320 assert(0);
10321 return NULL;
10322 }
10323
10324 if (PyUnicode_READY(substring) == -1)
10325 return NULL;
10326
10327 kind1 = PyUnicode_KIND(self);
10328 kind2 = PyUnicode_KIND(substring);
10329 kind = kind1 > kind2 ? kind1 : kind2;
10330 buf1 = PyUnicode_DATA(self);
10331 buf2 = PyUnicode_DATA(substring);
10332 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010333 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010334 if (!buf1)
10335 return NULL;
10336 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010337 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010338 if (!buf2) {
10339 if (kind1 != kind) PyMem_Free(buf1);
10340 return NULL;
10341 }
10342 len1 = PyUnicode_GET_LENGTH(self);
10343 len2 = PyUnicode_GET_LENGTH(substring);
10344
Benjamin Petersonead6b532011-12-20 17:23:42 -060010345 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010346 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010347 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10348 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010349 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010350 else
10351 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010352 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010353 break;
10354 case PyUnicode_2BYTE_KIND:
10355 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010356 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010357 break;
10358 case PyUnicode_4BYTE_KIND:
10359 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010360 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010361 break;
10362 default:
10363 out = NULL;
10364 }
10365 if (kind1 != kind)
10366 PyMem_Free(buf1);
10367 if (kind2 != kind)
10368 PyMem_Free(buf2);
10369 return out;
10370}
10371
10372static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010373anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10374 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010375{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010376 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010377 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010378 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10379 return asciilib_find(buf1, len1, buf2, len2, offset);
10380 else
10381 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010382 case PyUnicode_2BYTE_KIND:
10383 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10384 case PyUnicode_4BYTE_KIND:
10385 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10386 }
10387 assert(0);
10388 return -1;
10389}
10390
10391static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010392anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10393 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010394{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010395 switch (kind) {
10396 case PyUnicode_1BYTE_KIND:
10397 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10398 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10399 else
10400 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10401 case PyUnicode_2BYTE_KIND:
10402 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10403 case PyUnicode_4BYTE_KIND:
10404 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10405 }
10406 assert(0);
10407 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010408}
10409
Alexander Belopolsky40018472011-02-26 01:02:56 +000010410static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010411replace(PyObject *self, PyObject *str1,
10412 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010413{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010414 PyObject *u;
10415 char *sbuf = PyUnicode_DATA(self);
10416 char *buf1 = PyUnicode_DATA(str1);
10417 char *buf2 = PyUnicode_DATA(str2);
10418 int srelease = 0, release1 = 0, release2 = 0;
10419 int skind = PyUnicode_KIND(self);
10420 int kind1 = PyUnicode_KIND(str1);
10421 int kind2 = PyUnicode_KIND(str2);
10422 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10423 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10424 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010425 int mayshrink;
10426 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010427
10428 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010429 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010430 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010431 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010432
Victor Stinner59de0ee2011-10-07 10:01:28 +020010433 if (str1 == str2)
10434 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010435 if (skind < kind1)
10436 /* substring too wide to be present */
10437 goto nothing;
10438
Victor Stinner49a0a212011-10-12 23:46:10 +020010439 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10440 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10441 /* Replacing str1 with str2 may cause a maxchar reduction in the
10442 result string. */
10443 mayshrink = (maxchar_str2 < maxchar);
10444 maxchar = Py_MAX(maxchar, maxchar_str2);
10445
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010446 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010447 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010448 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010449 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010450 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010451 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010452 Py_UCS4 u1, u2;
10453 int rkind;
Victor Stinnerf6441102011-12-18 02:43:08 +010010454 Py_ssize_t index, pos;
10455 char *src;
10456
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010457 u1 = PyUnicode_READ_CHAR(str1, 0);
Victor Stinnerf6441102011-12-18 02:43:08 +010010458 pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
10459 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010460 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010461 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010462 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010463 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010464 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010465 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010466 rkind = PyUnicode_KIND(u);
Victor Stinnerf6441102011-12-18 02:43:08 +010010467
10468 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
10469 index = 0;
10470 src = sbuf;
10471 while (--maxcount)
10472 {
10473 pos++;
10474 src += pos * PyUnicode_KIND(self);
10475 slen -= pos;
10476 index += pos;
10477 pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
10478 if (pos < 0)
10479 break;
10480 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
10481 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010482 }
10483 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010484 int rkind = skind;
10485 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010486 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010487
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010488 if (kind1 < rkind) {
10489 /* widen substring */
10490 buf1 = _PyUnicode_AsKind(str1, rkind);
10491 if (!buf1) goto error;
10492 release1 = 1;
10493 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010494 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010495 if (i < 0)
10496 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010497 if (rkind > kind2) {
10498 /* widen replacement */
10499 buf2 = _PyUnicode_AsKind(str2, rkind);
10500 if (!buf2) goto error;
10501 release2 = 1;
10502 }
10503 else if (rkind < kind2) {
10504 /* widen self and buf1 */
10505 rkind = kind2;
10506 if (release1) PyMem_Free(buf1);
10507 sbuf = _PyUnicode_AsKind(self, rkind);
10508 if (!sbuf) goto error;
10509 srelease = 1;
10510 buf1 = _PyUnicode_AsKind(str1, rkind);
10511 if (!buf1) goto error;
10512 release1 = 1;
10513 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010514 u = PyUnicode_New(slen, maxchar);
10515 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010516 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010517 assert(PyUnicode_KIND(u) == rkind);
10518 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010519
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010520 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010521 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010522 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010523 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010524 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010525 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010526
10527 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010528 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010529 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010530 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010531 if (i == -1)
10532 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010533 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010534 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010535 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010536 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010537 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010538 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010539 }
10540 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010541 Py_ssize_t n, i, j, ires;
10542 Py_ssize_t product, new_size;
10543 int rkind = skind;
10544 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010545
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010546 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010547 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010548 buf1 = _PyUnicode_AsKind(str1, rkind);
10549 if (!buf1) goto error;
10550 release1 = 1;
10551 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010552 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010553 if (n == 0)
10554 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010555 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010556 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010557 buf2 = _PyUnicode_AsKind(str2, rkind);
10558 if (!buf2) goto error;
10559 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010560 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010561 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010562 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010563 rkind = kind2;
10564 sbuf = _PyUnicode_AsKind(self, rkind);
10565 if (!sbuf) goto error;
10566 srelease = 1;
10567 if (release1) PyMem_Free(buf1);
10568 buf1 = _PyUnicode_AsKind(str1, rkind);
10569 if (!buf1) goto error;
10570 release1 = 1;
10571 }
10572 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10573 PyUnicode_GET_LENGTH(str1))); */
10574 product = n * (len2-len1);
10575 if ((product / (len2-len1)) != n) {
10576 PyErr_SetString(PyExc_OverflowError,
10577 "replace string is too long");
10578 goto error;
10579 }
10580 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010581 if (new_size == 0) {
10582 Py_INCREF(unicode_empty);
10583 u = unicode_empty;
10584 goto done;
10585 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010586 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10587 PyErr_SetString(PyExc_OverflowError,
10588 "replace string is too long");
10589 goto error;
10590 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010591 u = PyUnicode_New(new_size, maxchar);
10592 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010593 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010594 assert(PyUnicode_KIND(u) == rkind);
10595 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010596 ires = i = 0;
10597 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010598 while (n-- > 0) {
10599 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010600 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010601 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010602 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010603 if (j == -1)
10604 break;
10605 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010606 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010607 memcpy(res + rkind * ires,
10608 sbuf + rkind * i,
10609 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010610 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010611 }
10612 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010613 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010614 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010615 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010616 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010617 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010618 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010619 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010620 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010621 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010622 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010623 memcpy(res + rkind * ires,
10624 sbuf + rkind * i,
10625 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010626 }
10627 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010628 /* interleave */
10629 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010630 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010631 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010632 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010633 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010634 if (--n <= 0)
10635 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010636 memcpy(res + rkind * ires,
10637 sbuf + rkind * i,
10638 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010639 ires++;
10640 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010641 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010642 memcpy(res + rkind * ires,
10643 sbuf + rkind * i,
10644 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010645 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010646 }
10647
10648 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010649 unicode_adjust_maxchar(&u);
10650 if (u == NULL)
10651 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010652 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010653
10654 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010655 if (srelease)
10656 PyMem_FREE(sbuf);
10657 if (release1)
10658 PyMem_FREE(buf1);
10659 if (release2)
10660 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010661 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010662 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010663
Benjamin Peterson29060642009-01-31 22:14:21 +000010664 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010665 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010666 if (srelease)
10667 PyMem_FREE(sbuf);
10668 if (release1)
10669 PyMem_FREE(buf1);
10670 if (release2)
10671 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010672 return unicode_result_unchanged(self);
10673
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010674 error:
10675 if (srelease && sbuf)
10676 PyMem_FREE(sbuf);
10677 if (release1 && buf1)
10678 PyMem_FREE(buf1);
10679 if (release2 && buf2)
10680 PyMem_FREE(buf2);
10681 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010682}
10683
10684/* --- Unicode Object Methods --------------------------------------------- */
10685
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010686PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010687 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010688\n\
10689Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010690characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010691
10692static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010693unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010694{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010695 if (PyUnicode_READY(self) == -1)
10696 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010697 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010698}
10699
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010700PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010701 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010702\n\
10703Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010704have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010705
10706static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010707unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010708{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010709 if (PyUnicode_READY(self) == -1)
10710 return NULL;
10711 if (PyUnicode_GET_LENGTH(self) == 0)
10712 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010713 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010714}
10715
Benjamin Petersond5890c82012-01-14 13:23:30 -050010716PyDoc_STRVAR(casefold__doc__,
10717 "S.casefold() -> str\n\
10718\n\
10719Return a version of S suitable for caseless comparisons.");
10720
10721static PyObject *
10722unicode_casefold(PyObject *self)
10723{
10724 if (PyUnicode_READY(self) == -1)
10725 return NULL;
10726 if (PyUnicode_IS_ASCII(self))
10727 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010728 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010729}
10730
10731
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010732/* Argument converter. Coerces to a single unicode character */
10733
10734static int
10735convert_uc(PyObject *obj, void *addr)
10736{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010737 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010738 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010739
Benjamin Peterson14339b62009-01-31 16:36:08 +000010740 uniobj = PyUnicode_FromObject(obj);
10741 if (uniobj == NULL) {
10742 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010743 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010744 return 0;
10745 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010746 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010747 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010748 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010749 Py_DECREF(uniobj);
10750 return 0;
10751 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010752 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010753 Py_DECREF(uniobj);
10754 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010755}
10756
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010757PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010758 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010759\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010760Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010761done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010762
10763static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010764unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010765{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010766 Py_ssize_t marg, left;
10767 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010768 Py_UCS4 fillchar = ' ';
10769
Victor Stinnere9a29352011-10-01 02:14:59 +020010770 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010771 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010772
Benjamin Petersonbac79492012-01-14 13:34:47 -050010773 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010774 return NULL;
10775
Victor Stinnerc4b49542011-12-11 22:44:26 +010010776 if (PyUnicode_GET_LENGTH(self) >= width)
10777 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010778
Victor Stinnerc4b49542011-12-11 22:44:26 +010010779 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010780 left = marg / 2 + (marg & width & 1);
10781
Victor Stinner9310abb2011-10-05 00:59:23 +020010782 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010783}
10784
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010785/* This function assumes that str1 and str2 are readied by the caller. */
10786
Marc-André Lemburge5034372000-08-08 08:04:29 +000010787static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010788unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010789{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010790 int kind1, kind2;
10791 void *data1, *data2;
10792 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010793
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010794 kind1 = PyUnicode_KIND(str1);
10795 kind2 = PyUnicode_KIND(str2);
10796 data1 = PyUnicode_DATA(str1);
10797 data2 = PyUnicode_DATA(str2);
10798 len1 = PyUnicode_GET_LENGTH(str1);
10799 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010800
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010801 for (i = 0; i < len1 && i < len2; ++i) {
10802 Py_UCS4 c1, c2;
10803 c1 = PyUnicode_READ(kind1, data1, i);
10804 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010805
10806 if (c1 != c2)
10807 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010808 }
10809
10810 return (len1 < len2) ? -1 : (len1 != len2);
10811}
10812
Alexander Belopolsky40018472011-02-26 01:02:56 +000010813int
10814PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010815{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010816 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10817 if (PyUnicode_READY(left) == -1 ||
10818 PyUnicode_READY(right) == -1)
10819 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010820 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010821 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010822 PyErr_Format(PyExc_TypeError,
10823 "Can't compare %.100s and %.100s",
10824 left->ob_type->tp_name,
10825 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010826 return -1;
10827}
10828
Martin v. Löwis5b222132007-06-10 09:51:05 +000010829int
10830PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10831{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010832 Py_ssize_t i;
10833 int kind;
10834 void *data;
10835 Py_UCS4 chr;
10836
Victor Stinner910337b2011-10-03 03:20:16 +020010837 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010838 if (PyUnicode_READY(uni) == -1)
10839 return -1;
10840 kind = PyUnicode_KIND(uni);
10841 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010842 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010843 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10844 if (chr != str[i])
10845 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010846 /* This check keeps Python strings that end in '\0' from comparing equal
10847 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010848 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010849 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010850 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010851 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010852 return 0;
10853}
10854
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010855
Benjamin Peterson29060642009-01-31 22:14:21 +000010856#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010857 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010858
Alexander Belopolsky40018472011-02-26 01:02:56 +000010859PyObject *
10860PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010861{
10862 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010863
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010864 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10865 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010866 if (PyUnicode_READY(left) == -1 ||
10867 PyUnicode_READY(right) == -1)
10868 return NULL;
10869 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10870 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010871 if (op == Py_EQ) {
10872 Py_INCREF(Py_False);
10873 return Py_False;
10874 }
10875 if (op == Py_NE) {
10876 Py_INCREF(Py_True);
10877 return Py_True;
10878 }
10879 }
10880 if (left == right)
10881 result = 0;
10882 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010883 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010884
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010885 /* Convert the return value to a Boolean */
10886 switch (op) {
10887 case Py_EQ:
10888 v = TEST_COND(result == 0);
10889 break;
10890 case Py_NE:
10891 v = TEST_COND(result != 0);
10892 break;
10893 case Py_LE:
10894 v = TEST_COND(result <= 0);
10895 break;
10896 case Py_GE:
10897 v = TEST_COND(result >= 0);
10898 break;
10899 case Py_LT:
10900 v = TEST_COND(result == -1);
10901 break;
10902 case Py_GT:
10903 v = TEST_COND(result == 1);
10904 break;
10905 default:
10906 PyErr_BadArgument();
10907 return NULL;
10908 }
10909 Py_INCREF(v);
10910 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010911 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010912
Brian Curtindfc80e32011-08-10 20:28:54 -050010913 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010914}
10915
Alexander Belopolsky40018472011-02-26 01:02:56 +000010916int
10917PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010918{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010919 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010920 int kind1, kind2, kind;
10921 void *buf1, *buf2;
10922 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010923 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010924
10925 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010926 sub = PyUnicode_FromObject(element);
10927 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010928 PyErr_Format(PyExc_TypeError,
10929 "'in <string>' requires string as left operand, not %s",
10930 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010931 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010932 }
10933
Thomas Wouters477c8d52006-05-27 19:21:47 +000010934 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010935 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010936 Py_DECREF(sub);
10937 return -1;
10938 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060010939 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
10940 Py_DECREF(sub);
10941 Py_DECREF(str);
10942 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010943
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010944 kind1 = PyUnicode_KIND(str);
10945 kind2 = PyUnicode_KIND(sub);
10946 kind = kind1 > kind2 ? kind1 : kind2;
10947 buf1 = PyUnicode_DATA(str);
10948 buf2 = PyUnicode_DATA(sub);
10949 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010950 buf1 = _PyUnicode_AsKind(str, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010951 if (!buf1) {
10952 Py_DECREF(sub);
10953 return -1;
10954 }
10955 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010956 buf2 = _PyUnicode_AsKind(sub, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010957 if (!buf2) {
10958 Py_DECREF(sub);
10959 if (kind1 != kind) PyMem_Free(buf1);
10960 return -1;
10961 }
10962 len1 = PyUnicode_GET_LENGTH(str);
10963 len2 = PyUnicode_GET_LENGTH(sub);
10964
Benjamin Petersonead6b532011-12-20 17:23:42 -060010965 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010966 case PyUnicode_1BYTE_KIND:
10967 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10968 break;
10969 case PyUnicode_2BYTE_KIND:
10970 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10971 break;
10972 case PyUnicode_4BYTE_KIND:
10973 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10974 break;
10975 default:
10976 result = -1;
10977 assert(0);
10978 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010979
10980 Py_DECREF(str);
10981 Py_DECREF(sub);
10982
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010983 if (kind1 != kind)
10984 PyMem_Free(buf1);
10985 if (kind2 != kind)
10986 PyMem_Free(buf2);
10987
Guido van Rossum403d68b2000-03-13 15:55:09 +000010988 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010989}
10990
Guido van Rossumd57fd912000-03-10 22:53:23 +000010991/* Concat to string or Unicode object giving a new Unicode object. */
10992
Alexander Belopolsky40018472011-02-26 01:02:56 +000010993PyObject *
10994PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010995{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010996 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010997 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010998 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010999
11000 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011001 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011002 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011003 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011004 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011005 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011006 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011007
11008 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020011009 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011010 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011011 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011012 }
Victor Stinnera464fc12011-10-02 20:39:30 +020011013 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011014 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011015 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011016 }
11017
Victor Stinner488fa492011-12-12 00:01:39 +010011018 u_len = PyUnicode_GET_LENGTH(u);
11019 v_len = PyUnicode_GET_LENGTH(v);
11020 if (u_len > PY_SSIZE_T_MAX - v_len) {
11021 PyErr_SetString(PyExc_OverflowError,
11022 "strings are too large to concat");
11023 goto onError;
11024 }
11025 new_len = u_len + v_len;
11026
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011027 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020011028 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
11029 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011030
Guido van Rossumd57fd912000-03-10 22:53:23 +000011031 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010011032 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011033 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011034 goto onError;
Victor Stinner488fa492011-12-12 00:01:39 +010011035 copy_characters(w, 0, u, 0, u_len);
11036 copy_characters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011037 Py_DECREF(u);
11038 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011039 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011040 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011041
Benjamin Peterson29060642009-01-31 22:14:21 +000011042 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011043 Py_XDECREF(u);
11044 Py_XDECREF(v);
11045 return NULL;
11046}
11047
Walter Dörwald1ab83302007-05-18 17:15:44 +000011048void
Victor Stinner23e56682011-10-03 03:54:37 +020011049PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011050{
Victor Stinner23e56682011-10-03 03:54:37 +020011051 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011052 Py_UCS4 maxchar, maxchar2;
11053 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011054
11055 if (p_left == NULL) {
11056 if (!PyErr_Occurred())
11057 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011058 return;
11059 }
Victor Stinner23e56682011-10-03 03:54:37 +020011060 left = *p_left;
11061 if (right == NULL || !PyUnicode_Check(left)) {
11062 if (!PyErr_Occurred())
11063 PyErr_BadInternalCall();
11064 goto error;
11065 }
11066
Benjamin Petersonbac79492012-01-14 13:34:47 -050011067 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011068 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011069 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011070 goto error;
11071
Victor Stinner488fa492011-12-12 00:01:39 +010011072 /* Shortcuts */
11073 if (left == unicode_empty) {
11074 Py_DECREF(left);
11075 Py_INCREF(right);
11076 *p_left = right;
11077 return;
11078 }
11079 if (right == unicode_empty)
11080 return;
11081
11082 left_len = PyUnicode_GET_LENGTH(left);
11083 right_len = PyUnicode_GET_LENGTH(right);
11084 if (left_len > PY_SSIZE_T_MAX - right_len) {
11085 PyErr_SetString(PyExc_OverflowError,
11086 "strings are too large to concat");
11087 goto error;
11088 }
11089 new_len = left_len + right_len;
11090
11091 if (unicode_modifiable(left)
11092 && PyUnicode_CheckExact(right)
11093 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011094 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11095 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011096 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011097 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011098 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11099 {
11100 /* append inplace */
11101 if (unicode_resize(p_left, new_len) != 0) {
11102 /* XXX if _PyUnicode_Resize() fails, 'left' has been
11103 * deallocated so it cannot be put back into
11104 * 'variable'. The MemoryError is raised when there
11105 * is no value in 'variable', which might (very
11106 * remotely) be a cause of incompatibilities.
11107 */
11108 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020011109 }
Victor Stinner488fa492011-12-12 00:01:39 +010011110 /* copy 'right' into the newly allocated area of 'left' */
11111 copy_characters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011112 }
Victor Stinner488fa492011-12-12 00:01:39 +010011113 else {
11114 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11115 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11116 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011117
Victor Stinner488fa492011-12-12 00:01:39 +010011118 /* Concat the two Unicode strings */
11119 res = PyUnicode_New(new_len, maxchar);
11120 if (res == NULL)
11121 goto error;
11122 copy_characters(res, 0, left, 0, left_len);
11123 copy_characters(res, left_len, right, 0, right_len);
11124 Py_DECREF(left);
11125 *p_left = res;
11126 }
11127 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011128 return;
11129
11130error:
Victor Stinner488fa492011-12-12 00:01:39 +010011131 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011132}
11133
11134void
11135PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11136{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011137 PyUnicode_Append(pleft, right);
11138 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011139}
11140
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011141PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011142 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011143\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011144Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011145string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011146interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011147
11148static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011149unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011150{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011151 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011152 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011153 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011154 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011155 int kind1, kind2, kind;
11156 void *buf1, *buf2;
11157 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011158
Jesus Ceaac451502011-04-20 17:09:23 +020011159 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
11160 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011161 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011162
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011163 kind1 = PyUnicode_KIND(self);
11164 kind2 = PyUnicode_KIND(substring);
11165 kind = kind1 > kind2 ? kind1 : kind2;
11166 buf1 = PyUnicode_DATA(self);
11167 buf2 = PyUnicode_DATA(substring);
11168 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010011169 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011170 if (!buf1) {
11171 Py_DECREF(substring);
11172 return NULL;
11173 }
11174 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010011175 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011176 if (!buf2) {
11177 Py_DECREF(substring);
11178 if (kind1 != kind) PyMem_Free(buf1);
11179 return NULL;
11180 }
11181 len1 = PyUnicode_GET_LENGTH(self);
11182 len2 = PyUnicode_GET_LENGTH(substring);
11183
11184 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060011185 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011186 case PyUnicode_1BYTE_KIND:
11187 iresult = ucs1lib_count(
11188 ((Py_UCS1*)buf1) + start, end - start,
11189 buf2, len2, PY_SSIZE_T_MAX
11190 );
11191 break;
11192 case PyUnicode_2BYTE_KIND:
11193 iresult = ucs2lib_count(
11194 ((Py_UCS2*)buf1) + start, end - start,
11195 buf2, len2, PY_SSIZE_T_MAX
11196 );
11197 break;
11198 case PyUnicode_4BYTE_KIND:
11199 iresult = ucs4lib_count(
11200 ((Py_UCS4*)buf1) + start, end - start,
11201 buf2, len2, PY_SSIZE_T_MAX
11202 );
11203 break;
11204 default:
11205 assert(0); iresult = 0;
11206 }
11207
11208 result = PyLong_FromSsize_t(iresult);
11209
11210 if (kind1 != kind)
11211 PyMem_Free(buf1);
11212 if (kind2 != kind)
11213 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011214
11215 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011216
Guido van Rossumd57fd912000-03-10 22:53:23 +000011217 return result;
11218}
11219
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011220PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011221 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011222\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011223Encode S using the codec registered for encoding. Default encoding\n\
11224is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011225handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011226a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11227'xmlcharrefreplace' as well as any other name registered with\n\
11228codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011229
11230static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011231unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011232{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011233 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011234 char *encoding = NULL;
11235 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011236
Benjamin Peterson308d6372009-09-18 21:42:35 +000011237 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11238 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011239 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011240 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011241}
11242
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011243PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011244 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011245\n\
11246Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011247If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011248
11249static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011250unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011251{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011252 Py_ssize_t i, j, line_pos, src_len, incr;
11253 Py_UCS4 ch;
11254 PyObject *u;
11255 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011256 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011257 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011258 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011259
11260 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011261 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011262
Antoine Pitrou22425222011-10-04 19:10:51 +020011263 if (PyUnicode_READY(self) == -1)
11264 return NULL;
11265
Thomas Wouters7e474022000-07-16 12:04:32 +000011266 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011267 src_len = PyUnicode_GET_LENGTH(self);
11268 i = j = line_pos = 0;
11269 kind = PyUnicode_KIND(self);
11270 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011271 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011272 for (; i < src_len; i++) {
11273 ch = PyUnicode_READ(kind, src_data, i);
11274 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011275 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011276 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011277 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011278 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011279 goto overflow;
11280 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011281 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011282 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011283 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011284 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011285 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011286 goto overflow;
11287 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011288 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011289 if (ch == '\n' || ch == '\r')
11290 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011291 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011292 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011293 if (!found)
11294 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011295
Guido van Rossumd57fd912000-03-10 22:53:23 +000011296 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011297 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011298 if (!u)
11299 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011300 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011301
Antoine Pitroue71d5742011-10-04 15:55:09 +020011302 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011303
Antoine Pitroue71d5742011-10-04 15:55:09 +020011304 for (; i < src_len; i++) {
11305 ch = PyUnicode_READ(kind, src_data, i);
11306 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011307 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011308 incr = tabsize - (line_pos % tabsize);
11309 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011310 FILL(kind, dest_data, ' ', j, incr);
11311 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011312 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011313 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011314 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011315 line_pos++;
11316 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011317 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011318 if (ch == '\n' || ch == '\r')
11319 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011320 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011321 }
11322 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011323 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011324
Antoine Pitroue71d5742011-10-04 15:55:09 +020011325 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011326 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11327 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011328}
11329
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011330PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011331 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011332\n\
11333Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011334such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011335arguments start and end are interpreted as in slice notation.\n\
11336\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011337Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011338
11339static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011340unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011341{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011342 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011343 Py_ssize_t start;
11344 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011345 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011346
Jesus Ceaac451502011-04-20 17:09:23 +020011347 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11348 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011349 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011350
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011351 if (PyUnicode_READY(self) == -1)
11352 return NULL;
11353 if (PyUnicode_READY(substring) == -1)
11354 return NULL;
11355
Victor Stinner7931d9a2011-11-04 00:22:48 +010011356 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011357
11358 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011359
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011360 if (result == -2)
11361 return NULL;
11362
Christian Heimes217cfd12007-12-02 14:31:20 +000011363 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011364}
11365
11366static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011367unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011368{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011369 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
11370 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011371 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011372 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011373}
11374
Guido van Rossumc2504932007-09-18 19:42:40 +000011375/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011376 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011377static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011378unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011379{
Guido van Rossumc2504932007-09-18 19:42:40 +000011380 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010011381 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011382
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011383#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011384 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011385#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011386 if (_PyUnicode_HASH(self) != -1)
11387 return _PyUnicode_HASH(self);
11388 if (PyUnicode_READY(self) == -1)
11389 return -1;
11390 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011391 /*
11392 We make the hash of the empty string be 0, rather than using
11393 (prefix ^ suffix), since this slightly obfuscates the hash secret
11394 */
11395 if (len == 0) {
11396 _PyUnicode_HASH(self) = 0;
11397 return 0;
11398 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011399
11400 /* The hash function as a macro, gets expanded three times below. */
Georg Brandl2fb477c2012-02-21 00:33:36 +010011401#define HASH(P) \
11402 x ^= (Py_uhash_t) *P << 7; \
11403 while (--len >= 0) \
11404 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011405
Georg Brandl2fb477c2012-02-21 00:33:36 +010011406 x = (Py_uhash_t) _Py_HashSecret.prefix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011407 switch (PyUnicode_KIND(self)) {
11408 case PyUnicode_1BYTE_KIND: {
11409 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11410 HASH(c);
11411 break;
11412 }
11413 case PyUnicode_2BYTE_KIND: {
11414 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11415 HASH(s);
11416 break;
11417 }
11418 default: {
11419 Py_UCS4 *l;
11420 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11421 "Impossible switch case in unicode_hash");
11422 l = PyUnicode_4BYTE_DATA(self);
11423 HASH(l);
11424 break;
11425 }
11426 }
Georg Brandl2fb477c2012-02-21 00:33:36 +010011427 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
11428 x ^= (Py_uhash_t) _Py_HashSecret.suffix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011429
Guido van Rossumc2504932007-09-18 19:42:40 +000011430 if (x == -1)
11431 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011432 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011433 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011434}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011435#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011436
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011437PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011438 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011439\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011440Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011441
11442static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011443unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011444{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011445 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011446 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011447 Py_ssize_t start;
11448 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011449
Jesus Ceaac451502011-04-20 17:09:23 +020011450 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11451 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011452 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011453
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011454 if (PyUnicode_READY(self) == -1)
11455 return NULL;
11456 if (PyUnicode_READY(substring) == -1)
11457 return NULL;
11458
Victor Stinner7931d9a2011-11-04 00:22:48 +010011459 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011460
11461 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011462
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011463 if (result == -2)
11464 return NULL;
11465
Guido van Rossumd57fd912000-03-10 22:53:23 +000011466 if (result < 0) {
11467 PyErr_SetString(PyExc_ValueError, "substring not found");
11468 return NULL;
11469 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011470
Christian Heimes217cfd12007-12-02 14:31:20 +000011471 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011472}
11473
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011474PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011475 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011476\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011477Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011478at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011479
11480static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011481unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011482{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011483 Py_ssize_t i, length;
11484 int kind;
11485 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011486 int cased;
11487
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011488 if (PyUnicode_READY(self) == -1)
11489 return NULL;
11490 length = PyUnicode_GET_LENGTH(self);
11491 kind = PyUnicode_KIND(self);
11492 data = PyUnicode_DATA(self);
11493
Guido van Rossumd57fd912000-03-10 22:53:23 +000011494 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011495 if (length == 1)
11496 return PyBool_FromLong(
11497 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011498
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011499 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011500 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011501 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011502
Guido van Rossumd57fd912000-03-10 22:53:23 +000011503 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011504 for (i = 0; i < length; i++) {
11505 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011506
Benjamin Peterson29060642009-01-31 22:14:21 +000011507 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11508 return PyBool_FromLong(0);
11509 else if (!cased && Py_UNICODE_ISLOWER(ch))
11510 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011511 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011512 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011513}
11514
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011515PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011516 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011517\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011518Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011519at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011520
11521static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011522unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011523{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011524 Py_ssize_t i, length;
11525 int kind;
11526 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011527 int cased;
11528
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011529 if (PyUnicode_READY(self) == -1)
11530 return NULL;
11531 length = PyUnicode_GET_LENGTH(self);
11532 kind = PyUnicode_KIND(self);
11533 data = PyUnicode_DATA(self);
11534
Guido van Rossumd57fd912000-03-10 22:53:23 +000011535 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011536 if (length == 1)
11537 return PyBool_FromLong(
11538 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011539
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011540 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011541 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011542 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011543
Guido van Rossumd57fd912000-03-10 22:53:23 +000011544 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011545 for (i = 0; i < length; i++) {
11546 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011547
Benjamin Peterson29060642009-01-31 22:14:21 +000011548 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11549 return PyBool_FromLong(0);
11550 else if (!cased && Py_UNICODE_ISUPPER(ch))
11551 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011552 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011553 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011554}
11555
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011556PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011557 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011558\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011559Return True if S is a titlecased string and there is at least one\n\
11560character in S, i.e. upper- and titlecase characters may only\n\
11561follow uncased characters and lowercase characters only cased ones.\n\
11562Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011563
11564static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011565unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011566{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011567 Py_ssize_t i, length;
11568 int kind;
11569 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011570 int cased, previous_is_cased;
11571
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011572 if (PyUnicode_READY(self) == -1)
11573 return NULL;
11574 length = PyUnicode_GET_LENGTH(self);
11575 kind = PyUnicode_KIND(self);
11576 data = PyUnicode_DATA(self);
11577
Guido van Rossumd57fd912000-03-10 22:53:23 +000011578 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011579 if (length == 1) {
11580 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11581 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11582 (Py_UNICODE_ISUPPER(ch) != 0));
11583 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011584
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011585 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011586 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011587 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011588
Guido van Rossumd57fd912000-03-10 22:53:23 +000011589 cased = 0;
11590 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011591 for (i = 0; i < length; i++) {
11592 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011593
Benjamin Peterson29060642009-01-31 22:14:21 +000011594 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11595 if (previous_is_cased)
11596 return PyBool_FromLong(0);
11597 previous_is_cased = 1;
11598 cased = 1;
11599 }
11600 else if (Py_UNICODE_ISLOWER(ch)) {
11601 if (!previous_is_cased)
11602 return PyBool_FromLong(0);
11603 previous_is_cased = 1;
11604 cased = 1;
11605 }
11606 else
11607 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011608 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011609 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011610}
11611
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011612PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011613 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011614\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011615Return True if all characters in S are whitespace\n\
11616and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011617
11618static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011619unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011620{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011621 Py_ssize_t i, length;
11622 int kind;
11623 void *data;
11624
11625 if (PyUnicode_READY(self) == -1)
11626 return NULL;
11627 length = PyUnicode_GET_LENGTH(self);
11628 kind = PyUnicode_KIND(self);
11629 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011630
Guido van Rossumd57fd912000-03-10 22:53:23 +000011631 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011632 if (length == 1)
11633 return PyBool_FromLong(
11634 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011635
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011636 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011637 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011638 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011639
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011640 for (i = 0; i < length; i++) {
11641 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011642 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011643 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011644 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011645 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011646}
11647
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011648PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011649 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011650\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011651Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011652and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011653
11654static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011655unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011656{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011657 Py_ssize_t i, length;
11658 int kind;
11659 void *data;
11660
11661 if (PyUnicode_READY(self) == -1)
11662 return NULL;
11663 length = PyUnicode_GET_LENGTH(self);
11664 kind = PyUnicode_KIND(self);
11665 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011666
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011667 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011668 if (length == 1)
11669 return PyBool_FromLong(
11670 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011671
11672 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011673 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011674 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011675
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011676 for (i = 0; i < length; i++) {
11677 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011678 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011679 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011680 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011681}
11682
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011683PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011684 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011685\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011686Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011687and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011688
11689static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011690unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011691{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011692 int kind;
11693 void *data;
11694 Py_ssize_t len, i;
11695
11696 if (PyUnicode_READY(self) == -1)
11697 return NULL;
11698
11699 kind = PyUnicode_KIND(self);
11700 data = PyUnicode_DATA(self);
11701 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011702
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011703 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011704 if (len == 1) {
11705 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11706 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11707 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011708
11709 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011710 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011711 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011712
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011713 for (i = 0; i < len; i++) {
11714 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011715 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011716 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011717 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011718 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011719}
11720
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011721PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011722 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011723\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011724Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011725False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011726
11727static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011728unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011729{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011730 Py_ssize_t i, length;
11731 int kind;
11732 void *data;
11733
11734 if (PyUnicode_READY(self) == -1)
11735 return NULL;
11736 length = PyUnicode_GET_LENGTH(self);
11737 kind = PyUnicode_KIND(self);
11738 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011739
Guido van Rossumd57fd912000-03-10 22:53:23 +000011740 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011741 if (length == 1)
11742 return PyBool_FromLong(
11743 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011744
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011745 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011746 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011747 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011748
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011749 for (i = 0; i < length; i++) {
11750 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011751 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011752 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011753 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011754}
11755
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011756PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011757 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011758\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011759Return True if all characters in S are digits\n\
11760and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011761
11762static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011763unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011764{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011765 Py_ssize_t i, length;
11766 int kind;
11767 void *data;
11768
11769 if (PyUnicode_READY(self) == -1)
11770 return NULL;
11771 length = PyUnicode_GET_LENGTH(self);
11772 kind = PyUnicode_KIND(self);
11773 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011774
Guido van Rossumd57fd912000-03-10 22:53:23 +000011775 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011776 if (length == 1) {
11777 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11778 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11779 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011780
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011781 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011782 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011783 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011784
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011785 for (i = 0; i < length; i++) {
11786 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011787 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011788 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011789 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011790}
11791
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011792PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011793 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011794\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011795Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011796False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011797
11798static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011799unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011800{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011801 Py_ssize_t i, length;
11802 int kind;
11803 void *data;
11804
11805 if (PyUnicode_READY(self) == -1)
11806 return NULL;
11807 length = PyUnicode_GET_LENGTH(self);
11808 kind = PyUnicode_KIND(self);
11809 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011810
Guido van Rossumd57fd912000-03-10 22:53:23 +000011811 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011812 if (length == 1)
11813 return PyBool_FromLong(
11814 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011815
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011816 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011817 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011818 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011819
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011820 for (i = 0; i < length; i++) {
11821 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011822 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011823 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011824 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011825}
11826
Martin v. Löwis47383402007-08-15 07:32:56 +000011827int
11828PyUnicode_IsIdentifier(PyObject *self)
11829{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011830 int kind;
11831 void *data;
11832 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011833 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011834
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011835 if (PyUnicode_READY(self) == -1) {
11836 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011837 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011838 }
11839
11840 /* Special case for empty strings */
11841 if (PyUnicode_GET_LENGTH(self) == 0)
11842 return 0;
11843 kind = PyUnicode_KIND(self);
11844 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011845
11846 /* PEP 3131 says that the first character must be in
11847 XID_Start and subsequent characters in XID_Continue,
11848 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011849 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011850 letters, digits, underscore). However, given the current
11851 definition of XID_Start and XID_Continue, it is sufficient
11852 to check just for these, except that _ must be allowed
11853 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011854 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011855 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011856 return 0;
11857
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011858 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011859 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011860 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011861 return 1;
11862}
11863
11864PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011865 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011866\n\
11867Return True if S is a valid identifier according\n\
11868to the language definition.");
11869
11870static PyObject*
11871unicode_isidentifier(PyObject *self)
11872{
11873 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11874}
11875
Georg Brandl559e5d72008-06-11 18:37:52 +000011876PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011877 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011878\n\
11879Return True if all characters in S are considered\n\
11880printable in repr() or S is empty, False otherwise.");
11881
11882static PyObject*
11883unicode_isprintable(PyObject *self)
11884{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011885 Py_ssize_t i, length;
11886 int kind;
11887 void *data;
11888
11889 if (PyUnicode_READY(self) == -1)
11890 return NULL;
11891 length = PyUnicode_GET_LENGTH(self);
11892 kind = PyUnicode_KIND(self);
11893 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011894
11895 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011896 if (length == 1)
11897 return PyBool_FromLong(
11898 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011899
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011900 for (i = 0; i < length; i++) {
11901 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011902 Py_RETURN_FALSE;
11903 }
11904 }
11905 Py_RETURN_TRUE;
11906}
11907
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011908PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011909 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011910\n\
11911Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011912iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011913
11914static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011915unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011916{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011917 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011918}
11919
Martin v. Löwis18e16552006-02-15 17:27:45 +000011920static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011921unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011922{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011923 if (PyUnicode_READY(self) == -1)
11924 return -1;
11925 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011926}
11927
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011928PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011929 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011930\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011931Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011932done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011933
11934static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011935unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011936{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011937 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011938 Py_UCS4 fillchar = ' ';
11939
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011940 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011941 return NULL;
11942
Benjamin Petersonbac79492012-01-14 13:34:47 -050011943 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011944 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011945
Victor Stinnerc4b49542011-12-11 22:44:26 +010011946 if (PyUnicode_GET_LENGTH(self) >= width)
11947 return unicode_result_unchanged(self);
11948
11949 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011950}
11951
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011952PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011953 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011954\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011955Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011956
11957static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011958unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011959{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011960 if (PyUnicode_READY(self) == -1)
11961 return NULL;
11962 if (PyUnicode_IS_ASCII(self))
11963 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011964 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011965}
11966
/* Strip modes.  The values double as indexes into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by strip mode. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: its format string without the leading
   three characters ("|O:"). */
#define STRIPNAME(i) (stripformat[i]+3)
11975
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011976/* externally visible for str.strip(unicode) */
11977PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011978_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011979{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011980 void *data;
11981 int kind;
11982 Py_ssize_t i, j, len;
11983 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011984
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011985 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11986 return NULL;
11987
11988 kind = PyUnicode_KIND(self);
11989 data = PyUnicode_DATA(self);
11990 len = PyUnicode_GET_LENGTH(self);
11991 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11992 PyUnicode_DATA(sepobj),
11993 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011994
Benjamin Peterson14339b62009-01-31 16:36:08 +000011995 i = 0;
11996 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011997 while (i < len &&
11998 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011999 i++;
12000 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012001 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012002
Benjamin Peterson14339b62009-01-31 16:36:08 +000012003 j = len;
12004 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012005 do {
12006 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012007 } while (j >= i &&
12008 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000012009 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012010 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012011
Victor Stinner7931d9a2011-11-04 00:22:48 +010012012 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012013}
12014
12015PyObject*
12016PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12017{
12018 unsigned char *data;
12019 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012020 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012021
Victor Stinnerde636f32011-10-01 03:55:54 +020012022 if (PyUnicode_READY(self) == -1)
12023 return NULL;
12024
12025 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
12026
Victor Stinner12bab6d2011-10-01 01:53:49 +020012027 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Victor Stinnerc4b49542011-12-11 22:44:26 +010012028 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012029
Victor Stinner12bab6d2011-10-01 01:53:49 +020012030 length = end - start;
12031 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012032 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012033
Victor Stinnerde636f32011-10-01 03:55:54 +020012034 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012035 PyErr_SetString(PyExc_IndexError, "string index out of range");
12036 return NULL;
12037 }
12038
Victor Stinnerb9275c12011-10-05 14:01:42 +020012039 if (PyUnicode_IS_ASCII(self)) {
12040 kind = PyUnicode_KIND(self);
12041 data = PyUnicode_1BYTE_DATA(self);
12042 return unicode_fromascii(data + start, length);
12043 }
12044 else {
12045 kind = PyUnicode_KIND(self);
12046 data = PyUnicode_1BYTE_DATA(self);
12047 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012048 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012049 length);
12050 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012051}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012052
12053static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012054do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012055{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012056 int kind;
12057 void *data;
12058 Py_ssize_t len, i, j;
12059
12060 if (PyUnicode_READY(self) == -1)
12061 return NULL;
12062
12063 kind = PyUnicode_KIND(self);
12064 data = PyUnicode_DATA(self);
12065 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012066
Benjamin Peterson14339b62009-01-31 16:36:08 +000012067 i = 0;
12068 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012069 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012070 i++;
12071 }
12072 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012073
Benjamin Peterson14339b62009-01-31 16:36:08 +000012074 j = len;
12075 if (striptype != LEFTSTRIP) {
12076 do {
12077 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012078 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012079 j++;
12080 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012081
Victor Stinner7931d9a2011-11-04 00:22:48 +010012082 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012083}
12084
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012085
12086static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012087do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012088{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012089 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012090
Benjamin Peterson14339b62009-01-31 16:36:08 +000012091 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
12092 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012093
Benjamin Peterson14339b62009-01-31 16:36:08 +000012094 if (sep != NULL && sep != Py_None) {
12095 if (PyUnicode_Check(sep))
12096 return _PyUnicode_XStrip(self, striptype, sep);
12097 else {
12098 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012099 "%s arg must be None or str",
12100 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012101 return NULL;
12102 }
12103 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012104
Benjamin Peterson14339b62009-01-31 16:36:08 +000012105 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012106}
12107
12108
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012109PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012110 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012111\n\
12112Return a copy of the string S with leading and trailing\n\
12113whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012114If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012115
12116static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012117unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012118{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012119 if (PyTuple_GET_SIZE(args) == 0)
12120 return do_strip(self, BOTHSTRIP); /* Common case */
12121 else
12122 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012123}
12124
12125
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012126PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012127 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012128\n\
12129Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012130If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012131
12132static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012133unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012134{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012135 if (PyTuple_GET_SIZE(args) == 0)
12136 return do_strip(self, LEFTSTRIP); /* Common case */
12137 else
12138 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012139}
12140
12141
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012142PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012143 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012144\n\
12145Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012146If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012147
12148static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012149unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012150{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012151 if (PyTuple_GET_SIZE(args) == 0)
12152 return do_strip(self, RIGHTSTRIP); /* Common case */
12153 else
12154 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012155}
12156
12157
Guido van Rossumd57fd912000-03-10 22:53:23 +000012158static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012159unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012160{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012161 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012162 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012163
Georg Brandl222de0f2009-04-12 12:01:50 +000012164 if (len < 1) {
12165 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020012166 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000012167 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012168
Victor Stinnerc4b49542011-12-11 22:44:26 +010012169 /* no repeat, return original string */
12170 if (len == 1)
12171 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012172
Benjamin Petersonbac79492012-01-14 13:34:47 -050012173 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012174 return NULL;
12175
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012176 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012177 PyErr_SetString(PyExc_OverflowError,
12178 "repeated string is too long");
12179 return NULL;
12180 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012181 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012182
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012183 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012184 if (!u)
12185 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012186 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012187
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012188 if (PyUnicode_GET_LENGTH(str) == 1) {
12189 const int kind = PyUnicode_KIND(str);
12190 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012191 if (kind == PyUnicode_1BYTE_KIND) {
12192 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012193 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012194 }
12195 else if (kind == PyUnicode_2BYTE_KIND) {
12196 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012197 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012198 ucs2[n] = fill_char;
12199 } else {
12200 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12201 assert(kind == PyUnicode_4BYTE_KIND);
12202 for (n = 0; n < len; ++n)
12203 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012204 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012205 }
12206 else {
12207 /* number of characters copied this far */
12208 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012209 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012210 char *to = (char *) PyUnicode_DATA(u);
12211 Py_MEMCPY(to, PyUnicode_DATA(str),
12212 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012213 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012214 n = (done <= nchars-done) ? done : nchars-done;
12215 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012216 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012217 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012218 }
12219
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012220 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012221 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012222}
12223
Alexander Belopolsky40018472011-02-26 01:02:56 +000012224PyObject *
12225PyUnicode_Replace(PyObject *obj,
12226 PyObject *subobj,
12227 PyObject *replobj,
12228 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012229{
12230 PyObject *self;
12231 PyObject *str1;
12232 PyObject *str2;
12233 PyObject *result;
12234
12235 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012236 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012237 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012238 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012239 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012240 Py_DECREF(self);
12241 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012242 }
12243 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012244 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012245 Py_DECREF(self);
12246 Py_DECREF(str1);
12247 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012248 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012249 if (PyUnicode_READY(self) == -1 ||
12250 PyUnicode_READY(str1) == -1 ||
12251 PyUnicode_READY(str2) == -1)
12252 result = NULL;
12253 else
12254 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012255 Py_DECREF(self);
12256 Py_DECREF(str1);
12257 Py_DECREF(str2);
12258 return result;
12259}
12260
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012261PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012262 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012263\n\
12264Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012265old replaced by new. If the optional argument count is\n\
12266given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012267
12268static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012269unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012270{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012271 PyObject *str1;
12272 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012273 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012274 PyObject *result;
12275
Martin v. Löwis18e16552006-02-15 17:27:45 +000012276 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012277 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012278 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012279 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012280 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012281 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012282 return NULL;
12283 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012284 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012285 Py_DECREF(str1);
12286 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012287 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012288 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12289 result = NULL;
12290 else
12291 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012292
12293 Py_DECREF(str1);
12294 Py_DECREF(str2);
12295 return result;
12296}
12297
Alexander Belopolsky40018472011-02-26 01:02:56 +000012298static PyObject *
12299unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012300{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012301 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012302 Py_ssize_t isize;
12303 Py_ssize_t osize, squote, dquote, i, o;
12304 Py_UCS4 max, quote;
12305 int ikind, okind;
12306 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012307
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012308 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012309 return NULL;
12310
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012311 isize = PyUnicode_GET_LENGTH(unicode);
12312 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012313
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012314 /* Compute length of output, quote characters, and
12315 maximum character */
12316 osize = 2; /* quotes */
12317 max = 127;
12318 squote = dquote = 0;
12319 ikind = PyUnicode_KIND(unicode);
12320 for (i = 0; i < isize; i++) {
12321 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12322 switch (ch) {
12323 case '\'': squote++; osize++; break;
12324 case '"': dquote++; osize++; break;
12325 case '\\': case '\t': case '\r': case '\n':
12326 osize += 2; break;
12327 default:
12328 /* Fast-path ASCII */
12329 if (ch < ' ' || ch == 0x7f)
12330 osize += 4; /* \xHH */
12331 else if (ch < 0x7f)
12332 osize++;
12333 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12334 osize++;
12335 max = ch > max ? ch : max;
12336 }
12337 else if (ch < 0x100)
12338 osize += 4; /* \xHH */
12339 else if (ch < 0x10000)
12340 osize += 6; /* \uHHHH */
12341 else
12342 osize += 10; /* \uHHHHHHHH */
12343 }
12344 }
12345
12346 quote = '\'';
12347 if (squote) {
12348 if (dquote)
12349 /* Both squote and dquote present. Use squote,
12350 and escape them */
12351 osize += squote;
12352 else
12353 quote = '"';
12354 }
12355
12356 repr = PyUnicode_New(osize, max);
12357 if (repr == NULL)
12358 return NULL;
12359 okind = PyUnicode_KIND(repr);
12360 odata = PyUnicode_DATA(repr);
12361
12362 PyUnicode_WRITE(okind, odata, 0, quote);
12363 PyUnicode_WRITE(okind, odata, osize-1, quote);
12364
12365 for (i = 0, o = 1; i < isize; i++) {
12366 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012367
12368 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012369 if ((ch == quote) || (ch == '\\')) {
12370 PyUnicode_WRITE(okind, odata, o++, '\\');
12371 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012372 continue;
12373 }
12374
Benjamin Peterson29060642009-01-31 22:14:21 +000012375 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012376 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012377 PyUnicode_WRITE(okind, odata, o++, '\\');
12378 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012379 }
12380 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012381 PyUnicode_WRITE(okind, odata, o++, '\\');
12382 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012383 }
12384 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012385 PyUnicode_WRITE(okind, odata, o++, '\\');
12386 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012387 }
12388
12389 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012390 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012391 PyUnicode_WRITE(okind, odata, o++, '\\');
12392 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012393 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12394 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012395 }
12396
Georg Brandl559e5d72008-06-11 18:37:52 +000012397 /* Copy ASCII characters as-is */
12398 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012399 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012400 }
12401
Benjamin Peterson29060642009-01-31 22:14:21 +000012402 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012403 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012404 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012405 (categories Z* and C* except ASCII space)
12406 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012407 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012408 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012409 if (ch <= 0xff) {
12410 PyUnicode_WRITE(okind, odata, o++, '\\');
12411 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012412 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12413 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012414 }
12415 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012416 else if (ch >= 0x10000) {
12417 PyUnicode_WRITE(okind, odata, o++, '\\');
12418 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012419 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12420 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12421 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12422 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12423 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12424 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12425 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12426 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012427 }
12428 /* Map 16-bit characters to '\uxxxx' */
12429 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012430 PyUnicode_WRITE(okind, odata, o++, '\\');
12431 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012432 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12433 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12434 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12435 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012436 }
12437 }
12438 /* Copy characters as-is */
12439 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012440 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012441 }
12442 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012443 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012444 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012445 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012446 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012447}
12448
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012449PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012450 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012451\n\
12452Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012453such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012454arguments start and end are interpreted as in slice notation.\n\
12455\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012456Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012457
12458static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012459unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012460{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012461 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012462 Py_ssize_t start;
12463 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012464 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012465
Jesus Ceaac451502011-04-20 17:09:23 +020012466 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12467 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012468 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012469
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012470 if (PyUnicode_READY(self) == -1)
12471 return NULL;
12472 if (PyUnicode_READY(substring) == -1)
12473 return NULL;
12474
Victor Stinner7931d9a2011-11-04 00:22:48 +010012475 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012476
12477 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012478
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012479 if (result == -2)
12480 return NULL;
12481
Christian Heimes217cfd12007-12-02 14:31:20 +000012482 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012483}
12484
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012485PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012486 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012487\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012488Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012489
12490static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012491unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012492{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012493 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012494 Py_ssize_t start;
12495 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012496 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012497
Jesus Ceaac451502011-04-20 17:09:23 +020012498 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12499 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012500 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012501
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012502 if (PyUnicode_READY(self) == -1)
12503 return NULL;
12504 if (PyUnicode_READY(substring) == -1)
12505 return NULL;
12506
Victor Stinner7931d9a2011-11-04 00:22:48 +010012507 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012508
12509 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012510
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012511 if (result == -2)
12512 return NULL;
12513
Guido van Rossumd57fd912000-03-10 22:53:23 +000012514 if (result < 0) {
12515 PyErr_SetString(PyExc_ValueError, "substring not found");
12516 return NULL;
12517 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012518
Christian Heimes217cfd12007-12-02 14:31:20 +000012519 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012520}
12521
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012522PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012523 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012524\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012525Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012526done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012527
12528static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012529unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012530{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012531 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012532 Py_UCS4 fillchar = ' ';
12533
Victor Stinnere9a29352011-10-01 02:14:59 +020012534 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012535 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012536
Benjamin Petersonbac79492012-01-14 13:34:47 -050012537 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012538 return NULL;
12539
Victor Stinnerc4b49542011-12-11 22:44:26 +010012540 if (PyUnicode_GET_LENGTH(self) >= width)
12541 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012542
Victor Stinnerc4b49542011-12-11 22:44:26 +010012543 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012544}
12545
Alexander Belopolsky40018472011-02-26 01:02:56 +000012546PyObject *
12547PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012548{
12549 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012550
Guido van Rossumd57fd912000-03-10 22:53:23 +000012551 s = PyUnicode_FromObject(s);
12552 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012553 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012554 if (sep != NULL) {
12555 sep = PyUnicode_FromObject(sep);
12556 if (sep == NULL) {
12557 Py_DECREF(s);
12558 return NULL;
12559 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012560 }
12561
Victor Stinner9310abb2011-10-05 00:59:23 +020012562 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012563
12564 Py_DECREF(s);
12565 Py_XDECREF(sep);
12566 return result;
12567}
12568
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012569PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012570 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012571\n\
12572Return a list of the words in S, using sep as the\n\
12573delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012574splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012575whitespace string is a separator and empty strings are\n\
12576removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012577
12578static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012579unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012580{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012581 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012582 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012583 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012584
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012585 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12586 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012587 return NULL;
12588
12589 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012590 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012591 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012592 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012593 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012594 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012595}
12596
Thomas Wouters477c8d52006-05-27 19:21:47 +000012597PyObject *
12598PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12599{
12600 PyObject* str_obj;
12601 PyObject* sep_obj;
12602 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012603 int kind1, kind2, kind;
12604 void *buf1 = NULL, *buf2 = NULL;
12605 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012606
12607 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012608 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012609 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012610 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012611 if (!sep_obj) {
12612 Py_DECREF(str_obj);
12613 return NULL;
12614 }
12615 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12616 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012617 Py_DECREF(str_obj);
12618 return NULL;
12619 }
12620
Victor Stinner14f8f022011-10-05 20:58:25 +020012621 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012622 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012623 kind = Py_MAX(kind1, kind2);
12624 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012625 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012626 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012627 if (!buf1)
12628 goto onError;
12629 buf2 = PyUnicode_DATA(sep_obj);
12630 if (kind2 != kind)
12631 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12632 if (!buf2)
12633 goto onError;
12634 len1 = PyUnicode_GET_LENGTH(str_obj);
12635 len2 = PyUnicode_GET_LENGTH(sep_obj);
12636
Benjamin Petersonead6b532011-12-20 17:23:42 -060012637 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012638 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012639 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12640 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12641 else
12642 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012643 break;
12644 case PyUnicode_2BYTE_KIND:
12645 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12646 break;
12647 case PyUnicode_4BYTE_KIND:
12648 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12649 break;
12650 default:
12651 assert(0);
12652 out = 0;
12653 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012654
12655 Py_DECREF(sep_obj);
12656 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012657 if (kind1 != kind)
12658 PyMem_Free(buf1);
12659 if (kind2 != kind)
12660 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012661
12662 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012663 onError:
12664 Py_DECREF(sep_obj);
12665 Py_DECREF(str_obj);
12666 if (kind1 != kind && buf1)
12667 PyMem_Free(buf1);
12668 if (kind2 != kind && buf2)
12669 PyMem_Free(buf2);
12670 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012671}
12672
12673
12674PyObject *
12675PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12676{
12677 PyObject* str_obj;
12678 PyObject* sep_obj;
12679 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012680 int kind1, kind2, kind;
12681 void *buf1 = NULL, *buf2 = NULL;
12682 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012683
12684 str_obj = PyUnicode_FromObject(str_in);
12685 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012686 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012687 sep_obj = PyUnicode_FromObject(sep_in);
12688 if (!sep_obj) {
12689 Py_DECREF(str_obj);
12690 return NULL;
12691 }
12692
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012693 kind1 = PyUnicode_KIND(str_in);
12694 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012695 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012696 buf1 = PyUnicode_DATA(str_in);
12697 if (kind1 != kind)
12698 buf1 = _PyUnicode_AsKind(str_in, kind);
12699 if (!buf1)
12700 goto onError;
12701 buf2 = PyUnicode_DATA(sep_obj);
12702 if (kind2 != kind)
12703 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12704 if (!buf2)
12705 goto onError;
12706 len1 = PyUnicode_GET_LENGTH(str_obj);
12707 len2 = PyUnicode_GET_LENGTH(sep_obj);
12708
Benjamin Petersonead6b532011-12-20 17:23:42 -060012709 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012710 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012711 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12712 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12713 else
12714 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012715 break;
12716 case PyUnicode_2BYTE_KIND:
12717 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12718 break;
12719 case PyUnicode_4BYTE_KIND:
12720 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12721 break;
12722 default:
12723 assert(0);
12724 out = 0;
12725 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012726
12727 Py_DECREF(sep_obj);
12728 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012729 if (kind1 != kind)
12730 PyMem_Free(buf1);
12731 if (kind2 != kind)
12732 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012733
12734 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012735 onError:
12736 Py_DECREF(sep_obj);
12737 Py_DECREF(str_obj);
12738 if (kind1 != kind && buf1)
12739 PyMem_Free(buf1);
12740 if (kind2 != kind && buf2)
12741 PyMem_Free(buf2);
12742 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012743}
12744
12745PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012746 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012747\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012748Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012749the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012750found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012751
12752static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012753unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012754{
Victor Stinner9310abb2011-10-05 00:59:23 +020012755 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012756}
12757
12758PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012759 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012760\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012761Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012762the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012763separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012764
12765static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012766unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012767{
Victor Stinner9310abb2011-10-05 00:59:23 +020012768 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012769}
12770
Alexander Belopolsky40018472011-02-26 01:02:56 +000012771PyObject *
12772PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012773{
12774 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012775
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012776 s = PyUnicode_FromObject(s);
12777 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012778 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012779 if (sep != NULL) {
12780 sep = PyUnicode_FromObject(sep);
12781 if (sep == NULL) {
12782 Py_DECREF(s);
12783 return NULL;
12784 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012785 }
12786
Victor Stinner9310abb2011-10-05 00:59:23 +020012787 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012788
12789 Py_DECREF(s);
12790 Py_XDECREF(sep);
12791 return result;
12792}
12793
12794PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012795 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012796\n\
12797Return a list of the words in S, using sep as the\n\
12798delimiter string, starting at the end of the string and\n\
12799working to the front. If maxsplit is given, at most maxsplit\n\
12800splits are done. If sep is not specified, any whitespace string\n\
12801is a separator.");
12802
12803static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012804unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012805{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012806 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012807 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012808 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012809
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012810 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12811 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012812 return NULL;
12813
12814 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012815 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012816 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012817 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012818 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012819 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012820}
12821
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012822PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012823 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012824\n\
12825Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012826Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012827is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012828
12829static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012830unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012831{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012832 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012833 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012834
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012835 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12836 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012837 return NULL;
12838
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012839 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012840}
12841
12842static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012843PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012844{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012845 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012846}
12847
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012848PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012849 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012850\n\
12851Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012852and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012853
12854static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012855unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012856{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012857 if (PyUnicode_READY(self) == -1)
12858 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012859 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012860}
12861
Georg Brandlceee0772007-11-27 23:48:05 +000012862PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012863 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012864\n\
12865Return a translation table usable for str.translate().\n\
12866If there is only one argument, it must be a dictionary mapping Unicode\n\
12867ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012868Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012869If there are two arguments, they must be strings of equal length, and\n\
12870in the resulting dictionary, each character in x will be mapped to the\n\
12871character at the same position in y. If there is a third argument, it\n\
12872must be a string, whose characters will be mapped to None in the result.");
12873
12874static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012875unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012876{
12877 PyObject *x, *y = NULL, *z = NULL;
12878 PyObject *new = NULL, *key, *value;
12879 Py_ssize_t i = 0;
12880 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012881
Georg Brandlceee0772007-11-27 23:48:05 +000012882 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12883 return NULL;
12884 new = PyDict_New();
12885 if (!new)
12886 return NULL;
12887 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012888 int x_kind, y_kind, z_kind;
12889 void *x_data, *y_data, *z_data;
12890
Georg Brandlceee0772007-11-27 23:48:05 +000012891 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012892 if (!PyUnicode_Check(x)) {
12893 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12894 "be a string if there is a second argument");
12895 goto err;
12896 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012897 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012898 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12899 "arguments must have equal length");
12900 goto err;
12901 }
12902 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012903 x_kind = PyUnicode_KIND(x);
12904 y_kind = PyUnicode_KIND(y);
12905 x_data = PyUnicode_DATA(x);
12906 y_data = PyUnicode_DATA(y);
12907 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12908 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012909 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012910 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012911 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012912 if (!value) {
12913 Py_DECREF(key);
12914 goto err;
12915 }
Georg Brandlceee0772007-11-27 23:48:05 +000012916 res = PyDict_SetItem(new, key, value);
12917 Py_DECREF(key);
12918 Py_DECREF(value);
12919 if (res < 0)
12920 goto err;
12921 }
12922 /* create entries for deleting chars in z */
12923 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012924 z_kind = PyUnicode_KIND(z);
12925 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012926 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012927 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012928 if (!key)
12929 goto err;
12930 res = PyDict_SetItem(new, key, Py_None);
12931 Py_DECREF(key);
12932 if (res < 0)
12933 goto err;
12934 }
12935 }
12936 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012937 int kind;
12938 void *data;
12939
Georg Brandlceee0772007-11-27 23:48:05 +000012940 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012941 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012942 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12943 "to maketrans it must be a dict");
12944 goto err;
12945 }
12946 /* copy entries into the new dict, converting string keys to int keys */
12947 while (PyDict_Next(x, &i, &key, &value)) {
12948 if (PyUnicode_Check(key)) {
12949 /* convert string keys to integer keys */
12950 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012951 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012952 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12953 "table must be of length 1");
12954 goto err;
12955 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012956 kind = PyUnicode_KIND(key);
12957 data = PyUnicode_DATA(key);
12958 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012959 if (!newkey)
12960 goto err;
12961 res = PyDict_SetItem(new, newkey, value);
12962 Py_DECREF(newkey);
12963 if (res < 0)
12964 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012965 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012966 /* just keep integer keys */
12967 if (PyDict_SetItem(new, key, value) < 0)
12968 goto err;
12969 } else {
12970 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12971 "be strings or integers");
12972 goto err;
12973 }
12974 }
12975 }
12976 return new;
12977 err:
12978 Py_DECREF(new);
12979 return NULL;
12980}
12981
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012982PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012983 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012984\n\
12985Return a copy of the string S, where all characters have been mapped\n\
12986through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012987Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012988Unmapped characters are left untouched. Characters mapped to None\n\
12989are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012990
12991static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012992unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012993{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012994 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012995}
12996
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012997PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012998 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012999\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013000Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013001
13002static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013003unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013004{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013005 if (PyUnicode_READY(self) == -1)
13006 return NULL;
13007 if (PyUnicode_IS_ASCII(self))
13008 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013009 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013010}
13011
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013012PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013013 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013014\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000013015Pad a numeric string S with zeros on the left, to fill a field\n\
13016of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013017
13018static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020013019unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013020{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013021 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013022 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013023 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013024 int kind;
13025 void *data;
13026 Py_UCS4 chr;
13027
Martin v. Löwis18e16552006-02-15 17:27:45 +000013028 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013029 return NULL;
13030
Benjamin Petersonbac79492012-01-14 13:34:47 -050013031 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013032 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013033
Victor Stinnerc4b49542011-12-11 22:44:26 +010013034 if (PyUnicode_GET_LENGTH(self) >= width)
13035 return unicode_result_unchanged(self);
13036
13037 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013038
13039 u = pad(self, fill, 0, '0');
13040
Walter Dörwald068325e2002-04-15 13:36:47 +000013041 if (u == NULL)
13042 return NULL;
13043
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013044 kind = PyUnicode_KIND(u);
13045 data = PyUnicode_DATA(u);
13046 chr = PyUnicode_READ(kind, data, fill);
13047
13048 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013049 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013050 PyUnicode_WRITE(kind, data, 0, chr);
13051 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013052 }
13053
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013054 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013055 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013056}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013057
#if 0
/* Compiled out.  Forwards self to PyUnicode_TransformDecimalAndSpaceToASCII()
   and returns its result unchanged. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13065
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013066PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013067 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013068\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013069Return True if S starts with the specified prefix, False otherwise.\n\
13070With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013071With optional end, stop comparing S at that position.\n\
13072prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013073
13074static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013075unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013076 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013077{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013078 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013079 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013080 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013081 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013082 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013083
Jesus Ceaac451502011-04-20 17:09:23 +020013084 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013085 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013086 if (PyTuple_Check(subobj)) {
13087 Py_ssize_t i;
13088 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013089 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013090 if (substring == NULL)
13091 return NULL;
13092 result = tailmatch(self, substring, start, end, -1);
13093 Py_DECREF(substring);
13094 if (result) {
13095 Py_RETURN_TRUE;
13096 }
13097 }
13098 /* nothing matched */
13099 Py_RETURN_FALSE;
13100 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013101 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013102 if (substring == NULL) {
13103 if (PyErr_ExceptionMatches(PyExc_TypeError))
13104 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
13105 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013106 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013107 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013108 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013109 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013110 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013111}
13112
13113
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013114PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013115 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013116\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013117Return True if S ends with the specified suffix, False otherwise.\n\
13118With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013119With optional end, stop comparing S at that position.\n\
13120suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013121
13122static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013123unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013124 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013125{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013126 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013127 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013128 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013129 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013130 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013131
Jesus Ceaac451502011-04-20 17:09:23 +020013132 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013133 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013134 if (PyTuple_Check(subobj)) {
13135 Py_ssize_t i;
13136 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013137 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000013138 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013139 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013140 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013141 result = tailmatch(self, substring, start, end, +1);
13142 Py_DECREF(substring);
13143 if (result) {
13144 Py_RETURN_TRUE;
13145 }
13146 }
13147 Py_RETURN_FALSE;
13148 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013149 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013150 if (substring == NULL) {
13151 if (PyErr_ExceptionMatches(PyExc_TypeError))
13152 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
13153 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013154 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013155 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013156 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013157 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013158 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013159}
13160
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013161#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013162
13163PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013164 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013165\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013166Return a formatted version of S, using substitutions from args and kwargs.\n\
13167The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013168
Eric Smith27bbca62010-11-04 17:06:58 +000013169PyDoc_STRVAR(format_map__doc__,
13170 "S.format_map(mapping) -> str\n\
13171\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013172Return a formatted version of S, using substitutions from mapping.\n\
13173The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013174
Eric Smith4a7d76d2008-05-30 18:10:19 +000013175static PyObject *
13176unicode__format__(PyObject* self, PyObject* args)
13177{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013178 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013179
13180 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13181 return NULL;
13182
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013183 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013184 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013185 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013186}
13187
Eric Smith8c663262007-08-25 02:26:07 +000013188PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013189 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013190\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013191Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013192
13193static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013194unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013195{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013196 Py_ssize_t size;
13197
13198 /* If it's a compact object, account for base structure +
13199 character data. */
13200 if (PyUnicode_IS_COMPACT_ASCII(v))
13201 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13202 else if (PyUnicode_IS_COMPACT(v))
13203 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013204 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013205 else {
13206 /* If it is a two-block object, account for base object, and
13207 for character block if present. */
13208 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013209 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013210 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013211 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013212 }
13213 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013214 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013215 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013216 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013217 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013218 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013219
13220 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013221}
13222
13223PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013224 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013225
13226static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013227unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013228{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013229 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013230 if (!copy)
13231 return NULL;
13232 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013233}
13234
Guido van Rossumd57fd912000-03-10 22:53:23 +000013235static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013236 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013237 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013238 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13239 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013240 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13241 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013242 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013243 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13244 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13245 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
13246 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
13247 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013248 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013249 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13250 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13251 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013252 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013253 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13254 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13255 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013256 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013257 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010013258 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013259 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013260 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13261 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13262 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13263 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13264 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13265 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13266 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13267 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13268 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13269 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13270 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13271 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13272 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13273 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013274 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013275 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013276 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013277 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013278 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013279 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000013280 {"maketrans", (PyCFunction) unicode_maketrans,
13281 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013282 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013283#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013284 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013285 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013286#endif
13287
Benjamin Peterson14339b62009-01-31 16:36:08 +000013288 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013289 {NULL, NULL}
13290};
13291
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013292static PyObject *
13293unicode_mod(PyObject *v, PyObject *w)
13294{
Brian Curtindfc80e32011-08-10 20:28:54 -050013295 if (!PyUnicode_Check(v))
13296 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013297 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013298}
13299
13300static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013301 0, /*nb_add*/
13302 0, /*nb_subtract*/
13303 0, /*nb_multiply*/
13304 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013305};
13306
Guido van Rossumd57fd912000-03-10 22:53:23 +000013307static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013308 (lenfunc) unicode_length, /* sq_length */
13309 PyUnicode_Concat, /* sq_concat */
13310 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13311 (ssizeargfunc) unicode_getitem, /* sq_item */
13312 0, /* sq_slice */
13313 0, /* sq_ass_item */
13314 0, /* sq_ass_slice */
13315 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013316};
13317
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013318static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013319unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013320{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013321 if (PyUnicode_READY(self) == -1)
13322 return NULL;
13323
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013324 if (PyIndex_Check(item)) {
13325 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013326 if (i == -1 && PyErr_Occurred())
13327 return NULL;
13328 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013329 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013330 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013331 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013332 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013333 PyObject *result;
13334 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013335 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013336 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013337
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013338 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013339 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013340 return NULL;
13341 }
13342
13343 if (slicelength <= 0) {
Victor Stinner382955f2011-12-11 21:44:00 +010013344 Py_INCREF(unicode_empty);
13345 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013346 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013347 slicelength == PyUnicode_GET_LENGTH(self)) {
13348 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013349 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013350 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013351 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013352 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013353 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013354 src_kind = PyUnicode_KIND(self);
13355 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013356 if (!PyUnicode_IS_ASCII(self)) {
13357 kind_limit = kind_maxchar_limit(src_kind);
13358 max_char = 0;
13359 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13360 ch = PyUnicode_READ(src_kind, src_data, cur);
13361 if (ch > max_char) {
13362 max_char = ch;
13363 if (max_char >= kind_limit)
13364 break;
13365 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013366 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013367 }
Victor Stinner55c99112011-10-13 01:17:06 +020013368 else
13369 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013370 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013371 if (result == NULL)
13372 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013373 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013374 dest_data = PyUnicode_DATA(result);
13375
13376 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013377 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13378 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013379 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013380 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013381 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013382 } else {
13383 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13384 return NULL;
13385 }
13386}
13387
13388static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013389 (lenfunc)unicode_length, /* mp_length */
13390 (binaryfunc)unicode_subscript, /* mp_subscript */
13391 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013392};
13393
Guido van Rossumd57fd912000-03-10 22:53:23 +000013394
Guido van Rossumd57fd912000-03-10 22:53:23 +000013395/* Helpers for PyUnicode_Format() */
13396
13397static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000013398getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013399{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013400 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013401 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013402 (*p_argidx)++;
13403 if (arglen < 0)
13404 return args;
13405 else
13406 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013407 }
13408 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013409 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013410 return NULL;
13411}
13412
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013413/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013414
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013415static PyObject *
13416formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013417{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013418 char *p;
13419 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013420 double x;
Tim Petersced69f82003-09-16 20:30:58 +000013421
Guido van Rossumd57fd912000-03-10 22:53:23 +000013422 x = PyFloat_AsDouble(v);
13423 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013424 return NULL;
13425
Guido van Rossumd57fd912000-03-10 22:53:23 +000013426 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013427 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013428
Eric Smith0923d1d2009-04-16 20:16:10 +000013429 p = PyOS_double_to_string(x, type, prec,
13430 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013431 if (p == NULL)
13432 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013433 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000013434 PyMem_Free(p);
13435 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013436}
13437
Tim Peters38fd5b62000-09-21 05:43:11 +000013438static PyObject*
13439formatlong(PyObject *val, int flags, int prec, int type)
13440{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013441 char *buf;
13442 int len;
13443 PyObject *str; /* temporary string object. */
13444 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013445
Benjamin Peterson14339b62009-01-31 16:36:08 +000013446 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
13447 if (!str)
13448 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013449 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013450 Py_DECREF(str);
13451 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013452}
13453
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013454static Py_UCS4
13455formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013456{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013457 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013458 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013459 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013460 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013461 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013462 goto onError;
13463 }
13464 else {
13465 /* Integer input truncated to a character */
13466 long x;
13467 x = PyLong_AsLong(v);
13468 if (x == -1 && PyErr_Occurred())
13469 goto onError;
13470
Victor Stinner8faf8212011-12-08 22:14:11 +010013471 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013472 PyErr_SetString(PyExc_OverflowError,
13473 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013474 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013475 }
13476
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013477 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013478 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013479
Benjamin Peterson29060642009-01-31 22:14:21 +000013480 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013481 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013482 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013483 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013484}
13485
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013486static int
13487repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
13488{
13489 int r;
13490 assert(count > 0);
13491 assert(PyUnicode_Check(obj));
13492 if (count > 5) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013493 PyObject *repeated = unicode_repeat(obj, count);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013494 if (repeated == NULL)
13495 return -1;
13496 r = _PyAccu_Accumulate(acc, repeated);
13497 Py_DECREF(repeated);
13498 return r;
13499 }
13500 else {
13501 do {
13502 if (_PyAccu_Accumulate(acc, obj))
13503 return -1;
13504 } while (--count);
13505 return 0;
13506 }
13507}
13508
Alexander Belopolsky40018472011-02-26 01:02:56 +000013509PyObject *
13510PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013511{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013512 void *fmt;
13513 int fmtkind;
13514 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013515 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013516 int r;
13517 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013518 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013519 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013520 PyObject *temp = NULL;
13521 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013522 PyObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013523 _PyAccu acc;
13524 static PyObject *plus, *minus, *blank, *zero, *percent;
13525
13526 if (!plus && !(plus = get_latin1_char('+')))
13527 return NULL;
13528 if (!minus && !(minus = get_latin1_char('-')))
13529 return NULL;
13530 if (!blank && !(blank = get_latin1_char(' ')))
13531 return NULL;
13532 if (!zero && !(zero = get_latin1_char('0')))
13533 return NULL;
13534 if (!percent && !(percent = get_latin1_char('%')))
13535 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000013536
Guido van Rossumd57fd912000-03-10 22:53:23 +000013537 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013538 PyErr_BadInternalCall();
13539 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013540 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013541 uformat = PyUnicode_FromObject(format);
Benjamin Peterson22a29702012-01-02 09:00:30 -060013542 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013543 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060013544 if (PyUnicode_READY(uformat) == -1)
13545 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013546 if (_PyAccu_Init(&acc))
13547 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013548 fmt = PyUnicode_DATA(uformat);
13549 fmtkind = PyUnicode_KIND(uformat);
13550 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13551 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013552
Guido van Rossumd57fd912000-03-10 22:53:23 +000013553 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013554 arglen = PyTuple_Size(args);
13555 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013556 }
13557 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013558 arglen = -1;
13559 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013560 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013561 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013562 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013563 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013564
13565 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013566 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013567 PyObject *nonfmt;
13568 Py_ssize_t nonfmtpos;
13569 nonfmtpos = fmtpos++;
13570 while (fmtcnt >= 0 &&
13571 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13572 fmtpos++;
13573 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013574 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013575 nonfmt = PyUnicode_Substring(uformat, nonfmtpos, fmtpos);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013576 if (nonfmt == NULL)
13577 goto onError;
13578 r = _PyAccu_Accumulate(&acc, nonfmt);
13579 Py_DECREF(nonfmt);
13580 if (r)
13581 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013582 }
13583 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013584 /* Got a format specifier */
13585 int flags = 0;
13586 Py_ssize_t width = -1;
13587 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013588 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013589 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000013590 int isnumok;
13591 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013592 void *pbuf = NULL;
13593 Py_ssize_t pindex, len;
13594 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013595
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013596 fmtpos++;
13597 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
13598 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013599 Py_ssize_t keylen;
13600 PyObject *key;
13601 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013602
Benjamin Peterson29060642009-01-31 22:14:21 +000013603 if (dict == NULL) {
13604 PyErr_SetString(PyExc_TypeError,
13605 "format requires a mapping");
13606 goto onError;
13607 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013608 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013609 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013610 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013611 /* Skip over balanced parentheses */
13612 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013613 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013614 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013615 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013616 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013617 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013618 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013619 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013620 if (fmtcnt < 0 || pcount > 0) {
13621 PyErr_SetString(PyExc_ValueError,
13622 "incomplete format key");
13623 goto onError;
13624 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013625 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013626 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013627 if (key == NULL)
13628 goto onError;
13629 if (args_owned) {
13630 Py_DECREF(args);
13631 args_owned = 0;
13632 }
13633 args = PyObject_GetItem(dict, key);
13634 Py_DECREF(key);
13635 if (args == NULL) {
13636 goto onError;
13637 }
13638 args_owned = 1;
13639 arglen = -1;
13640 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013641 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013642 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013643 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013644 case '-': flags |= F_LJUST; continue;
13645 case '+': flags |= F_SIGN; continue;
13646 case ' ': flags |= F_BLANK; continue;
13647 case '#': flags |= F_ALT; continue;
13648 case '0': flags |= F_ZERO; continue;
13649 }
13650 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013651 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013652 if (c == '*') {
13653 v = getnextarg(args, arglen, &argidx);
13654 if (v == NULL)
13655 goto onError;
13656 if (!PyLong_Check(v)) {
13657 PyErr_SetString(PyExc_TypeError,
13658 "* wants int");
13659 goto onError;
13660 }
13661 width = PyLong_AsLong(v);
13662 if (width == -1 && PyErr_Occurred())
13663 goto onError;
13664 if (width < 0) {
13665 flags |= F_LJUST;
13666 width = -width;
13667 }
13668 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013669 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013670 }
13671 else if (c >= '0' && c <= '9') {
13672 width = c - '0';
13673 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013674 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013675 if (c < '0' || c > '9')
13676 break;
13677 if ((width*10) / 10 != width) {
13678 PyErr_SetString(PyExc_ValueError,
13679 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013680 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013681 }
13682 width = width*10 + (c - '0');
13683 }
13684 }
13685 if (c == '.') {
13686 prec = 0;
13687 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013688 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013689 if (c == '*') {
13690 v = getnextarg(args, arglen, &argidx);
13691 if (v == NULL)
13692 goto onError;
13693 if (!PyLong_Check(v)) {
13694 PyErr_SetString(PyExc_TypeError,
13695 "* wants int");
13696 goto onError;
13697 }
13698 prec = PyLong_AsLong(v);
13699 if (prec == -1 && PyErr_Occurred())
13700 goto onError;
13701 if (prec < 0)
13702 prec = 0;
13703 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013704 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013705 }
13706 else if (c >= '0' && c <= '9') {
13707 prec = c - '0';
13708 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013709 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013710 if (c < '0' || c > '9')
13711 break;
13712 if ((prec*10) / 10 != prec) {
13713 PyErr_SetString(PyExc_ValueError,
13714 "prec too big");
13715 goto onError;
13716 }
13717 prec = prec*10 + (c - '0');
13718 }
13719 }
13720 } /* prec */
13721 if (fmtcnt >= 0) {
13722 if (c == 'h' || c == 'l' || c == 'L') {
13723 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013724 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013725 }
13726 }
13727 if (fmtcnt < 0) {
13728 PyErr_SetString(PyExc_ValueError,
13729 "incomplete format");
13730 goto onError;
13731 }
13732 if (c != '%') {
13733 v = getnextarg(args, arglen, &argidx);
13734 if (v == NULL)
13735 goto onError;
13736 }
13737 sign = 0;
13738 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013739 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013740 switch (c) {
13741
13742 case '%':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013743 _PyAccu_Accumulate(&acc, percent);
13744 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013745
13746 case 's':
13747 case 'r':
13748 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013749 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013750 temp = v;
13751 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013752 }
13753 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013754 if (c == 's')
13755 temp = PyObject_Str(v);
13756 else if (c == 'r')
13757 temp = PyObject_Repr(v);
13758 else
13759 temp = PyObject_ASCII(v);
13760 if (temp == NULL)
13761 goto onError;
13762 if (PyUnicode_Check(temp))
13763 /* nothing to do */;
13764 else {
13765 Py_DECREF(temp);
13766 PyErr_SetString(PyExc_TypeError,
13767 "%s argument has non-string str()");
13768 goto onError;
13769 }
13770 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013771 if (PyUnicode_READY(temp) == -1) {
13772 Py_CLEAR(temp);
13773 goto onError;
13774 }
13775 pbuf = PyUnicode_DATA(temp);
13776 kind = PyUnicode_KIND(temp);
13777 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013778 if (prec >= 0 && len > prec)
13779 len = prec;
13780 break;
13781
13782 case 'i':
13783 case 'd':
13784 case 'u':
13785 case 'o':
13786 case 'x':
13787 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013788 isnumok = 0;
13789 if (PyNumber_Check(v)) {
13790 PyObject *iobj=NULL;
13791
13792 if (PyLong_Check(v)) {
13793 iobj = v;
13794 Py_INCREF(iobj);
13795 }
13796 else {
13797 iobj = PyNumber_Long(v);
13798 }
13799 if (iobj!=NULL) {
13800 if (PyLong_Check(iobj)) {
13801 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013802 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013803 Py_DECREF(iobj);
13804 if (!temp)
13805 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013806 if (PyUnicode_READY(temp) == -1) {
13807 Py_CLEAR(temp);
13808 goto onError;
13809 }
13810 pbuf = PyUnicode_DATA(temp);
13811 kind = PyUnicode_KIND(temp);
13812 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013813 sign = 1;
13814 }
13815 else {
13816 Py_DECREF(iobj);
13817 }
13818 }
13819 }
13820 if (!isnumok) {
13821 PyErr_Format(PyExc_TypeError,
13822 "%%%c format: a number is required, "
13823 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13824 goto onError;
13825 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013826 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013827 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013828 fillobj = zero;
13829 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013830 break;
13831
13832 case 'e':
13833 case 'E':
13834 case 'f':
13835 case 'F':
13836 case 'g':
13837 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013838 temp = formatfloat(v, flags, prec, c);
13839 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013840 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013841 if (PyUnicode_READY(temp) == -1) {
13842 Py_CLEAR(temp);
13843 goto onError;
13844 }
13845 pbuf = PyUnicode_DATA(temp);
13846 kind = PyUnicode_KIND(temp);
13847 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013848 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013849 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013850 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013851 fillobj = zero;
13852 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013853 break;
13854
13855 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013856 {
13857 Py_UCS4 ch = formatchar(v);
13858 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013859 goto onError;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013860 temp = _PyUnicode_FromUCS4(&ch, 1);
13861 if (temp == NULL)
13862 goto onError;
13863 pbuf = PyUnicode_DATA(temp);
13864 kind = PyUnicode_KIND(temp);
13865 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013866 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013867 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013868
13869 default:
13870 PyErr_Format(PyExc_ValueError,
13871 "unsupported format character '%c' (0x%x) "
13872 "at index %zd",
13873 (31<=c && c<=126) ? (char)c : '?',
13874 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013875 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013876 goto onError;
13877 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013878 /* pbuf is initialized here. */
13879 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013880 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013881 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13882 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013883 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013884 pindex++;
13885 }
13886 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13887 signobj = plus;
13888 len--;
13889 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013890 }
13891 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013892 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013893 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013894 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013895 else
13896 sign = 0;
13897 }
13898 if (width < len)
13899 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013900 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013901 if (fill != ' ') {
13902 assert(signobj != NULL);
13903 if (_PyAccu_Accumulate(&acc, signobj))
13904 goto onError;
13905 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013906 if (width > len)
13907 width--;
13908 }
13909 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013910 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013911 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013912 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013913 second = get_latin1_char(
13914 PyUnicode_READ(kind, pbuf, pindex + 1));
13915 pindex += 2;
13916 if (second == NULL ||
13917 _PyAccu_Accumulate(&acc, zero) ||
13918 _PyAccu_Accumulate(&acc, second))
13919 goto onError;
13920 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000013921 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013922 width -= 2;
13923 if (width < 0)
13924 width = 0;
13925 len -= 2;
13926 }
13927 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013928 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013929 if (repeat_accumulate(&acc, fillobj, width - len))
13930 goto onError;
13931 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013932 }
13933 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013934 if (sign) {
13935 assert(signobj != NULL);
13936 if (_PyAccu_Accumulate(&acc, signobj))
13937 goto onError;
13938 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013939 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013940 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13941 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013942 second = get_latin1_char(
13943 PyUnicode_READ(kind, pbuf, pindex + 1));
13944 pindex += 2;
13945 if (second == NULL ||
13946 _PyAccu_Accumulate(&acc, zero) ||
13947 _PyAccu_Accumulate(&acc, second))
13948 goto onError;
13949 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013950 }
13951 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013952 /* Copy all characters, preserving len */
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013953 if (temp != NULL) {
13954 assert(pbuf == PyUnicode_DATA(temp));
13955 v = PyUnicode_Substring(temp, pindex, pindex + len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013956 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013957 else {
13958 const char *p = (const char *) pbuf;
13959 assert(pbuf != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013960 p += kind * pindex;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013961 v = PyUnicode_FromKindAndData(kind, p, len);
13962 }
13963 if (v == NULL)
13964 goto onError;
13965 r = _PyAccu_Accumulate(&acc, v);
13966 Py_DECREF(v);
13967 if (r)
13968 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013969 if (width > len && repeat_accumulate(&acc, blank, width - len))
13970 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013971 if (dict && (argidx < arglen) && c != '%') {
13972 PyErr_SetString(PyExc_TypeError,
13973 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013974 goto onError;
13975 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013976 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013977 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013978 } /* until end */
13979 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013980 PyErr_SetString(PyExc_TypeError,
13981 "not all arguments converted during string formatting");
13982 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013983 }
13984
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013985 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013986 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013987 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013988 }
13989 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013990 Py_XDECREF(temp);
13991 Py_XDECREF(second);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013992 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013993
Benjamin Peterson29060642009-01-31 22:14:21 +000013994 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013995 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013996 Py_XDECREF(temp);
13997 Py_XDECREF(second);
13998 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013999 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014000 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014001 }
14002 return NULL;
14003}
14004
Jeremy Hylton938ace62002-07-17 16:30:39 +000014005static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014006unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14007
Tim Peters6d6c1a32001-08-02 04:15:00 +000014008static PyObject *
14009unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14010{
Benjamin Peterson29060642009-01-31 22:14:21 +000014011 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014012 static char *kwlist[] = {"object", "encoding", "errors", 0};
14013 char *encoding = NULL;
14014 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014015
Benjamin Peterson14339b62009-01-31 16:36:08 +000014016 if (type != &PyUnicode_Type)
14017 return unicode_subtype_new(type, args, kwds);
14018 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014019 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014020 return NULL;
Victor Stinner382955f2011-12-11 21:44:00 +010014021 if (x == NULL) {
14022 Py_INCREF(unicode_empty);
14023 return unicode_empty;
14024 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014025 if (encoding == NULL && errors == NULL)
14026 return PyObject_Str(x);
14027 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014028 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014029}
14030
Guido van Rossume023fe02001-08-30 03:12:59 +000014031static PyObject *
14032unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14033{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014034 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014035 Py_ssize_t length, char_size;
14036 int share_wstr, share_utf8;
14037 unsigned int kind;
14038 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014039
Benjamin Peterson14339b62009-01-31 16:36:08 +000014040 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014041
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014042 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014043 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014044 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014045 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014046 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014047 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014048 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014049 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014050
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014051 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014052 if (self == NULL) {
14053 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014054 return NULL;
14055 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014056 kind = PyUnicode_KIND(unicode);
14057 length = PyUnicode_GET_LENGTH(unicode);
14058
14059 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014060#ifdef Py_DEBUG
14061 _PyUnicode_HASH(self) = -1;
14062#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014063 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014064#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014065 _PyUnicode_STATE(self).interned = 0;
14066 _PyUnicode_STATE(self).kind = kind;
14067 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014068 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014069 _PyUnicode_STATE(self).ready = 1;
14070 _PyUnicode_WSTR(self) = NULL;
14071 _PyUnicode_UTF8_LENGTH(self) = 0;
14072 _PyUnicode_UTF8(self) = NULL;
14073 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014074 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014075
14076 share_utf8 = 0;
14077 share_wstr = 0;
14078 if (kind == PyUnicode_1BYTE_KIND) {
14079 char_size = 1;
14080 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14081 share_utf8 = 1;
14082 }
14083 else if (kind == PyUnicode_2BYTE_KIND) {
14084 char_size = 2;
14085 if (sizeof(wchar_t) == 2)
14086 share_wstr = 1;
14087 }
14088 else {
14089 assert(kind == PyUnicode_4BYTE_KIND);
14090 char_size = 4;
14091 if (sizeof(wchar_t) == 4)
14092 share_wstr = 1;
14093 }
14094
14095 /* Ensure we won't overflow the length. */
14096 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14097 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014098 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014099 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014100 data = PyObject_MALLOC((length + 1) * char_size);
14101 if (data == NULL) {
14102 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014103 goto onError;
14104 }
14105
Victor Stinnerc3c74152011-10-02 20:39:55 +020014106 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014107 if (share_utf8) {
14108 _PyUnicode_UTF8_LENGTH(self) = length;
14109 _PyUnicode_UTF8(self) = data;
14110 }
14111 if (share_wstr) {
14112 _PyUnicode_WSTR_LENGTH(self) = length;
14113 _PyUnicode_WSTR(self) = (wchar_t *)data;
14114 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014115
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014116 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014117 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014118 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014119#ifdef Py_DEBUG
14120 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14121#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014122 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014123 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014124
14125onError:
14126 Py_DECREF(unicode);
14127 Py_DECREF(self);
14128 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014129}
14130
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014131PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000014132 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014133\n\
Collin Winterd474ce82007-08-07 19:42:11 +000014134Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000014135encoding defaults to the current default string encoding.\n\
14136errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014137
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014138static PyObject *unicode_iter(PyObject *seq);
14139
Guido van Rossumd57fd912000-03-10 22:53:23 +000014140PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014141 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014142 "str", /* tp_name */
14143 sizeof(PyUnicodeObject), /* tp_size */
14144 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014145 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014146 (destructor)unicode_dealloc, /* tp_dealloc */
14147 0, /* tp_print */
14148 0, /* tp_getattr */
14149 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014150 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014151 unicode_repr, /* tp_repr */
14152 &unicode_as_number, /* tp_as_number */
14153 &unicode_as_sequence, /* tp_as_sequence */
14154 &unicode_as_mapping, /* tp_as_mapping */
14155 (hashfunc) unicode_hash, /* tp_hash*/
14156 0, /* tp_call*/
14157 (reprfunc) unicode_str, /* tp_str */
14158 PyObject_GenericGetAttr, /* tp_getattro */
14159 0, /* tp_setattro */
14160 0, /* tp_as_buffer */
14161 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014162 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014163 unicode_doc, /* tp_doc */
14164 0, /* tp_traverse */
14165 0, /* tp_clear */
14166 PyUnicode_RichCompare, /* tp_richcompare */
14167 0, /* tp_weaklistoffset */
14168 unicode_iter, /* tp_iter */
14169 0, /* tp_iternext */
14170 unicode_methods, /* tp_methods */
14171 0, /* tp_members */
14172 0, /* tp_getset */
14173 &PyBaseObject_Type, /* tp_base */
14174 0, /* tp_dict */
14175 0, /* tp_descr_get */
14176 0, /* tp_descr_set */
14177 0, /* tp_dictoffset */
14178 0, /* tp_init */
14179 0, /* tp_alloc */
14180 unicode_new, /* tp_new */
14181 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014182};
14183
14184/* Initialize the Unicode implementation */
14185
Victor Stinner3a50e702011-10-18 21:21:00 +020014186int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014187{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014188 int i;
14189
Thomas Wouters477c8d52006-05-27 19:21:47 +000014190 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014191 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014192 0x000A, /* LINE FEED */
14193 0x000D, /* CARRIAGE RETURN */
14194 0x001C, /* FILE SEPARATOR */
14195 0x001D, /* GROUP SEPARATOR */
14196 0x001E, /* RECORD SEPARATOR */
14197 0x0085, /* NEXT LINE */
14198 0x2028, /* LINE SEPARATOR */
14199 0x2029, /* PARAGRAPH SEPARATOR */
14200 };
14201
Fred Drakee4315f52000-05-09 19:53:39 +000014202 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020014203 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014204 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014205 Py_FatalError("Can't create empty string");
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010014206 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014207
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014208 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000014209 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000014210 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014211 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014212
14213 /* initialize the linebreak bloom filter */
14214 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014215 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014216 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014217
14218 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020014219
14220#ifdef HAVE_MBCS
14221 winver.dwOSVersionInfoSize = sizeof(winver);
14222 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14223 PyErr_SetFromWindowsErr(0);
14224 return -1;
14225 }
14226#endif
14227 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014228}
14229
/* Free-list clearing entry point.  This implementation does no work and
   always reports 0. */

int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
14237
Guido van Rossumd57fd912000-03-10 22:53:23 +000014238void
Thomas Wouters78890102000-07-22 19:25:51 +000014239_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014240{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014241 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014242
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000014243 Py_XDECREF(unicode_empty);
14244 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014245
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014246 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014247 if (unicode_latin1[i]) {
14248 Py_DECREF(unicode_latin1[i]);
14249 unicode_latin1[i] = NULL;
14250 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014251 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014252 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014253 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014254}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014255
Walter Dörwald16807132007-05-25 13:52:07 +000014256void
14257PyUnicode_InternInPlace(PyObject **p)
14258{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014259 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014260 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014261#ifdef Py_DEBUG
14262 assert(s != NULL);
14263 assert(_PyUnicode_CHECK(s));
14264#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014265 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014266 return;
14267#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014268 /* If it's a subclass, we don't really know what putting
14269 it in the interned dict might do. */
14270 if (!PyUnicode_CheckExact(s))
14271 return;
14272 if (PyUnicode_CHECK_INTERNED(s))
14273 return;
14274 if (interned == NULL) {
14275 interned = PyDict_New();
14276 if (interned == NULL) {
14277 PyErr_Clear(); /* Don't leave an exception */
14278 return;
14279 }
14280 }
14281 /* It might be that the GetItem call fails even
14282 though the key is present in the dictionary,
14283 namely when this happens during a stack overflow. */
14284 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014285 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014286 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014287
Benjamin Peterson29060642009-01-31 22:14:21 +000014288 if (t) {
14289 Py_INCREF(t);
14290 Py_DECREF(*p);
14291 *p = t;
14292 return;
14293 }
Walter Dörwald16807132007-05-25 13:52:07 +000014294
Benjamin Peterson14339b62009-01-31 16:36:08 +000014295 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014296 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014297 PyErr_Clear();
14298 PyThreadState_GET()->recursion_critical = 0;
14299 return;
14300 }
14301 PyThreadState_GET()->recursion_critical = 0;
14302 /* The two references in interned are not counted by refcnt.
14303 The deallocator will take care of this */
14304 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014305 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014306}
14307
14308void
14309PyUnicode_InternImmortal(PyObject **p)
14310{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014311 PyUnicode_InternInPlace(p);
14312 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014313 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014314 Py_INCREF(*p);
14315 }
Walter Dörwald16807132007-05-25 13:52:07 +000014316}
14317
14318PyObject *
14319PyUnicode_InternFromString(const char *cp)
14320{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014321 PyObject *s = PyUnicode_FromString(cp);
14322 if (s == NULL)
14323 return NULL;
14324 PyUnicode_InternInPlace(&s);
14325 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014326}
14327
Alexander Belopolsky40018472011-02-26 01:02:56 +000014328void
14329_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014330{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014331 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014332 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014333 Py_ssize_t i, n;
14334 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014335
Benjamin Peterson14339b62009-01-31 16:36:08 +000014336 if (interned == NULL || !PyDict_Check(interned))
14337 return;
14338 keys = PyDict_Keys(interned);
14339 if (keys == NULL || !PyList_Check(keys)) {
14340 PyErr_Clear();
14341 return;
14342 }
Walter Dörwald16807132007-05-25 13:52:07 +000014343
Benjamin Peterson14339b62009-01-31 16:36:08 +000014344 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14345 detector, interned unicode strings are not forcibly deallocated;
14346 rather, we give them their stolen references back, and then clear
14347 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014348
Benjamin Peterson14339b62009-01-31 16:36:08 +000014349 n = PyList_GET_SIZE(keys);
14350 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014351 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014352 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014353 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014354 if (PyUnicode_READY(s) == -1) {
14355 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014356 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014357 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014358 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014359 case SSTATE_NOT_INTERNED:
14360 /* XXX Shouldn't happen */
14361 break;
14362 case SSTATE_INTERNED_IMMORTAL:
14363 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014364 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014365 break;
14366 case SSTATE_INTERNED_MORTAL:
14367 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014368 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014369 break;
14370 default:
14371 Py_FatalError("Inconsistent interned string state.");
14372 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014373 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014374 }
14375 fprintf(stderr, "total size of all interned strings: "
14376 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14377 "mortal/immortal\n", mortal_size, immortal_size);
14378 Py_DECREF(keys);
14379 PyDict_Clear(interned);
14380 Py_DECREF(interned);
14381 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000014382}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014383
14384
14385/********************* Unicode Iterator **************************/
14386
14387typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014388 PyObject_HEAD
14389 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014390 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014391} unicodeiterobject;
14392
14393static void
14394unicodeiter_dealloc(unicodeiterobject *it)
14395{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014396 _PyObject_GC_UNTRACK(it);
14397 Py_XDECREF(it->it_seq);
14398 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014399}
14400
14401static int
14402unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14403{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014404 Py_VISIT(it->it_seq);
14405 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014406}
14407
14408static PyObject *
14409unicodeiter_next(unicodeiterobject *it)
14410{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014411 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014412
Benjamin Peterson14339b62009-01-31 16:36:08 +000014413 assert(it != NULL);
14414 seq = it->it_seq;
14415 if (seq == NULL)
14416 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014417 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014418
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014419 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14420 int kind = PyUnicode_KIND(seq);
14421 void *data = PyUnicode_DATA(seq);
14422 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14423 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014424 if (item != NULL)
14425 ++it->it_index;
14426 return item;
14427 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014428
Benjamin Peterson14339b62009-01-31 16:36:08 +000014429 Py_DECREF(seq);
14430 it->it_seq = NULL;
14431 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014432}
14433
14434static PyObject *
14435unicodeiter_len(unicodeiterobject *it)
14436{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014437 Py_ssize_t len = 0;
14438 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014439 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014440 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014441}
14442
14443PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14444
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014445static PyObject *
14446unicodeiter_reduce(unicodeiterobject *it)
14447{
14448 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020014449 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014450 it->it_seq, it->it_index);
14451 } else {
14452 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
14453 if (u == NULL)
14454 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020014455 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014456 }
14457}
14458
14459PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14460
14461static PyObject *
14462unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14463{
14464 Py_ssize_t index = PyLong_AsSsize_t(state);
14465 if (index == -1 && PyErr_Occurred())
14466 return NULL;
14467 if (index < 0)
14468 index = 0;
14469 it->it_index = index;
14470 Py_RETURN_NONE;
14471}
14472
14473PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14474
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014475static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014476 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014477 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014478 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
14479 reduce_doc},
14480 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
14481 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014482 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014483};
14484
14485PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014486 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14487 "str_iterator", /* tp_name */
14488 sizeof(unicodeiterobject), /* tp_basicsize */
14489 0, /* tp_itemsize */
14490 /* methods */
14491 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14492 0, /* tp_print */
14493 0, /* tp_getattr */
14494 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014495 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014496 0, /* tp_repr */
14497 0, /* tp_as_number */
14498 0, /* tp_as_sequence */
14499 0, /* tp_as_mapping */
14500 0, /* tp_hash */
14501 0, /* tp_call */
14502 0, /* tp_str */
14503 PyObject_GenericGetAttr, /* tp_getattro */
14504 0, /* tp_setattro */
14505 0, /* tp_as_buffer */
14506 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14507 0, /* tp_doc */
14508 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14509 0, /* tp_clear */
14510 0, /* tp_richcompare */
14511 0, /* tp_weaklistoffset */
14512 PyObject_SelfIter, /* tp_iter */
14513 (iternextfunc)unicodeiter_next, /* tp_iternext */
14514 unicodeiter_methods, /* tp_methods */
14515 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014516};
14517
14518static PyObject *
14519unicode_iter(PyObject *seq)
14520{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014521 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014522
Benjamin Peterson14339b62009-01-31 16:36:08 +000014523 if (!PyUnicode_Check(seq)) {
14524 PyErr_BadInternalCall();
14525 return NULL;
14526 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014527 if (PyUnicode_READY(seq) == -1)
14528 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014529 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14530 if (it == NULL)
14531 return NULL;
14532 it->it_index = 0;
14533 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014534 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014535 _PyObject_GC_TRACK(it);
14536 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014537}
14538
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014539
14540size_t
14541Py_UNICODE_strlen(const Py_UNICODE *u)
14542{
14543 int res = 0;
14544 while(*u++)
14545 res++;
14546 return res;
14547}
14548
14549Py_UNICODE*
14550Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14551{
14552 Py_UNICODE *u = s1;
14553 while ((*u++ = *s2++));
14554 return s1;
14555}
14556
14557Py_UNICODE*
14558Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14559{
14560 Py_UNICODE *u = s1;
14561 while ((*u++ = *s2++))
14562 if (n-- == 0)
14563 break;
14564 return s1;
14565}
14566
14567Py_UNICODE*
14568Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14569{
14570 Py_UNICODE *u1 = s1;
14571 u1 += Py_UNICODE_strlen(u1);
14572 Py_UNICODE_strcpy(u1, s2);
14573 return s1;
14574}
14575
14576int
14577Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14578{
14579 while (*s1 && *s2 && *s1 == *s2)
14580 s1++, s2++;
14581 if (*s1 && *s2)
14582 return (*s1 < *s2) ? -1 : +1;
14583 if (*s1)
14584 return 1;
14585 if (*s2)
14586 return -1;
14587 return 0;
14588}
14589
14590int
14591Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14592{
14593 register Py_UNICODE u1, u2;
14594 for (; n != 0; n--) {
14595 u1 = *s1;
14596 u2 = *s2;
14597 if (u1 != u2)
14598 return (u1 < u2) ? -1 : +1;
14599 if (u1 == '\0')
14600 return 0;
14601 s1++;
14602 s2++;
14603 }
14604 return 0;
14605}
14606
14607Py_UNICODE*
14608Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14609{
14610 const Py_UNICODE *p;
14611 for (p = s; *p; p++)
14612 if (*p == c)
14613 return (Py_UNICODE*)p;
14614 return NULL;
14615}
14616
14617Py_UNICODE*
14618Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14619{
14620 const Py_UNICODE *p;
14621 p = s + Py_UNICODE_strlen(s);
14622 while (p != s) {
14623 p--;
14624 if (*p == c)
14625 return (Py_UNICODE*)p;
14626 }
14627 return NULL;
14628}
Victor Stinner331ea922010-08-10 16:37:20 +000014629
Victor Stinner71133ff2010-09-01 23:43:53 +000014630Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014631PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014632{
Victor Stinner577db2c2011-10-11 22:12:48 +020014633 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014634 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014635
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014636 if (!PyUnicode_Check(unicode)) {
14637 PyErr_BadArgument();
14638 return NULL;
14639 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014640 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014641 if (u == NULL)
14642 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014643 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014644 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014645 PyErr_NoMemory();
14646 return NULL;
14647 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014648 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014649 size *= sizeof(Py_UNICODE);
14650 copy = PyMem_Malloc(size);
14651 if (copy == NULL) {
14652 PyErr_NoMemory();
14653 return NULL;
14654 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014655 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014656 return copy;
14657}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014658
Georg Brandl66c221e2010-10-14 07:04:07 +000014659/* A _string module, to export formatter_parser and formatter_field_name_split
14660 to the string.Formatter class implemented in Python. */
14661
14662static PyMethodDef _string_methods[] = {
14663 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14664 METH_O, PyDoc_STR("split the argument as a field name")},
14665 {"formatter_parser", (PyCFunction) formatter_parser,
14666 METH_O, PyDoc_STR("parse the argument as a format string")},
14667 {NULL, NULL}
14668};
14669
14670static struct PyModuleDef _string_module = {
14671 PyModuleDef_HEAD_INIT,
14672 "_string",
14673 PyDoc_STR("string helper module"),
14674 0,
14675 _string_methods,
14676 NULL,
14677 NULL,
14678 NULL,
14679 NULL
14680};
14681
14682PyMODINIT_FUNC
14683PyInit__string(void)
14684{
14685 return PyModule_Create(&_string_module);
14686}
14687
14688
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014689#ifdef __cplusplus
14690}
14691#endif